From e261ecea18786f777ded7506ce3ea976f1d50c08 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Tue, 10 Sep 2024 05:57:10 -0700
Subject: [PATCH 1/4] 2.23.4-1

Add scalable init API
 * Add new ncclCommInitRankScalable to allow for passing multiple
   unique IDs to the init function.
 * Spreads the load onto multiple bootstrap roots, allowing for
   constant bootstrap time.
 * Requires multiple ranks to create a unique ID, and the CPU-side
   ID exchange code to call allgather[v] instead of broadcast.

Accelerate init bootstrap operations
 * Reduce the number of calls to allgather.
 * Allow roots to reply early to ranks when information is already
   available.
 * Add an option to use ncclNet instead of sockets to perform
   bootstrap allgather operations.

Add PAT algorithms for Allgather and ReduceScatter
 * Parallel Aggregated Trees, variation of Bruck algorithm.
 * Logarithmic number of network steps for small sizes at scale.
 * Only supports one rank per node at the moment.

Add support for registered buffers for intra-node communication.
 * Allow registered user buffers to be accessed directly intra-node
 * Avoids extra copies in algorithms which permit it, saving
   memory bandwidth and helping with compute overlap.

Add profiler plugin API
 * New plugin API for profiling
 * Supports various levels of profiling, with a hierarchy.

Asynchronous graph allocation
 * Make calls to cudaMalloc and cudaMemcpy during graph allocation
   asynchronous.
 * Significantly speeds up graph capture.

Use fatal IB asynchronous events to stop network operation
 * Avoids many other error messages
 * Only fatal errors are affected; potentially transient errors
   (e.g. port down) do not cause an immediate stop.

Set P2P level to PXB on AMD CPUs when using more than 2 GPUs per node
 * P2P would cause a significant performance degradation when using
   many GPUs, and therefore many interleaved data flows.
 * Disable P2P through the CPU when we have 3+ GPUs per node; keep it
   enabled when we only have 2 GPUs.

Improve the init logs to report the real NCCL function.
 * Make the log report ncclCommInitRank or ncclCommSplit, rather than
   the generic ncclCommInitRankFunc.

Add a parameter to set the location of the user configuration file.
 * Add NCCL_CONF_FILE environment variable to set where the user's
   configuration file resides.

Increase default IB timeout
 * Increase IB timeout value from 18 to 20.
 * Should help avoid fatal errors on large RoCE systems.

Add new check for nvidia peermem
 * On Linux kernels 6.6+, /sys/kernel/mm/memory_peers is no longer
   present; check for /sys/module/nvidia_peermem/version instead.

Fix old performance regression when mixing small and large operations.
 * Improves distribution of work on channels.

Fix crash when NUMA IDs are equal to -1.
 * Can happen when a NIC is a virtual NIC, or when Linux doesn't
   know which NUMA node a device is attached to.
 * Issue NVIDIA/nccl-tests#233

Fix tree graph search when NCCL_CROSS_NIC is set to 1.
 * Would force NCCL to use the balanced_tree pattern, thereby
   disabling LL128 on platforms with 1 GPU+1 NIC per PCI switch.
 * Would also try to use alternate rings even though it was not
   needed.

Compiler tweaks and fixes
 * PR #1177
 * PR #1228

Fix stack smash
 * PR #1325

Fixes for multi-node NVLink + IB operation

Coverity fixes and comments.
---
 ext-profiler/example/Makefile           |  16 +
 ext-profiler/example/event.c            |  30 +
 ext-profiler/example/event.h            | 167 +++++
 ext-profiler/example/nccl/common.h      |  15 +
 ext-profiler/example/nccl/err.h         |  19 +
 ext-profiler/example/nccl/profiler.h    |  18 +
 ext-profiler/example/nccl/profiler_v1.h | 150 ++++
 ext-profiler/example/nccl/types.h       |  21 +
 ext-profiler/example/plugin.c           | 492 ++++++++++++
 ext-profiler/example/print_event.c      | 277 +++++++
 ext-profiler/example/print_event.h      |  13 +
 ext-tuner/example/nccl/tuner.h          |   3 +-
 makefiles/common.mk                     |   7 +
 makefiles/version.mk                    |   4 +-
 src/bootstrap.cc                        | 950 ++++++++++++++++++------
 src/collectives.cc                      |   1 +
 src/debug.cc                            |  32 +-
 src/device/all_gather.h                 |  59 +-
 src/device/all_reduce.h                 | 115 ++-
 src/device/broadcast.h                  |  15 +-
 src/device/common.h                     |  16 +-
 src/device/common_kernel.h              |  18 +-
 src/device/generate.py                  |  11 +-
 src/device/network/unpack/unpack.h      |   6 +-
 src/device/op128.h                      |  49 +-
 src/device/primitives.h                 |  12 +-
 src/device/prims_ll.h                   |  47 +-
 src/device/prims_ll128.h                |  10 +-
 src/device/prims_simple.h               | 681 +++++++++++------
 src/device/reduce.h                     |   3 +
 src/device/reduce_kernel.h              |  56 +-
 src/device/reduce_scatter.h             |  56 +-
 src/device/sendrecv.h                   |  13 +-
 src/enqueue.cc                          | 525 ++++++++++---
 src/graph/connect.cc                    |  51 +-
 src/graph/paths.cc                      | 108 ++-
 src/graph/rings.cc                      |  28 +-
 src/graph/search.cc                     | 108 ++-
 src/graph/topo.cc                       | 181 ++---
 src/graph/topo.h                        |   7 +-
 src/graph/tuning.cc                     |  80 +-
 src/graph/xml.cc                        |   7 +-
 src/group.cc                            |  58 +-
 src/include/alloc.h                     | 145 +++-
 src/include/bitops.h                    |  11 +
 src/include/bootstrap.h                 |   4 +-
 src/include/checks.h                    |  61 +-
 src/include/collectives.h               | 486 ++++++++++++
 src/include/comm.h                      | 131 +++-
 src/include/cudawrap.h                  |   2 +
 src/include/device.h                    |  23 +-
 src/include/graph.h                     |   7 +-
 src/include/nccl_common.h               |   3 +-
 src/include/nvtx.h                      |  27 +-
 src/include/p2p.h                       |  31 +-
 src/include/profiler.h                  |  58 +-
 src/include/proxy.h                     |  41 +-
 src/include/register.h                  |  13 +-
 src/include/shm.h                       |  47 +-
 src/include/timer.h                     |  14 +-
 src/include/transport.h                 |  10 +-
 src/include/utils.h                     |   1 -
 src/init.cc                             | 422 ++++++++---
 src/misc/argcheck.cc                    |   4 +
 src/misc/cudawrap.cc                    |  28 +-
 src/misc/ipcsocket.cc                   |  23 +-
 src/misc/nvmlwrap.cc                    |   4 +
 src/misc/param.cc                       |  28 +-
 src/misc/profiler.cc                    | 595 ++++++++++++---
 src/misc/shmutils.cc                    |  13 +-
 src/misc/socket.cc                      |  16 +-
 src/misc/tuner.cc                       |   2 +
 src/misc/utils.cc                       |  21 +-
 src/nccl.h.in                           |   7 +
 src/net.cc                              |   9 +-
 src/proxy.cc                            | 304 ++++++--
 src/register.cc                         |  25 +-
 src/transport.cc                        | 108 +--
 src/transport/coll_net.cc               |  39 +-
 src/transport/generic.cc                |  23 +
 src/transport/net.cc                    | 135 ++--
 src/transport/net_ib.cc                 | 344 ++++++---
 src/transport/net_socket.cc             |  54 +-
 src/transport/nvls.cc                   |  76 +-
 src/transport/p2p.cc                    | 564 ++++++++++++--
 src/transport/shm.cc                    | 414 ++++++++---
 86 files changed, 6943 insertions(+), 1965 deletions(-)
 create mode 100644 ext-profiler/example/Makefile
 create mode 100644 ext-profiler/example/event.c
 create mode 100644 ext-profiler/example/event.h
 create mode 100644 ext-profiler/example/nccl/common.h
 create mode 100644 ext-profiler/example/nccl/err.h
 create mode 100644 ext-profiler/example/nccl/profiler.h
 create mode 100644 ext-profiler/example/nccl/profiler_v1.h
 create mode 100644 ext-profiler/example/nccl/types.h
 create mode 100644 ext-profiler/example/plugin.c
 create mode 100644 ext-profiler/example/print_event.c
 create mode 100644 ext-profiler/example/print_event.h

diff --git a/ext-profiler/example/Makefile b/ext-profiler/example/Makefile
new file mode 100644
index 0000000000..ee8e0cf081
--- /dev/null
+++ b/ext-profiler/example/Makefile
@@ -0,0 +1,16 @@
+#
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+NCCL_HOME := ../../build
+INC := -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
+PLUGIN_SO := libnccl-profiler.so
+.PHONY: default clean # these name actions, not files: run them even if such a file exists
+default: $(PLUGIN_SO)
+
+$(PLUGIN_SO): plugin.c event.c print_event.c
+	$(CC) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
+
+clean:
+	rm -f $(PLUGIN_SO)
diff --git a/ext-profiler/example/event.c b/ext-profiler/example/event.c
new file mode 100644
index 0000000000..717fe86884
--- /dev/null
+++ b/ext-profiler/example/event.c
@@ -0,0 +1,30 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <stdio.h>
+#include "event.h"
+
+int taskEventQueueEmpty(struct group* g) {
+  return !g->eventHead;  // 1 when the group has no queued task event, 0 otherwise
+}
+
+void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event) {
+  event->next = NULL;                              // the new event becomes the last element
+  if (g->eventHead == NULL) g->eventHead = event;  // empty queue: it is also the head
+  else g->eventTail->next = event;                 // otherwise link it after the current tail
+  g->eventTail = event;
+}
+
+struct taskEventBase* taskEventQueueHead(struct group* g) {
+  return g->eventHead;  // peek only: the queue is not modified; NULL when it is empty
+}
+
+struct taskEventBase* taskEventQueueDequeue(struct group* g) {
+  struct taskEventBase* tmp = g->eventHead;  // element to hand back; NULL when the queue is empty
+  if (tmp) g->eventHead = tmp->next;         // unlink it (an empty queue is left untouched)
+  if (g->eventHead == NULL) g->eventTail = NULL;
+  return tmp;
+}
diff --git a/ext-profiler/example/event.h b/ext-profiler/example/event.h
new file mode 100644
index 0000000000..7432808133
--- /dev/null
+++ b/ext-profiler/example/event.h
@@ -0,0 +1,167 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef EVENT_H_
+#define EVENT_H_
+
+#include <sys/types.h>
+#include <stdint.h>
+#include <unistd.h>
+#include "profiler.h"
+
+#define MAX_CHANNELS                     32  // length of collective.send[] / collective.recv[]
+#define MAX_STEPS                        16  // length of proxyOp.step[]
+// first state of each send/recv family in ncclProfilerEventState_v1_t
+#define PROXY_OP_SEND_STATE_OFFSET       (ncclProfilerProxyOpSendPosted)
+#define PROXY_OP_RECV_STATE_OFFSET       (ncclProfilerProxyOpRecvPosted)
+#define PROXY_STEP_SEND_STATE_OFFSET     (ncclProfilerProxyStepSendGPUWait)
+#define PROXY_STEP_RECV_STATE_OFFSET     (ncclProfilerProxyStepRecvWait)
+// number of states in each family (last - first + 1)
+#define NUM_PROXY_OP_SEND_STATES         (ncclProfilerProxyOpSendDone      - ncclProfilerProxyOpSendPosted    + 1)
+#define NUM_PROXY_OP_RECV_STATES         (ncclProfilerProxyOpRecvDone      - ncclProfilerProxyOpRecvPosted    + 1)
+#define NUM_PROXY_STEP_SEND_STATES       (ncclProfilerProxyStepSendWait    - ncclProfilerProxyStepSendGPUWait + 1)
+#define NUM_PROXY_STEP_RECV_STATES       (ncclProfilerProxyStepRecvGPUWait - ncclProfilerProxyStepRecvWait    + 1)
+// zero-based index of a state within its family (argument parenthesized so expression arguments expand safely)
+#define PROXY_OP_SEND_STATE_IDX(state)   ((state) - PROXY_OP_SEND_STATE_OFFSET)
+#define PROXY_OP_RECV_STATE_IDX(state)   ((state) - PROXY_OP_RECV_STATE_OFFSET)
+#define PROXY_STEP_SEND_STATE_IDX(state) ((state) - PROXY_STEP_SEND_STATE_OFFSET)
+#define PROXY_STEP_RECV_STATE_IDX(state) ((state) - PROXY_STEP_RECV_STATE_OFFSET)
+// array lengths large enough for either the send or the recv family
+#define MAX_PROXY_OP_STATES              ((NUM_PROXY_OP_SEND_STATES   > NUM_PROXY_OP_RECV_STATES  ) ? NUM_PROXY_OP_SEND_STATES   : NUM_PROXY_OP_RECV_STATES)
+#define MAX_PROXY_STEP_STATES            ((NUM_PROXY_STEP_SEND_STATES > NUM_PROXY_STEP_RECV_STATES) ? NUM_PROXY_STEP_SEND_STATES : NUM_PROXY_STEP_RECV_STATES)
+
+#define MAX_COMM_CLIQUES                 (32 * 8)
+
+struct proxyOp;
+
+struct proxyStep {
+  uint8_t type;                     // type of event: network transfer
+  int step;                         // network transfer id in given channel
+  int isSend;                       // send/recv channel operation
+  double timestamp[MAX_PROXY_STEP_STATES]; // sized for the larger of the send/recv step state families
+  double startTs;                   // NOTE(review): presumably the start time of this step - confirm in plugin.c
+  double stopTs;                    // NOTE(review): presumably the stop time of this step - confirm in plugin.c
+  struct proxyOp* parent;           // NOTE(review): presumably the proxyOp whose step[] holds this entry - confirm
+};
+
+struct proxyOp {
+  uint8_t type;                     // type of event: proxy operation
+  uint8_t channelId;                // channel id for this proxy operation
+  pid_t pid;                        // NOTE(review): presumably the descriptor's proxyOp.pid (originating process) - confirm
+  int rank;                         // NOTE(review): presumably the descriptor's originating rank - confirm
+  int peer;                         // peer rank for this proxy operation
+  int nSteps;                       // total number of network transfers for this proxy operation
+  int chunkSize;                    // chunk size for this proxy operation
+  int isSend;                       // send/recv channel operation
+  size_t transSize;                 // transfer data size for this proxy operation
+  struct {
+    int steps;                      // completed steps for this proxy operation state
+    double timestamp;               // NOTE(review): presumably when this state was recorded - confirm in plugin.c
+  } states[MAX_PROXY_OP_STATES];    // sized for the larger of the send/recv op state families
+  double startTs;                   // NOTE(review): presumably the start time of this operation - confirm
+  double stopTs;                    // NOTE(review): presumably the stop time of this operation - confirm
+  int stepCount;                    // last processed network operation for this proxy operation
+  struct proxyStep step[MAX_STEPS]; // array of network transfer events
+  struct taskEventBase* parent;     // parent event p2p/collective
+};
+
+struct group;
+struct context;
+
+struct proxyCtrl {
+  uint8_t type;                     // event type tag; NOTE(review): presumably ncclProfileProxyCtrl - confirm
+  struct context* ctx;              // profiler context
+  double startTs;                   // NOTE(review): presumably the start time of this event - confirm
+  double stopTs;                    // NOTE(review): presumably the stop time of this event - confirm
+  int state;                        // NOTE(review): presumably a ncclProfilerProxyCtrl* state value - confirm
+  int appended;                     // appended proxy operations
+};
+
+// task level event base structure; struct collective and struct p2p embed it as their first member, and plugin.c casts a taskEventBase* to those types
+struct taskEventBase {
+  uint8_t type;                     // event type: collective/p2p
+  int rank;                         // rank of the operation in NCCL communicator
+  const char* name;                 // FIXME: unused
+  uint64_t commHash;                // communicator identifier
+  uint8_t func;                     // ncclFunc*
+  int refCount;                     // number of references for this operation
+  struct group* parent;             // parent event group
+  struct taskEventBase* next;       // next top level event in group
+  double startTs;                   // NOTE(review): presumably the start time of this event - confirm
+  double stopTs;                    // NOTE(review): presumably the stop time of this event - confirm
+};
+
+struct collective {
+  struct taskEventBase base;        // base structure for this event
+  uint64_t seqNumber;               // sequence number for this collective in communicator
+  void const* sendBuff;             // sendBuff..isNvls share their names with the coll member of ncclProfilerEventDescr_v1_t
+  void* recvBuff;
+  size_t count;
+  size_t trafficBytes;
+  int root;
+  uint8_t datatype;
+  uint8_t nMaxChannels;
+  uint8_t algo;
+  uint8_t proto;
+  int op;                           // declared uint32_t in the event descriptor
+  int nWarps;                       // declared uint8_t in the event descriptor
+  int isCollnet;
+  int isNvls;
+  struct proxyOp send[MAX_CHANNELS];// array of send proxy operation events
+  struct proxyOp recv[MAX_CHANNELS];// array of recv proxy operation events
+};
+
+struct p2p {
+  struct taskEventBase base;        // base structure for this event
+  uint8_t func;                     // NOTE(review): base.func already exists; confirm which of the two is authoritative
+  void const* buff;                 // buff, count, datatype, peer share their names with the p2p member of the event descriptor
+  size_t count;
+  uint8_t datatype;
+  int peer;
+  struct proxyOp op;                // single proxy operation of this event (zeroed when the owning group slot is reused)
+};
+
+struct group {
+  uint8_t type;                     // set to ncclProfileGroup when the event is started
+  struct context* ctx;              // profiler context
+  int groupId;                      // ctx->groupPoolIndex value taken at start; the pool slot is groupId % groupPoolSize
+  int refCount;                     // set to 1 when the event is started
+  struct taskEventBase* eventHead;  // queue head for task events
+  struct taskEventBase* eventTail;  // queue tail for task events
+  double startTs;                   // gettime() - startTime at event start
+  double stopTs;                    // NOTE(review): presumably set when the group is stopped - confirm
+  struct group* next;               // next group event in queue
+};
+
+// arrays for different event objects
+struct context {
+  int groupPoolSize;                // NOTE(review): the visible plugin.c code uses the file-scope groupPoolSize, not this field
+  int groupPoolBase;                // subtracted from a new id to test whether a slot is available
+  int groupPoolIndex;               // atomically incremented to hand out ids; the slot is id % pool size
+  struct group* groupPool;          // calloc'ed in exampleProfilerInit, freed in exampleProfilerFinalize
+
+  int collPoolSize;                 // collective pool: same Size/Base/Index scheme as the group pool
+  int collPoolBase;                 // incremented when a group slot is reused and its collectives are released
+  int collPoolIndex;
+  struct collective* collPool;
+
+  int p2pPoolSize;                  // p2p pool: same scheme
+  int p2pPoolBase;                  // incremented when a group slot is reused and its p2p events are released
+  int p2pPoolIndex;
+  struct p2p* p2pPool;
+
+  int proxyCtrlPoolSize;            // proxy control pool
+  int proxyCtrlPoolBase;
+  int proxyCtrlPoolIndex;           // upper bound of the range printed by exampleProfilerFinalize
+  struct proxyCtrl* proxyCtrlPool;
+};
+
+int taskEventQueueEmpty(struct group* g);                                 // non-zero when g has no queued task event
+void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event); // append event at the tail of g's queue
+struct taskEventBase* taskEventQueueHead(struct group* g);                // first queued event, left in place
+struct taskEventBase* taskEventQueueDequeue(struct group* g);             // remove and return the first event; call only on a non-empty queue
+
+#endif
diff --git a/ext-profiler/example/nccl/common.h b/ext-profiler/example/nccl/common.h
new file mode 100644
index 0000000000..912925225c
--- /dev/null
+++ b/ext-profiler/example/nccl/common.h
@@ -0,0 +1,15 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef COMMON_H_
+#define COMMON_H_
+
+typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; // log levels, numbered 0..5
+typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; // one bit per subsystem; NCCL_ALL has every bit set
+
+typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); // logging callback taking a format string and variadic arguments; flags is presumably a ncclDebugLogSubSys mask - confirm
+
+#endif
diff --git a/ext-profiler/example/nccl/err.h b/ext-profiler/example/nccl/err.h
new file mode 100644
index 0000000000..644392413e
--- /dev/null
+++ b/ext-profiler/example/nccl/err.h
@@ -0,0 +1,19 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_ERR_H_
+#define NCCL_ERR_H_
+
+/* Error type for plugins: the result code returned by every ncclProfiler_v1_t callback */
+typedef enum { ncclSuccess                 =  0,
+               ncclUnhandledCudaError      =  1,
+               ncclSystemError             =  2,
+               ncclInternalError           =  3,
+               ncclInvalidArgument         =  4,
+               ncclInvalidUsage            =  5,
+               ncclRemoteError             =  6 } ncclResult_t;
+
+#endif
diff --git a/ext-profiler/example/nccl/profiler.h b/ext-profiler/example/nccl/profiler.h
new file mode 100644
index 0000000000..db7bc3feae
--- /dev/null
+++ b/ext-profiler/example/nccl/profiler.h
@@ -0,0 +1,18 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_PROFILER_H_
+#define NCCL_PROFILER_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "common.h"
+#include "err.h"
+
+#include "profiler_v1.h"
+
+#endif // end include guard
diff --git a/ext-profiler/example/nccl/profiler_v1.h b/ext-profiler/example/nccl/profiler_v1.h
new file mode 100644
index 0000000000..8724a1c662
--- /dev/null
+++ b/ext-profiler/example/nccl/profiler_v1.h
@@ -0,0 +1,150 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_PROFILER_V1_H_
+#define NCCL_PROFILER_V1_H_
+
+#include <stdint.h>
+
+enum {  // event type flags: one bit per type, so they can be OR-ed into an activation mask
+  ncclProfileGroup     = (1 << 0),  // group event type
+  ncclProfileColl      = (1 << 1),  // host collective call event type
+  ncclProfileP2p       = (1 << 2),  // host point-to-point call event type
+  ncclProfileProxyOp   = (1 << 3),  // proxy operation event type
+  ncclProfileProxyStep = (1 << 4),  // proxy step event type
+  ncclProfileProxyCtrl = (1 << 5),  // proxy control event type
+  ncclProfileNumEvents = (     6),  // number of event types above (a count, not a flag)
+};
+
+typedef struct {
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
+  int rank;                     // originating rank
+  union {                       // event-specific part of the descriptor
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      uint8_t func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;             // NOTE(review): size_t is used but this header only includes <stdint.h>
+      int root;
+      uint8_t datatype;
+      uint32_t op;
+      size_t trafficBytes;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      uint8_t algo;
+      uint8_t proto;
+      int isCollnet;
+      int isNvls;
+    } coll;                     // read by the example plugin when type == ncclProfileColl
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint8_t func;
+      void* buff;
+      uint8_t datatype;
+      size_t count;
+      int peer;
+    } p2p;                      // NOTE(review): presumably valid when type == ncclProfileP2p - confirm
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;                  // NOTE(review): pid_t needs <sys/types.h>, which this header does not include itself
+
+    struct {
+      int step;
+    } proxyStep;                // NOTE(review): presumably valid when type == ncclProfileProxyStep - confirm
+  };
+} ncclProfilerEventDescr_v1_t;
+
+typedef enum {
+  ncclProfilerProxyOpSendPosted,        // first send op state (PROXY_OP_SEND_STATE_OFFSET in the example's event.h)
+  ncclProfilerProxyOpSendRemFifoWait,
+  ncclProfilerProxyOpSendTransmitted,
+  ncclProfilerProxyOpSendDone,          // last send op state
+  ncclProfilerProxyOpRecvPosted,        // first recv op state (PROXY_OP_RECV_STATE_OFFSET)
+  ncclProfilerProxyOpRecvReceived,
+  ncclProfilerProxyOpRecvTransmitted,
+  ncclProfilerProxyOpRecvDone,          // last recv op state
+
+  /* Legacy proxy profiler states */
+  ncclProfilerProxyStepSendGPUWait,     // first send step state (PROXY_STEP_SEND_STATE_OFFSET)
+  ncclProfilerProxyStepSendWait,        // last send step state
+  ncclProfilerProxyStepRecvWait,        // first recv step state (PROXY_STEP_RECV_STATE_OFFSET)
+  ncclProfilerProxyStepRecvFlushWait,
+  ncclProfilerProxyStepRecvGPUWait,     // last recv step state
+
+  /* Legacy proxy control states */
+  ncclProfilerProxyCtrlIdle,
+  ncclProfilerProxyCtrlActive,
+  ncclProfilerProxyCtrlSleep,
+  ncclProfilerProxyCtrlWakeup,
+  ncclProfilerProxyCtrlAppend,
+  ncclProfilerProxyCtrlAppendEnd,
+} ncclProfilerEventState_v1_t;
+
+typedef union {  // optional argument of recordEventState; which member applies depends on the event kind
+  struct {
+    size_t transSize;        // NOTE(review): presumably the transfer size stored in the example's proxyOp.transSize - confirm
+    int steps;               // NOTE(review): presumably the step count stored in proxyOp.states[].steps - confirm
+  } proxyOp;
+
+  struct {
+    int appendedProxyOps;    // NOTE(review): presumably stored in the example's proxyCtrl.appended - confirm
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v1_t;
+
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the event set
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v1_t;
+
+typedef ncclProfilerEventDescr_v1_t ncclProfilerEventDescr_t;          // unversioned names are aliases of the v1 types
+typedef ncclProfilerEventState_v1_t ncclProfilerEventState_t;
+typedef ncclProfilerEventStateArgs_v1_t ncclProfilerEventStateArgs_t;
+typedef ncclProfiler_v1_t ncclProfiler_t;
+
+#endif
diff --git a/ext-profiler/example/nccl/types.h b/ext-profiler/example/nccl/types.h
new file mode 100644
index 0000000000..f43fdc1636
--- /dev/null
+++ b/ext-profiler/example/nccl/types.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NCCL_TYPES_H_
+#define NCCL_TYPES_H_
+
+/* Data types; names listed on the same line share one value (the second is an alias of the first) */
+typedef enum { ncclInt8       = 0, ncclChar       = 0,
+               ncclUint8      = 1,
+               ncclInt32      = 2, ncclInt        = 2,
+               ncclUint32     = 3,
+               ncclInt64      = 4,
+               ncclUint64     = 5,
+               ncclFloat16    = 6, ncclHalf       = 6,
+               ncclFloat32    = 7, ncclFloat      = 7,
+               ncclFloat64    = 8, ncclDouble     = 8,
+               ncclBfloat16   = 9,
+} ncclDataType_t;
+
+#endif
diff --git a/ext-profiler/example/plugin.c b/ext-profiler/example/plugin.c
new file mode 100644
index 0000000000..f9de60813a
--- /dev/null
+++ b/ext-profiler/example/plugin.c
@@ -0,0 +1,492 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <stdio.h>
+#include <pthread.h>
+#include <string.h>
+#include <linux/limits.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <x86intrin.h>
+#include "event.h"
+#include "print_event.h"
+
+#define __hidden __attribute__ ((visibility("hidden")))  // hidden ELF visibility for the symbols it is applied to
+
+static int initialized;             // initialization counter for profiler
+static double startTime;            // profiler start time
+
+static int groupPoolSize = 16;      // default; overridden by NCCL_PROFILE_GROUP_POOL_SIZE in exampleProfilerInit
+static int collPoolSize = 16;       // default; overridden by NCCL_PROFILE_COLL_POOL_SIZE
+static int p2pPoolSize = 1024;      // default; overridden by NCCL_PROFILE_P2P_POOL_SIZE
+static int proxyCtrlPoolSize = 16;  // default; overridden by NCCL_PROFILE_PROXY_CTRL_POOL_SIZE
+static int detachPoolSize = 128;    // default; overridden by NCCL_PROFILE_PROXY_DETACH_POOL_SIZE
+static int detachPoolBase;          // NOTE(review): presumably plays the role of the context *PoolBase fields - confirm
+static int detachPoolIndex;         // upper bound of the detach pool range printed by exampleProfilerFinalize
+static int detachPoolDone;          // NOTE(review): not used in the visible part of the file - confirm its role
+static struct proxyOp* detachPool;  // calloc'ed by the first exampleProfilerInit, freed by the last exampleProfilerFinalize
+
+static double freq = -1;            // TSC cycles per microsecond, computed by calibrate(); -1 until then
+__hidden void calibrate() {                            // measure the TSC rate and store it in freq
+  struct timeval tv;
+  gettimeofday(&tv, NULL);
+  uint64_t timeCycles = __rdtsc();                     // TSC value at the start of the measured window
+  double time = - tv.tv_sec*1e6 - tv.tv_usec;          // minus the start wall-clock time, in microseconds
+  uint64_t total = 0ULL;                               // accumulator of the loop below; its value is never read
+  for (int i = 0; i < 10000; i++) total += __rdtsc();  // 10000 TSC reads make up the measured window
+  gettimeofday(&tv, NULL);
+  timeCycles = __rdtsc() - timeCycles;                 // elapsed cycles
+  time += tv.tv_sec*1e6 + tv.tv_usec;                  // elapsed microseconds
+  freq = timeCycles / time;                            // cycles per microsecond; NOTE(review): divides by zero if both gettimeofday calls return the same time
+}
+
+__hidden double gettime(void) {
+  return (double)__rdtsc() / freq;  // TSC cycles divided by cycles-per-microsecond: time in microseconds
+}
+
+static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;  // held by exampleProfilerInit around the one-time shared setup
+static pid_t pid;                                         // getpid() of the process that ran the first exampleProfilerInit
+
+__hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) {
+  // Set up the profiler for one caller: the first caller also initializes the process-wide
+  // state (event mask, pool sizes, detach pool, timer); every caller gets its own context.
+  //  - context        : output, receives the newly allocated struct context
+  //  - eActivationMask: output, event mask; only written by the first caller
+  // Returns ncclSuccess, or ncclSystemError when an allocation fails.
+  const char* env;
+  pthread_mutex_lock(&lock);
+  if (__atomic_fetch_add(&initialized, 1, __ATOMIC_RELAXED) == 0) {
+    // first thread initializes event mask, environment and detach pool
+    __atomic_store_n(eActivationMask, ncclProfileColl | ncclProfileP2p, __ATOMIC_RELAXED);
+    if ((env = getenv("NCCL_PROFILE_EVENT_MASK"))) __atomic_store_n(eActivationMask, atoi(env), __ATOMIC_RELAXED);
+    // pool sizes are later used as modulo divisors, so non-positive overrides are ignored
+    if ((env = getenv("NCCL_PROFILE_GROUP_POOL_SIZE")) && atoi(env) > 0) groupPoolSize = atoi(env);
+    if ((env = getenv("NCCL_PROFILE_COLL_POOL_SIZE")) && atoi(env) > 0) collPoolSize = atoi(env);
+    if ((env = getenv("NCCL_PROFILE_P2P_POOL_SIZE")) && atoi(env) > 0) p2pPoolSize = atoi(env);
+    if ((env = getenv("NCCL_PROFILE_PROXY_CTRL_POOL_SIZE")) && atoi(env) > 0) proxyCtrlPoolSize = atoi(env);
+    if ((env = getenv("NCCL_PROFILE_PROXY_DETACH_POOL_SIZE")) && atoi(env) > 0) detachPoolSize = atoi(env);
+    // detach pool is used to store PXN proxyOps and is shared among threads
+    detachPool = (struct proxyOp *)calloc(detachPoolSize, sizeof(*detachPool));
+    if (detachPool == NULL) {
+      // undo the count so that a later init retries the shared setup
+      __atomic_fetch_sub(&initialized, 1, __ATOMIC_RELAXED);
+      pthread_mutex_unlock(&lock);
+      return ncclSystemError;
+    }
+    // Pid of the process initializing the profiler first.
+    // This is compared against the pid of proxyOp events
+    // to figure out if they have a parent event in this
+    // process address space.
+    pid = getpid();
+
+    // calibrate and start timer
+    calibrate();
+    startTime = gettime();
+  }
+  pthread_mutex_unlock(&lock);
+
+  // pre-allocate memory for event object pools in dedicated profiler context
+  struct context* ctx = (struct context *)calloc(1, sizeof(*ctx));
+  if (ctx == NULL) goto fail;
+  ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool));
+  ctx->collPool = (struct collective *)calloc(collPoolSize, sizeof(*ctx->collPool));
+  ctx->p2pPool = (struct p2p *)calloc(p2pPoolSize, sizeof(*ctx->p2pPool));
+  ctx->proxyCtrlPool = (struct proxyCtrl *)calloc(proxyCtrlPoolSize, sizeof(*ctx->proxyCtrlPool));
+  if (!ctx->groupPool || !ctx->collPool || !ctx->p2pPool || !ctx->proxyCtrlPool) goto fail;
+
+  *context = ctx;
+  return ncclSuccess;
+
+fail:
+  if (ctx) {  // release whatever part of the context was allocated; free(NULL) is a no-op
+    free(ctx->proxyCtrlPool);
+    free(ctx->p2pPool);
+    free(ctx->collPool);
+    free(ctx->groupPool);
+    free(ctx);
+  }
+  // drop this caller's reference; only the last one out may free the shared
+  // detach pool, since other live contexts may still be using it
+  pthread_mutex_lock(&lock);
+  if (__atomic_fetch_sub(&initialized, 1, __ATOMIC_RELAXED) - 1 == 0) {
+    free(detachPool);
+    detachPool = NULL;
+  }
+  pthread_mutex_unlock(&lock);
+  return ncclSystemError;
+}
+
+__hidden ncclResult_t exampleProfilerFinalize(void* context) {
+  // Pass the events still held in this context's pools to printEvent (with the file named after
+  // NCCL_PROFILE_DUMP_FILE, if set), free the context; the last caller also frees the detach pool.
+  FILE* fh = NULL;
+  char filename[PATH_MAX] = { 0 };
+  char hostname[64] = { 0 };
+  gethostname(hostname, sizeof(hostname) - 1);  // keep the final '\0' even if the name is truncated
+  const char* dump = getenv("NCCL_PROFILE_DUMP_FILE");
+  if (dump) {
+    // bounded write: dump comes from the environment and may not fit in PATH_MAX
+    snprintf(filename, sizeof(filename), "%s-%s-%ld.txt", dump, hostname, syscall(SYS_gettid));
+    fh = fopen(filename, "w");
+    if (fh) fprintf(fh, "[\n");  // the file may fail to open: never write through NULL
+  }
+
+  // print last N groups/collectives/p2ps
+  struct context* ctx = (struct context *)context;
+  int start = (ctx->groupPoolIndex - groupPoolSize >= 0) ? ctx->groupPoolIndex - groupPoolSize : 0;
+  int end = ctx->groupPoolIndex;
+  for (int i = start; i < end; i++) {
+    printEvent(fh, &ctx->groupPool[i%groupPoolSize]);
+  }
+
+  start = (ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize >= 0) ? ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize : 0;
+  end = ctx->proxyCtrlPoolIndex;
+  for (int i = start; i < end; i++) {
+    printEvent(fh, &ctx->proxyCtrlPool[i%proxyCtrlPoolSize]);
+  }
+
+  free(ctx->groupPool);
+  free(ctx->collPool);
+  free(ctx->p2pPool);
+  free(ctx->proxyCtrlPool);
+  free(ctx);
+  // last thread cleans up shared detach pool
+  if (__atomic_fetch_sub(&initialized, 1, __ATOMIC_RELAXED) - 1 == 0) {
+    start = (detachPoolIndex - detachPoolSize >= 0) ? detachPoolIndex - detachPoolSize : 0;
+    end = detachPoolIndex;
+    for (int i = start; i < end; i++) {
+      printEvent(fh, &detachPool[i%detachPoolSize]);
+    }
+    free(detachPool);
+  }
+  if (fh) fprintf(fh, "{}]\n");
+  if (fh) fclose(fh);
+  return ncclSuccess;  // always succeeds: a dump file that cannot be opened is not reported as an error
+}
+
+__hidden void updateEvent(void* handle);
+
+__hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr) { // starts the event described by eDescr; every path returns ncclSuccess
+  *eHandle = NULL; // default result: the handle stays NULL whenever the event is dropped
+  struct context* ctx = (struct context *)context;
+  if (eDescr->type == ncclProfileGroup) {
+    struct group* event;
+    int groupId = __atomic_fetch_add(&ctx->groupPoolIndex, 1, __ATOMIC_RELAXED);
+    if ((groupId - __atomic_load_n(&ctx->groupPoolBase, __ATOMIC_RELAXED)) < groupPoolSize) {
+      // a group slot is available: take it and drain the task events still queued on it
+      event = &ctx->groupPool[groupId%groupPoolSize];
+      while (!taskEventQueueEmpty(event)) {
+        struct taskEventBase* base = taskEventQueueDequeue(event);
+        if (base->type == ncclProfileColl) {
+          struct collective* c = (struct collective *)base;
+          // reset the collective's per-channel send/recv proxyOps (and the proxySteps they embed)
+          memset(c->send, 0, sizeof(struct proxyOp)*MAX_CHANNELS);
+          memset(c->recv, 0, sizeof(struct proxyOp)*MAX_CHANNELS);
+          // release collective events in the group and return them to the collective pool
+          __atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED);
+        } else if (base->type == ncclProfileP2p) {
+          struct p2p* p = (struct p2p *)base;
+          // reset the p2p's single proxyOp (and the proxySteps it embeds)
+          memset(&p->op, 0, sizeof(struct proxyOp));
+          // release p2p events in the group and return them to the p2p pool
+          __atomic_fetch_add(&ctx->p2pPoolBase, 1, __ATOMIC_RELAXED);
+        }
+      }
+    } else {
+      // no free group slot: undo the index increment and drop this event
+      __atomic_fetch_sub(&ctx->groupPoolIndex, 1, __ATOMIC_RELAXED);
+      return ncclSuccess;
+    }
+    event->type = ncclProfileGroup;
+    __atomic_store_n(&event->refCount, 1, __ATOMIC_RELAXED); // the group's own reference; each child collective/p2p below adds one more
+    event->ctx = ctx;
+    event->groupId = groupId;
+    event->startTs = gettime() - startTime;
+    *eHandle = event;
+    debugEvent(event, "GroupStart");
+  } else if (eDescr->type == ncclProfileColl) {
+    // the parent might be null if we run out of events
+    struct group* parent = (struct group *)eDescr->parentObj;
+    if (parent == NULL) return ncclSuccess;
+
+    struct collective* event;
+    int collId = __atomic_fetch_add(&ctx->collPoolIndex, 1, __ATOMIC_RELAXED);
+    if ((collId - __atomic_load_n(&ctx->collPoolBase, __ATOMIC_RELAXED)) < collPoolSize) {
+      // if there are available collective events grab one
+      event = &ctx->collPool[collId%collPoolSize];
+    } else {
+      // no free collective slot: undo the index increment and drop this event
+      __atomic_fetch_sub(&ctx->collPoolIndex, 1, __ATOMIC_RELAXED);
+      return ncclSuccess;
+    }
+
+    event->base.type = ncclProfileColl;
+    event->base.rank = eDescr->rank;
+    event->base.name = eDescr->coll.name;
+    event->base.commHash = eDescr->coll.commHash;
+    event->base.func = eDescr->coll.func;
+    event->base.startTs = gettime() - startTime;
+    event->base.parent = parent;
+    event->seqNumber = eDescr->coll.seqNumber;
+    event->sendBuff = eDescr->coll.sendBuff;
+    event->recvBuff = eDescr->coll.recvBuff;
+    event->count = eDescr->coll.count;
+    event->root = eDescr->coll.root;
+    event->datatype = eDescr->coll.datatype;
+    event->op = eDescr->coll.op;
+    event->trafficBytes = eDescr->coll.trafficBytes;
+    event->nMaxChannels = eDescr->coll.nMaxChannels;
+    event->nWarps = eDescr->coll.nWarps;
+    event->algo = eDescr->coll.algo;
+    event->proto = eDescr->coll.proto;
+    event->isCollnet = eDescr->coll.isCollnet;
+    event->isNvls = eDescr->coll.isNvls;
+    *eHandle = event;
+    taskEventQueueEnqueue(parent, (struct taskEventBase *)event);
+    // increment the group ref counter so the event will stay open
+    __atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
+    debugEvent(event, "CollStart");
+  } else if (eDescr->type == ncclProfileP2p) {
+    // the parent might be null if we run out of events
+    struct group* parent = (struct group *)eDescr->parentObj;
+    if (parent == NULL) return ncclSuccess;
+
+    struct p2p* event;
+    int p2pId = __atomic_fetch_add(&ctx->p2pPoolIndex, 1, __ATOMIC_RELAXED);
+    if ((p2pId - __atomic_load_n(&ctx->p2pPoolBase, __ATOMIC_RELAXED)) < p2pPoolSize) {
+      // if there are available p2p events grab one
+      event = &ctx->p2pPool[p2pId%p2pPoolSize];
+    } else {
+      // no free p2p slot: undo the index increment and drop this event
+      __atomic_fetch_sub(&ctx->p2pPoolIndex, 1, __ATOMIC_RELAXED);
+      return ncclSuccess;
+    }
+
+    event->base.type = ncclProfileP2p;
+    event->base.rank = eDescr->rank;
+    event->base.name = eDescr->p2p.name;
+    event->base.commHash = eDescr->p2p.commHash;
+    event->base.func = eDescr->p2p.func;
+    event->base.next = parent->eventHead; // NOTE(review): only the p2p path sets next before enqueueing, the collective path does not - confirm against taskEventQueueEnqueue
+    event->base.startTs = gettime() - startTime;
+    event->base.parent = parent;
+    event->buff = eDescr->p2p.buff;
+    event->count = eDescr->p2p.count;
+    event->datatype = eDescr->p2p.datatype;
+    event->peer = eDescr->p2p.peer;
+    *eHandle = event;
+    // queue the event on its group, then increment the group ref counter so the group will stay open
+    taskEventQueueEnqueue(parent, (struct taskEventBase *)event);
+    __atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
+    debugEvent(event, "P2pStart");
+  } else if (eDescr->type == ncclProfileProxyCtrl) {
+    int proxyCtrlId = __atomic_fetch_add(&ctx->proxyCtrlPoolIndex, 1, __ATOMIC_RELAXED);
+    struct proxyCtrl* event = &ctx->proxyCtrlPool[proxyCtrlId%proxyCtrlPoolSize]; // no capacity check here: the slot index simply wraps around the pool
+    event->type = ncclProfileProxyCtrl;
+    event->ctx = ctx;
+    event->startTs = gettime() - startTime;
+    *eHandle = event;
+  } else if (eDescr->type == ncclProfileProxyOp) {
+    // the eventBase might be null if we run out of events
+    struct taskEventBase* eventBase = (struct taskEventBase *)eDescr->parentObj;
+    if (eventBase == NULL) return ncclSuccess;
+
+    if (eDescr->proxyOp.pid != pid) {
+      // PXN captured proxyOp events: the descriptor's pid differs from this process' pid
+      struct proxyOp* event;
+      int detachId = __atomic_fetch_add(&detachPoolIndex, 1, __ATOMIC_RELAXED);
+      if ((detachId - detachPoolBase) < detachPoolSize) { // NOTE(review): detachPoolBase is read without an atomic load here but stored with __atomic_store_n in updateEvent - confirm
+        // if there are available detached proxyOp events grab one
+        event = &detachPool[detachId%detachPoolSize];
+      } else {
+        // no free detached slot: undo the index increment and drop this event
+        __atomic_fetch_sub(&detachPoolIndex, 1, __ATOMIC_RELAXED);
+        return ncclSuccess;
+      }
+
+      event->type = ncclProfileProxyOp;
+      event->channelId = eDescr->proxyOp.channelId;
+      event->pid = eDescr->proxyOp.pid;
+      event->rank = eDescr->rank;
+      event->peer = eDescr->proxyOp.peer;
+      event->nSteps = eDescr->proxyOp.nSteps;
+      event->chunkSize = eDescr->proxyOp.chunkSize;
+      event->isSend = eDescr->proxyOp.isSend;
+      event->startTs = gettime() - startTime;
+      event->parent = NULL; // detached proxyOps are not linked to a collective/p2p event
+      *eHandle = event;
+      debugEvent(event, "PxnProxyOpStart");
+      return ncclSuccess;
+    }
+
+    if (eventBase->type == ncclProfileColl) {
+      struct collective* parent = (struct collective *)eDescr->parentObj;
+      struct proxyOp* event = (eDescr->proxyOp.isSend) ? &parent->send[eDescr->proxyOp.channelId] : &parent->recv[eDescr->proxyOp.channelId]; // the proxyOp lives inside its collective: one slot per channel and direction
+      event->type = ncclProfileProxyOp;
+      event->channelId = eDescr->proxyOp.channelId;
+      event->pid = eDescr->proxyOp.pid;
+      event->rank = eDescr->rank;
+      event->peer = eDescr->proxyOp.peer;
+      event->nSteps = eDescr->proxyOp.nSteps;
+      event->chunkSize = eDescr->proxyOp.chunkSize;
+      event->isSend = eDescr->proxyOp.isSend;
+      event->parent = eventBase;
+      event->startTs = gettime() - startTime;
+      *eHandle = event;
+      __atomic_store_n(&parent->base.refCount, 1, __ATOMIC_RELAXED); // NOTE(review): a store, not an add - every proxyOp start resets the collective's count to 1; confirm this is intended with several proxyOps per collective
+      debugEvent(event, "ProxyOpStart");
+    } else { // ncclProfileP2p
+      struct p2p* parent = (struct p2p *)eDescr->parentObj;
+      struct proxyOp* event = &parent->op;
+      event->type = ncclProfileProxyOp;
+      event->channelId = eDescr->proxyOp.channelId;
+      event->pid = eDescr->proxyOp.pid;
+      event->rank = eDescr->rank;
+      event->peer = eDescr->proxyOp.peer;
+      event->nSteps = eDescr->proxyOp.nSteps;
+      event->chunkSize = eDescr->proxyOp.chunkSize;
+      event->isSend = eDescr->proxyOp.isSend;
+      event->parent = eventBase;
+      event->startTs = gettime() - startTime;
+      *eHandle = event;
+      __atomic_store_n(&parent->base.refCount, 1, __ATOMIC_RELAXED); // NOTE(review): same store-not-add pattern as the collective branch - confirm
+      debugEvent(event, "ProxyOpStart");
+    }
+ } else if (eDescr->type == ncclProfileProxyStep) {
+    // the parent might be null if we run out of events
+    struct proxyOp* parent = (struct proxyOp *)eDescr->parentObj;
+    if (parent == NULL) return ncclSuccess;
+
+    int s = parent->stepCount++ % MAX_STEPS; // step slots are reused round-robin once stepCount reaches MAX_STEPS
+    struct proxyStep* event = &parent->step[s];
+    event->type = ncclProfileProxyStep;
+    event->step = eDescr->proxyStep.step;
+    event->isSend = parent->isSend;
+    event->parent = parent;
+    event->startTs = gettime() - startTime;
+    *eHandle = event;
+    debugEvent(event, "ProxyStepStart");
+  }
+  return ncclSuccess;
+}
+
+// Marks the event behind handle as finished and drops the references that its completion releases.
+void updateEvent(void* handle) {
+  switch (*(uint8_t *)handle) { // every event struct is read through its leading one-byte type tag
+    case ncclProfileGroup: {
+      struct group* grp = (struct group *)handle;
+      if (__atomic_fetch_sub(&grp->refCount, 1, __ATOMIC_RELAXED) == 1) {
+        grp->stopTs = gettime() - startTime;
+        __atomic_fetch_add(&grp->ctx->groupPoolBase, 1, __ATOMIC_RELAXED); // hand the group slot back to the pool
+      }
+      debugEvent(grp, "GroupStop");
+      break;
+    }
+    case ncclProfileColl: {
+      struct collective* coll = (struct collective *)handle;
+      const int last = (__atomic_fetch_sub(&coll->base.refCount, 1, __ATOMIC_RELAXED) == 1);
+      if (last) coll->base.stopTs = gettime() - startTime;
+      debugEvent(coll, "CollStop");
+      if (last) updateEvent(coll->base.parent); // the last reference also updates the parent group
+      break;
+    }
+    case ncclProfileP2p: {
+      struct p2p* pt = (struct p2p *)handle;
+      const int last = (__atomic_fetch_sub(&pt->base.refCount, 1, __ATOMIC_RELAXED) == 1);
+      if (last) pt->base.stopTs = gettime() - startTime;
+      debugEvent(pt, "P2pStop");
+      if (last) updateEvent(pt->base.parent); // the last reference also updates the parent group
+      break;
+    }
+    case ncclProfileProxyOp: {
+      struct proxyOp* op = (struct proxyOp *)handle;
+      op->stopTs = gettime() - startTime;
+      if (op->pid == pid) {
+        updateEvent(op->parent); // proxyOp with a parent collective/p2p: propagate the completion
+      } else {
+        const int done = __atomic_fetch_add(&detachPoolDone, 1, __ATOMIC_RELAXED) + 1; // parentless (PXN) proxyOp: count completions
+        if (done == detachPoolSize) {
+          __atomic_store_n(&detachPoolDone, 0, __ATOMIC_RELAXED); // reset the completed (done) counter
+          const int top = __atomic_load_n(&detachPoolIndex, __ATOMIC_RELAXED);
+          __atomic_store_n(&detachPoolBase, top, __ATOMIC_RELAXED); // move the base to the top of the pool
+        }
+      }
+      debugEvent(op, "ProxyOpStop");
+      break;
+    }
+    case ncclProfileProxyStep: {
+      struct proxyStep* step = (struct proxyStep *)handle;
+      step->stopTs = gettime() - startTime;
+      debugEvent(step, "ProxyStepStop");
+      break;
+    }
+    case ncclProfileProxyCtrl: {
+      struct proxyCtrl* ctrl = (struct proxyCtrl *)handle;
+      ctrl->stopTs = gettime() - startTime;
+      debugEvent(ctrl, "ProxyCtrlStop");
+      break;
+    }
+  }
+}
+
+// Stop callback of the plugin: group and collective events only get their stop
+// timestamp here, every other event type is completed through updateEvent.
+__hidden ncclResult_t exampleProfilerStopEvent(void* eHandle) {
+  // a NULL handle stands for an event that could not be allocated (we ran out of events)
+  if (eHandle != NULL) {
+    switch (*(uint8_t *)eHandle) {
+      case ncclProfileGroup:
+        // NCCL core stops a group once it was submitted/enqueued, not once it
+        // has completed: stamp the time but keep the event open
+        ((struct group *)eHandle)->stopTs = gettime() - startTime;
+        break;
+      case ncclProfileColl:
+        // likewise, a stopped collective was only submitted/enqueued: keep it open
+        ((struct collective *)eHandle)->base.stopTs = gettime() - startTime;
+        break;
+      default:
+        // all remaining event types go through the regular completion path
+        updateEvent(eHandle);
+        break;
+    }
+  }
+  return ncclSuccess;
+}
+
+// Records the state transition eState (and its arguments) reported for the event behind eHandle.
+__hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs) {
+  // a NULL handle stands for an event that could not be allocated: nothing to record
+  if (eHandle == NULL) return ncclSuccess;
+
+  debugEvent(eHandle, "RecordEventState");
+  const uint8_t kind = *(uint8_t *)eHandle;
+  if (kind == ncclProfileProxyOp) {
+    struct proxyOp* op = (struct proxyOp *)eHandle;
+    const int slot = op->isSend ? PROXY_OP_SEND_STATE_IDX(eState) : PROXY_OP_RECV_STATE_IDX(eState);
+    const int seen = op->states[slot].steps;
+    if (eState == ncclProfilerProxyOpSendRemFifoWait && eStateArgs->proxyOp.steps == seen) return ncclSuccess; // same step count as already stored: keep the earlier record
+    op->states[slot].steps = eStateArgs->proxyOp.steps;
+    op->states[slot].timestamp = gettime() - startTime;
+    op->transSize = eStateArgs->proxyOp.transSize;
+  } else if (kind == ncclProfileProxyStep) {
+    struct proxyStep* step = (struct proxyStep *)eHandle;
+    step->timestamp[step->isSend ? PROXY_STEP_SEND_STATE_IDX(eState) : PROXY_STEP_RECV_STATE_IDX(eState)] = gettime() - startTime;
+  } else if (kind == ncclProfileProxyCtrl) {
+    struct proxyCtrl* ctrl = (struct proxyCtrl *)eHandle;
+    if (eState == ncclProfilerProxyCtrlAppendEnd) ctrl->appended = eStateArgs->proxyCtrl.appendedProxyOps;
+    ctrl->state = eState;
+  }
+  return ncclSuccess;
+}
+
+ncclProfiler_v1_t ncclProfiler_v1 = { // positional initializer: entries must follow the field order of ncclProfiler_v1_t (declared outside this file)
+  "Example-profiler", // plugin name string
+  exampleProfilerInit,
+  exampleProfilerStartEvent,
+  exampleProfilerStopEvent,
+  exampleProfilerRecordEventState,
+  exampleProfilerFinalize,
+};
diff --git a/ext-profiler/example/print_event.c b/ext-profiler/example/print_event.c
new file mode 100644
index 0000000000..490ba7ce44
--- /dev/null
+++ b/ext-profiler/example/print_event.c
@@ -0,0 +1,277 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <stdio.h>
+#include "profiler.h"
+#include "event.h"
+#include "print_event.h"
+
+#define __hidden __attribute__ ((visibility("hidden")))
+
+// Translate an NCCL function id into the name shown in the trace.
+// Ids follow the order of the table below (0 = ncclBroadcast ... 7 = ncclRecv).
+// Any other id yields "Unknown" rather than NULL: the event printers in this
+// file pass the result straight to a printf "%s" conversion, for which a
+// null pointer is undefined behavior.
+__hidden const char* ncclFuncToString(int func) {
+  static const char* const names[] = {
+    "ncclBroadcast",
+    "ncclReduce",
+    "ncclAllGather",
+    "ncclReduceScatter",
+    "ncclAllReduce",
+    "ncclSendRecv",
+    "ncclSend",
+    "ncclRecv",
+  };
+  const int count = (int)(sizeof(names) / sizeof(names[0]));
+
+  if (func < 0 || func >= count) return "Unknown";
+  return names[func];
+}
+
+// Translate an NCCL algorithm id into the name shown in the trace.
+// Id 6 is PAT (Parallel Aggregated Trees). Unknown ids yield "Unknown"
+// instead of falling off the end of the function without a return value.
+__hidden const char* ncclAlgoToString(int algo) {
+  static const char* const names[] = {
+    "Tree",
+    "Ring",
+    "CollnetDirect",
+    "CollnetChain",
+    "Nvls",
+    "NvlsTree",
+    "PAT",
+  };
+  const int count = (int)(sizeof(names) / sizeof(names[0]));
+  return (algo >= 0 && algo < count) ? names[algo] : "Unknown";
+}
+
+// Translate an NCCL protocol id (0 = LL, 1 = LL128, 2 = Simple) into the name
+// shown in the trace. Unknown ids yield "Unknown" instead of falling off the
+// end of the function without a return value.
+__hidden const char* ncclProtoToString(int proto) {
+  static const char* const names[] = { "LL", "LL128", "Simple" };
+  const int count = (int)(sizeof(names) / sizeof(names[0]));
+
+  if (proto < 0 || proto >= count) return "Unknown";
+  return names[proto];
+}
+
+// FIXME: chrome tracing asynchronous events (used below) allow nesting of events that share the same id and category.
+// Nesting more than three events appears to cause issues. Therefore, every event is given an increasing id and a
+// category that matches the type of event (GROUP, COLL, P2P, PROXY, NET).
+static __thread int groupId; // per-thread id shared by a group's begin/end records; advanced by printGroupEventTrailer
+__hidden void printGroupEventHeader(FILE* fh, struct group* event) { // writes the begin ("ph": "b") GROUP record, tagged with the current groupId
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"groupId\": %d}},\n",
+          "Group", groupId, getpid(), 1, event->startTs, event->groupId);
+}
+
+__hidden void printGroupEventTrailer(FILE* fh, struct group* event) { // writes the end ("ph": "e") GROUP record, then advances groupId for the next group
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+          "Group", groupId++, getpid(), 1, event->stopTs);
+}
+
+static __thread int collId; // per-thread id shared by a collective's begin/end records; advanced by printCollEventTrailer
+__hidden void printCollEventHeader(FILE* fh, struct collective* event) { // writes the begin ("ph": "b") COLL record with the collective's arguments
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": %d, \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nMaxChannels\": %d}},\n",
+          ncclFuncToString(event->base.func), collId, getpid(), 1, event->base.startTs, event->seqNumber, event->base.commHash, event->base.rank, event->count, event->datatype, ncclAlgoToString(event->algo), ncclProtoToString(event->proto), event->nMaxChannels);
+}
+
+__hidden void printCollEventTrailer(FILE* fh, struct collective* event) { // writes the end ("ph": "e") COLL record, then advances collId
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+          ncclFuncToString(event->base.func), collId++, getpid(), 1, event->base.stopTs);
+}
+
+static __thread int p2pId; // per-thread id shared by a p2p's begin/end records; advanced by printP2pEventTrailer
+__hidden void printP2pEventHeader(FILE* fh, struct p2p* event) { // writes the begin ("ph": "b") P2P record with the p2p's arguments
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": %d}},\n",
+          ncclFuncToString(event->base.func), p2pId, getpid(), 1, event->base.startTs, event->base.commHash, event->base.rank, event->peer, event->count, event->datatype);
+}
+
+__hidden void printP2pEventTrailer(FILE* fh, struct p2p* event) { // writes the end ("ph": "e") P2P record, then advances p2pId
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+          ncclFuncToString(event->base.func), p2pId++, getpid(), 1, event->base.stopTs);
+}
+
+static __thread int proxyOpId; // per-thread id shared by a proxyOp's begin/end records; advanced by printProxyOpEventTrailer
+__hidden void printProxyOpEventHeader(FILE* fh, struct proxyOp* event) { // writes the begin ("ph": "b") PROXY record, including the step count and timestamp stored for each state
+  if (event->isSend) { // send side: POSTED, REM_FIFO_WAIT, TRANSMITTED and DONE states
+    int posted = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendPosted);
+    int remFifoWait = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendRemFifoWait);
+    int transmitted = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendTransmitted);
+    int done = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendDone);
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu, \"POSTED\": {\"step\": %d, \"ts\": %f}, \"REM_FIFO_WAIT\": {\"step\": %d, \"ts\": %f}, \"TRANSMITTED\": {\"step\": %d, \"ts\": %f}, \"DONE\": {\"step\": %d, \"ts\": %f}}},\n",
+            "Send", proxyOpId, getpid(), 1, event->startTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize, event->states[posted].steps, event->states[posted].timestamp, event->states[remFifoWait].steps, event->states[remFifoWait].timestamp, event->states[transmitted].steps, event->states[transmitted].timestamp, event->states[done].steps, event->states[done].timestamp);
+  } else { // receive side: POSTED, RECEIVED, TRANSMITTED and DONE states
+    int posted = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvPosted);
+    int received = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvReceived);
+    int transmitted = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvTransmitted);
+    int done = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvDone);
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu, \"POSTED\": {\"step\": %d, \"ts\": %f}, \"RECEIVED\": {\"step\": %d, \"ts\": %f}, \"TRANSMITTED\": {\"step\": %d, \"ts\": %f}, \"DONE\": {\"step\": %d, \"ts\": %f}}},\n",
+            "Recv", proxyOpId, getpid(), 1, event->startTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize, event->states[posted].steps, event->states[posted].timestamp, event->states[received].steps, event->states[received].timestamp, event->states[transmitted].steps, event->states[transmitted].timestamp, event->states[done].steps, event->states[done].timestamp);
+  }
+}
+
+__hidden void printProxyOpEventTrailer(FILE* fh, struct proxyOp* event) { // writes the end ("ph": "e") PROXY record named "Send" or "Recv", then advances proxyOpId
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+          event->isSend ? "Send" : "Recv", proxyOpId++, getpid(), 1, event->stopTs);
+}
+
+static __thread int proxyStepId; // per-thread id shared by all records of one proxy step; advanced on the step's last record
+__hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) { // writes a proxy step as consecutive NET intervals; each interval ends at the timestamp where the next one begins
+  if (event->isSend) { // send step: SendBufferWait, SendGpuWait, SendWait
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
+            "SendBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step);
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+            "SendBufferWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendGPUWait)]);
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
+            "SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendGPUWait)], event->step);
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+            "SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)]);
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
+            "SendWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)], event->step);
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+            "SendWait", proxyStepId++, getpid(), 1, event->stopTs);
+  } else { // receive step: RecvBufferWait, RecvWait, RecvFlushWait, RecvGpuWait
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
+            "RecvBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step);
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+            "RecvBufferWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)]);
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
+            "RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)], event->step);
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+            "RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)]);
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
+            "RecvFlushWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)], event->step);
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+            "RecvFlushWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvGPUWait)]);
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
+            "RecvGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvGPUWait)], event->step);
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+            "RecvGpuWait", proxyStepId++, getpid(), 1, event->stopTs);
+  }
+}
+
+static __thread int proxyCtrlId;
+// Print a proxy control event as a begin/end record pair in the PROXY category,
+// then advance the per-thread proxyCtrlId. The record name is derived from the
+// event's last recorded state and defaults to "Unknown", so a state outside the
+// six handled values no longer passes an uninitialized pointer to fprintf.
+__hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) {
+  const char* str = "Unknown";
+  if (event->state == ncclProfilerProxyCtrlIdle || event->state == ncclProfilerProxyCtrlActive) str = "Idle";
+  else if (event->state == ncclProfilerProxyCtrlSleep || event->state == ncclProfilerProxyCtrlWakeup) str = "Sleep";
+  else if (event->state == ncclProfilerProxyCtrlAppend || event->state == ncclProfilerProxyCtrlAppendEnd) str = "Append";
+  if (event->state == ncclProfilerProxyCtrlAppendEnd) {
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"appended\": %d}},\n",
+            str, proxyCtrlId, getpid(), 1, event->startTs, event->appended);
+  } else {
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+            str, proxyCtrlId, getpid(), 1, event->startTs);
+  }
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+          str, proxyCtrlId++, getpid(), 1, event->stopTs);
+}
+
+//#define DEBUG_EVENTS
+void debugEvent(void* eHandle, const char* tag) { // debug aid: appends a dump of the event, labelled with tag, to a per-process file; empty unless DEBUG_EVENTS is defined
+#ifdef DEBUG_EVENTS
+  char filename[64] = { 0 };
+  sprintf(filename, "EventDebug-%d", getpid()); // NOTE(review): unbounded sprintf; the result fits in 64 bytes for any int pid, snprintf would make that explicit
+  FILE* fh = fopen(filename, "a+"); // NOTE(review): the result is not checked; a failed fopen would hand NULL to fprintf/fclose below
+  uint8_t type = *(uint8_t *)eHandle; // the event type is read from the first byte of the event
+  if (type == ncclProfileGroup) {
+    struct group* event = (struct group *)eHandle;
+    fprintf(fh, "Group event %p tag = %s {\n", event, tag);
+    fprintf(fh, "  refCount          = %d\n", __atomic_load_n(&event->refCount, __ATOMIC_RELAXED));
+    fprintf(fh, "  startTs           = %f\n", event->startTs);
+    fprintf(fh, "  stopTs            = %f\n", event->stopTs);
+    fprintf(fh, "}\n");
+  } else if (type == ncclProfileColl) {
+    struct collective* event = (struct collective *)eHandle;
+    fprintf(fh, "Collective event %p tag = %s {\n", event, tag);
+    fprintf(fh, "  refCount          = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED));
+    fprintf(fh, "  parent            = %p\n", event->base.parent);
+    for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->send[i].type == ncclProfileProxyOp) fprintf(fh, "  send[%d]           = %p\n", i, &event->send[i]); // only channels whose send proxyOp was started
+    for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->recv[i].type == ncclProfileProxyOp) fprintf(fh, "  recv[%d]           = %p\n", i, &event->recv[i]); // only channels whose recv proxyOp was started
+    fprintf(fh, "  startTs           = %f\n", event->base.startTs);
+    fprintf(fh, "  stopTs            = %f\n", event->base.stopTs);
+    fprintf(fh, "}\n");
+  } else if (type == ncclProfileP2p) {
+    struct p2p* event = (struct p2p *)eHandle;
+    fprintf(fh, "P2p event %p tag = %s {\n", event, tag);
+    fprintf(fh, "  refCount          = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED));
+    fprintf(fh, "  parent            = %p\n", event->base.parent);
+    fprintf(fh, "  op                = %p\n", &event->op);
+    fprintf(fh, "  startTs           = %f\n", event->base.startTs);
+    fprintf(fh, "  stopTs            = %f\n", event->base.stopTs);
+    fprintf(fh, "}\n");
+  } else if (type == ncclProfileProxyOp) {
+    struct proxyOp* event = (struct proxyOp *)eHandle;
+    fprintf(fh, "ProxyOp event %p tag = %s {\n", event, tag);
+    fprintf(fh, "  type              = %s\n", event->isSend ? "Send" : "Recv");
+    fprintf(fh, "  channel           = %d\n", event->channelId);
+    fprintf(fh, "  parent            = %p\n", event->parent);
+    fprintf(fh, "  rank              = %d\n", event->rank);
+    fprintf(fh, "  startTs           = %f\n", event->startTs);
+    fprintf(fh, "  stopTs            = %f\n", event->stopTs);
+    fprintf(fh, "}\n");
+  } else if (type == ncclProfileProxyStep) {
+    struct proxyStep* event = (struct proxyStep *)eHandle;
+    fprintf(fh, "ProxyStep event %p tag = %s {\n", event, tag);
+    fprintf(fh, "  type              = %s\n", event->isSend ? "Send" : "Recv");
+    fprintf(fh, "  parent            = %p\n", event->parent);
+    fprintf(fh, "  startTs           = %f\n", event->startTs);
+    fprintf(fh, "  stopTs            = %f\n", event->stopTs);
+    fprintf(fh, "}\n");
+  } // any other event type (e.g. proxyCtrl) is not dumped
+  fclose(fh);
+#endif
+}
+
+// Recursively writes the event behind handle, and every event nested in it, to fh.
+// The event type is taken from the first byte of the event; an event whose type
+// matches none of the branches below produces no output.
+void printEvent(FILE* fh, void* handle) {
+  if (fh == NULL || handle == NULL) return;
+  const uint8_t kind = *(uint8_t *)handle;
+  if (kind == ncclProfileGroup) {
+    struct group* grp = (struct group *)handle;
+    printGroupEventHeader(fh, grp);
+    // walk the group's task event queue; the successor is read before the event is printed
+    for (struct taskEventBase *cur = taskEventQueueHead(grp), *next; cur != NULL; cur = next) {
+      next = cur->next;
+      printEvent(fh, cur);
+    }
+    printGroupEventTrailer(fh, grp);
+  } else if (kind == ncclProfileColl) {
+    struct collective* coll = (struct collective *)handle;
+    printCollEventHeader(fh, coll);
+    for (int ch = 0; ch < MAX_CHANNELS; ch++) {
+      printEvent(fh, &coll->send[ch]);
+      printEvent(fh, &coll->recv[ch]);
+    }
+    printCollEventTrailer(fh, coll);
+  } else if (kind == ncclProfileP2p) {
+    struct p2p* pt = (struct p2p *)handle;
+    printP2pEventHeader(fh, pt);
+    printEvent(fh, &pt->op);
+    printP2pEventTrailer(fh, pt);
+  } else if (kind == ncclProfileProxyOp) {
+    struct proxyOp* op = (struct proxyOp *)handle;
+    printProxyOpEventHeader(fh, op);
+    for (int s = 0; s < MAX_STEPS; s++) {
+      printEvent(fh, &op->step[s]);
+    }
+    printProxyOpEventTrailer(fh, op);
+  } else if (kind == ncclProfileProxyStep) {
+    printProxyStepEvent(fh, (struct proxyStep *)handle);
+  } else if (kind == ncclProfileProxyCtrl) {
+    printProxyCtrlEvent(fh, (struct proxyCtrl *)handle);
+  }
+  // nothing left to do for any other type
+}
diff --git a/ext-profiler/example/print_event.h b/ext-profiler/example/print_event.h
new file mode 100644
index 0000000000..8e2db4c2d8
--- /dev/null
+++ b/ext-profiler/example/print_event.h
@@ -0,0 +1,13 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PRINT_EVENT_H_
+#define PRINT_EVENT_H_
+
+void debugEvent(void* eHandle, const char* tag);
+void printEvent(FILE* fh, void* handle);
+
+#endif
diff --git a/ext-tuner/example/nccl/tuner.h b/ext-tuner/example/nccl/tuner.h
index a1f18d3935..aafabd72d8 100644
--- a/ext-tuner/example/nccl/tuner.h
+++ b/ext-tuner/example/nccl/tuner.h
@@ -27,7 +27,7 @@ typedef enum {
   ncclNumFuncs = 8
 } ncclFunc_t;
 
-#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
+#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*
 #define NCCL_ALGO_UNDEF -1
 #define NCCL_ALGO_TREE 0
 #define NCCL_ALGO_RING 1
@@ -35,6 +35,7 @@ typedef enum {
 #define NCCL_ALGO_COLLNET_CHAIN 3
 #define NCCL_ALGO_NVLS 4
 #define NCCL_ALGO_NVLS_TREE 5
+#define NCCL_ALGO_PAT 6
 
 #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
 #define NCCL_PROTO_UNDEF -1
diff --git a/makefiles/common.mk b/makefiles/common.mk
index a037cf348b..59e4151cee 100644
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@@ -10,6 +10,7 @@ VERBOSE ?= 0
 KEEP ?= 0
 DEBUG ?= 0
 ASAN ?= 0
+UBSAN ?= 0
 TRACE ?= 0
 PROFAPI ?= 1
 NVTX ?= 1
@@ -93,6 +94,12 @@ LDFLAGS += -fsanitize=address -static-libasan
 NVLDFLAGS += -Xcompiler -fsanitize=address,-static-libasan
 endif
 
+ifneq ($(UBSAN), 0)
+CXXFLAGS += -fsanitize=undefined
+LDFLAGS += -fsanitize=undefined -static-libubsan
+NVLDFLAGS += -Xcompiler -fsanitize=undefined,-static-libubsan
+endif
+
 ifneq ($(VERBOSE), 0)
 NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
 CXXFLAGS  += -Wall -Wextra
diff --git a/makefiles/version.mk b/makefiles/version.mk
index 9039cb7dd2..bcc0ff3ce1 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
-NCCL_MINOR   := 22
-NCCL_PATCH   := 3
+NCCL_MINOR   := 23
+NCCL_PATCH   := 4
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
diff --git a/src/bootstrap.cc b/src/bootstrap.cc
index a7d7754406..c1d085e4ce 100644
--- a/src/bootstrap.cc
+++ b/src/bootstrap.cc
@@ -14,6 +14,67 @@
 #include "proxy.h"
 #include "param.h"
 
+#define BOOTSTRAP_N_CHECK_ABORT           10000 // checkAbort() reads the abort flag once every this many calls
+#define BOOTSTRAP_TAG_CONNECT             (0x1 << 31) // BOOTSTRAP_TAG_*: each tag is a distinct single high bit
+#define BOOTSTRAP_TAG_ALLGATHER           (0x1 << 30)
+#define BOOTSTRAP_TAG_COMMSPLIT           (0x1 << 29)
+#define BOOTSTRAP_TAG_INTRANODE_ALLGATHER (0x1 << 28)
+
+#define BOOTSTRAP_INIT_TIME_CREATE 0 // BOOTSTRAP_INIT_TIME_*: indices into the timers[] array of bootstrapInit
+#define BOOTSTRAP_INIT_TIME_SEND   1
+#define BOOTSTRAP_INIT_TIME_RECV   2
+#define BOOTSTRAP_INIT_TIME_RING   3
+#define BOOTSTRAP_INIT_TIME_TOTAL  4
+#define BOOTSTRAP_INIT_TIME_DELAY  5
+#define BOOTSTRAP_INIT_TIME_N      6 // size of the bootstrapInit timers[] array
+#define BOOTSTRAP_INIT_ROOT_WAIT   0 // BOOTSTRAP_INIT_ROOT_*: indices into the timers[] array of bootstrapRoot
+#define BOOTSTRAP_INIT_ROOT_SEND   1
+#define BOOTSTRAP_INIT_ROOT_RECV   2
+#define BOOTSTRAP_INIT_ROOT_N      3 // size of the bootstrapRoot timers[] array
+#define BOOTSTRAP_PROF_OPEN(time) \
+  do {                            \
+    time = clockNano();           \
+  } while (0)
+#define BOOTSTRAP_PROF_CLOSE(time) \
+  do {                             \
+    time = clockNano() - time;     \
+  } while (0)
+
+#define BOOTSTRAP_PID(i, n) (((i) + (n)) % (n)) // wraps i into [0, n) for any i >= -n, e.g. -1 maps to n - 1
+// Returns the first (lowest) rank owned by root; requires root >= 0.
+// root is used as-is (no wrap-around): root == nRoots yields n_ranks, i.e. one past the last valid rank.
+static int firstRankFromRoot(int root, int n_ranks, int nRoots) {
+  return root * (n_ranks / nRoots) + std::min(root, n_ranks % nRoots); // the first (n_ranks % nRoots) roots own one extra rank each
+}
+// Maps a rank (>= 0) to the index of the root that owns it.
+// The rank is not wrapped around: rank >= nRanks maps past the last root.
+static int rootIdFromRank(int rank, int nRanks, int nRoots) {
+  const int perRoot = nRanks / nRoots;   // base number of ranks per root
+  const int nBigRoots = nRanks % nRoots; // the first nBigRoots roots own perRoot + 1 ranks
+  const int boundary = nBigRoots * (perRoot + 1); // first rank owned by a root of size perRoot
+  if (rank >= boundary) {
+    return nBigRoots + (rank - boundary) / perRoot;
+  }
+  return rank / (perRoot + 1);
+}
+// Number of ranks owned by a root; root is first wrapped with BOOTSTRAP_PID.
+static int nRankFromRoot(int root, int nRanks, int nRoots) {
+  const int wrappedRoot = BOOTSTRAP_PID(root, nRoots);
+  int count = nRanks / nRoots; // every root owns at least this many ranks
+  if (wrappedRoot < nRanks % nRoots) count++; // the remainder goes one rank each to the lowest-numbered roots
+  return count;
+}
+// Offset of rank relative to the first rank owned by root.
+// root is wrapped with BOOTSTRAP_PID; rank is used as-is.
+static int localIdFromRoot(int rank, int root, int nRanks, int nRoots) {
+  const int firstRank = firstRankFromRoot(BOOTSTRAP_PID(root, nRoots), nRanks, nRoots);
+  return rank - firstRank;
+}
+// returns non-zero when rank is the first rank owned by root (root is used as-is, it is NOT periodized here)
+static int isFirstFromRoot(int rank, int root, int nRanks, int nRoots) {
+  return (rank == firstRankFromRoot(root, nRanks, nRoots));
+}
+
 struct bootstrapRootArgs {
   struct ncclSocket* listenSock;
   uint64_t magic;
@@ -25,6 +86,8 @@ static union ncclSocketAddress bootstrapNetIfAddr;
 static int bootstrapNetInitDone = 0;
 pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;
 
+NCCL_PARAM(BootstrapNetEnable,"OOB_NET_ENABLE", 0);
+
 ncclResult_t bootstrapNetInit() {
   if (bootstrapNetInitDone == 0) {
     pthread_mutex_lock(&bootstrapNetLock);
@@ -53,7 +116,7 @@ ncclResult_t bootstrapNetInit() {
       char line[SOCKET_NAME_MAXLEN+MAX_IF_NAME_SIZE+2];
       sprintf(line, " %s:", bootstrapNetIfName);
       ncclSocketToString(&bootstrapNetIfAddr, line+strlen(line));
-      INFO(NCCL_INIT, "Bootstrap : Using%s", line);
+      INFO(NCCL_BOOTSTRAP, "Bootstrap : Using%s", line);
       bootstrapNetInitDone = 1;
     }
     pthread_mutex_unlock(&bootstrapNetLock);
@@ -64,40 +127,119 @@ ncclResult_t bootstrapNetInit() {
 /* Socket Interface Selection type */
 enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };
 
-// Additional sync functions
-static ncclResult_t bootstrapNetSend(struct ncclSocket* sock, void* data, int size) {
-  NCCLCHECK(ncclSocketSend(sock, &size, sizeof(int)));
-  NCCLCHECK(ncclSocketSend(sock, data, size));
+// Polls the abort flag once every BOOTSTRAP_N_CHECK_ABORT calls; returns ncclInternalError when it is set (flag may be NULL)
+static ncclResult_t checkAbort(volatile uint32_t* flag, int* cntr) {
+  if ((*cntr % BOOTSTRAP_N_CHECK_ABORT) == 0) { // only read the flag when the caller-owned counter is at 0
+    if (flag && __atomic_load_n(flag, __ATOMIC_ACQUIRE)) {
+      TRACE(NCCL_BOOTSTRAP, "bootstrap: abort called");
+      return ncclInternalError;
+    }
+  }
+  *cntr = (*cntr + 1) % BOOTSTRAP_N_CHECK_ABORT; // advance the counter, wrapping at the polling period
   return ncclSuccess;
 }
-static ncclResult_t bootstrapNetRecv(struct ncclSocket* sock, void* data, int size) {
+// net (ncclNet) helpers: memory registration first, then non-blocking send/recv
+static ncclResult_t netReg(ncclNet_t* net, void* comm, void* data, int size, void** handle) {
+  NCCLCHECK(net->regMr(comm, data, size, NCCL_PTR_HOST, handle)); // registered with the NCCL_PTR_HOST pointer type
+  return ncclSuccess;
+}
+static ncclResult_t netDereg(ncclNet_t* net, void* comm, void** handle) {
+  NCCLCHECK(net->deregMr(comm, *handle));
+  *handle = NULL; // clear the caller's handle once it has been deregistered
+  return ncclSuccess;
+}
+static ncclResult_t netIsend(ncclNet_t* net, void* sendComm, void* data, int size, void* dataHandle, int tag, void** sendReq,
+                             int* done) {
+  // Progress helper: call repeatedly with the same sendReq/done until *done becomes non-zero.
+  if (*done) return ncclSuccess;
+  // no request in flight yet: (re)try to post the send
+  if (*sendReq == NULL) {
+    NCCLCHECK(net->isend(sendComm, data, size, tag, dataHandle, sendReq));
+  }
+  // the post may not have produced a request; if so, leave it for the next call
+  if (*sendReq == NULL) return ncclSuccess;
+  NCCLCHECK(net->test(*sendReq, done, NULL));
+  if (*done) *sendReq = NULL; // completed requests are forgotten
+  return ncclSuccess;
+}
+static ncclResult_t netIrecv(ncclNet_t* net, void* recvComm, void* data, int size, void* dataHandle, int tag, void** recvReq,
+                             int* done) {
+  // Progress helper: call repeatedly with the same recvReq/done until *done becomes non-zero.
+  if (*done) return ncclSuccess;
+  // no request in flight yet: (re)try to post a single-buffer receive
+  if (*recvReq == NULL) {
+    NCCLCHECK(net->irecv(recvComm, 1, &data, &size, &tag, &dataHandle, recvReq));
+  }
+  // the post may not have produced a request; if so, leave it for the next call
+  if (*recvReq == NULL) return ncclSuccess;
+  NCCLCHECK(net->test(*recvReq, done, NULL));
+  if (*done) *recvReq = NULL; // completed requests are forgotten
+  return ncclSuccess;
+}
+static ncclResult_t netSendRecv(ncclNet_t* net, void* sendComm, void* sendData, int sendSize, void* sendDataHandle, void* recvComm,
+                                void* recvData, int recvSize, void* recvDataHandle, int tag, volatile uint32_t* abortFlag) {
+  // Drives one receive and one send side by side until both have completed, polling the abort flag along the way.
+  void* recvRequest = NULL;
+  void* sendRequest = NULL;
+  int recvDone = 0, sendDone = 0;
+  int pollCount = 0;
+  for (;;) {
+    NCCLCHECK(checkAbort(abortFlag, &pollCount));
+    // on every iteration the receive is progressed before the send
+    if (!recvDone) NCCLCHECK(netIrecv(net, recvComm, recvData, recvSize, recvDataHandle, tag, &recvRequest, &recvDone));
+    if (!sendDone) NCCLCHECK(netIsend(net, sendComm, sendData, sendSize, sendDataHandle, tag, &sendRequest, &sendDone));
+    if (sendDone && recvDone) break;
+  }
+  return ncclSuccess;
+}
+
+// Socket framing helpers: the size (an int) always goes first, the payload follows only when it is non-empty
+static ncclResult_t socketSend(struct ncclSocket* sock, void* data, int size) {
+  NCCLCHECK(ncclSocketSend(sock, &size, sizeof(int)));
+  if (size <= 0) return ncclSuccess;
+  NCCLCHECK(ncclSocketSend(sock, data, size));
+  return ncclSuccess;
+}
+static ncclResult_t socketRecv(struct ncclSocket* sock, void* data, int size) { // counterpart of socketSend: size header first, then at most size payload bytes
   int recvSize;
   NCCLCHECK(ncclSocketRecv(sock, &recvSize, sizeof(int)));
   if (recvSize > size) {
     WARN("Message truncated : received %d bytes instead of %d", recvSize, size);
     return ncclInternalError;
   }
-  NCCLCHECK(ncclSocketRecv(sock, data, std::min(recvSize, size)));
+  int actualSize = std::min(recvSize, size); // recvSize <= size was checked above
+  if (actualSize > 0) // a size-only message carries no payload to read
+    NCCLCHECK(ncclSocketRecv(sock, data, actualSize));
   return ncclSuccess;
 }
-static ncclResult_t bootstrapNetSendRecv(struct ncclSocket* sendSock, void* sendData, int sendSize, struct ncclSocket* recvSock, void* recvData, int recvSize) {
+static ncclResult_t socketSendRecv(struct ncclSocket* sendSock, void* sendData, int sendSize, struct ncclSocket* recvSock,
+                                   void* recvData, int recvSize) { // sizes are exchanged first, then the payloads
   int senderRecvSize;
   NCCLCHECK(ncclSocketSendRecv(sendSock, &sendSize, sizeof(int), recvSock, &senderRecvSize, sizeof(int)));
   if (senderRecvSize > recvSize) {
     WARN("Message truncated : received %d bytes instead of %d", senderRecvSize, recvSize);
     return ncclInternalError;
   }
-  NCCLCHECK(ncclSocketSendRecv(sendSock, sendData, sendSize, recvSock, recvData, recvSize));
+  NCCLCHECK(ncclSocketSendRecv(sendSock, sendData, sendSize, recvSock, recvData, std::min(recvSize, senderRecvSize))); // receive only what the peer announced
   return ncclSuccess;
 }
 
-struct extInfo {
-  int rank;
-  int nranks;
-  union ncclSocketAddress extAddressListenRoot;
-  union ncclSocketAddress extAddressListen;
+union ringConnectInfo { // how a ring neighbour can reach a rank; bootstrapInit fills exactly one member
+  union ncclSocketAddress addr;         // ring listen socket address (socket-based ring)
+  char handle[NCCL_NET_HANDLE_MAXSIZE]; // handle filled by net->listen (ncclNet-based ring)
 };
 
+struct extInfo { // message sent by a rank to a root when it checks in
+  int rank;                                  // rank of the process reaching out
+  int nranks;                                // total number of ranks
+  int iroot;                                 // current root index
+  int nroots;                                // total number of roots
+  union ncclSocketAddress listenRootAddress; // address of my listenSocket for the root
+  union ringConnectInfo connectInfo;         // my ring connection info, forwarded by the root to the previous rank
+};
+#define NET_HANDLE(h, rank)    ((h) + ((rank) * NCCL_NET_HANDLE_MAXSIZE)) // rank-th entry of a packed array of net handles; arguments are parenthesized so expressions such as r + 1 index correctly
+#define BOOTSTRAP_HANDLE(h, i) ((struct ncclBootstrapHandle*)((char*)(h) + (i) * NCCL_UNIQUE_ID_BYTES)) // i-th handle of a packed array of NCCL_UNIQUE_ID_BYTES-sized entries
+
 #include <sys/resource.h>
 
 static ncclResult_t setFilesLimit() {
@@ -108,95 +250,148 @@ static ncclResult_t setFilesLimit() {
   return ncclSuccess;
 }
 
-static void *bootstrapRoot(void* rargs) {
+static ncclResult_t rootSend(union ncclSocketAddress* addr, uint64_t magic, union ringConnectInfo* info) { // root -> rank: connect to addr and deliver one ringConnectInfo
+  ncclResult_t res = ncclSuccess;
+  struct ncclSocket sock;
+  NCCLCHECKGOTO(ncclSocketInit(&sock, addr, magic, ncclSocketTypeBootstrap), res, fail);
+  NCCLCHECKGOTO(ncclSocketConnect(&sock), res, fail);
+  NCCLCHECKGOTO(socketSend(&sock, info, sizeof(union ringConnectInfo)), res, fail);
+  NCCLCHECK(ncclSocketClose(&sock)); // success path: a failing close is reported to the caller
+  return res;
+fail:
+  (void)ncclSocketClose(&sock); // error path: best-effort close, the error stored in res is what gets returned
+  return res;
+}
+static void* bootstrapRoot(void* rargs) { // root thread: collects the check-ins of its ranks and hands each rank the ring info of its next rank
+  uint64_t timers[BOOTSTRAP_INIT_ROOT_N] = {0};
   struct bootstrapRootArgs* args = (struct bootstrapRootArgs*)rargs;
   struct ncclSocket* listenSock = args->listenSock;
   uint64_t magic = args->magic;
   ncclResult_t res = ncclSuccess;
   int nranks = 0, c = 0;
+  int iroot = 0, nroots = 0, localId = 0;
+  int nrecv = 0, n2send = 0;
   struct extInfo info;
-  union ncclSocketAddress *rankAddresses = NULL;
-  union ncclSocketAddress *rankAddressesRoot = NULL; // for initial rank <-> root information exchange
-  union ncclSocketAddress *zero = NULL;
-  NCCLCHECKGOTO(ncclCalloc(&zero, 1), res, out);
+  union ringConnectInfo* rankInfo = NULL;
+  union ncclSocketAddress* rankAddressesRoot = NULL; // for initial rank <-> root information exchange
+  // all-zero values: an entry that still compares equal to them has not been filled in yet
+  char zeroHandle[NCCL_NET_HANDLE_MAXSIZE];
+  union ncclSocketAddress zeroAddress;
+  union ringConnectInfo zeroInfo;
+  memset(&zeroAddress, 0, sizeof(union ncclSocketAddress));
+  memset(&zeroHandle, 0, NCCL_NET_HANDLE_MAXSIZE);
+  memset(&zeroInfo, 0, sizeof(union ringConnectInfo));
   setFilesLimit();
 
-  TRACE(NCCL_INIT, "BEGIN");
+  TRACE(NCCL_BOOTSTRAP, "BEGIN");
+  BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_ROOT_WAIT]);
   /* Receive addresses from all ranks */
   do {
     struct ncclSocket sock;
     NCCLCHECKGOTO(ncclSocketInit(&sock), res, out);
     NCCLCHECKGOTO(ncclSocketAccept(&sock, listenSock), res, out);
-    NCCLCHECKGOTO(bootstrapNetRecv(&sock, &info, sizeof(info)), res, out);
+    NCCLCHECKGOTO(socketRecv(&sock, &info, sizeof(info)), res, out);
     NCCLCHECKGOTO(ncclSocketClose(&sock), res, out);
 
     if (c == 0) {
+      BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_ROOT_WAIT]);
+      BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_ROOT_RECV]);
       nranks = info.nranks;
-      NCCLCHECKGOTO(ncclCalloc(&rankAddresses, nranks), res, out);
-      NCCLCHECKGOTO(ncclCalloc(&rankAddressesRoot, nranks), res, out);
+      iroot = info.iroot;
+      nroots = info.nroots;
+      // if the number of root > 1, we will receive one extra info from the first local_id of the next root
+      n2send = nRankFromRoot(iroot, nranks, nroots);
+      nrecv = n2send + ((nroots > 1) ? 1 : 0);
+      NCCLCHECKGOTO(ncclCalloc(&rankInfo, nrecv), res, out); // ncclCalloc takes an element count, as on the next line
+      NCCLCHECKGOTO(ncclCalloc(&rankAddressesRoot, nrecv), res, out);
     }
 
-    if (nranks != info.nranks) {
-      WARN("Bootstrap Root : mismatch in rank count from procs %d : %d", nranks, info.nranks);
+    if (nranks != info.nranks || nroots != info.nroots || iroot != info.iroot) {
+      WARN("Bootstrap Root : mismatch in info from procs, nranks %d vs %d, nroots %d vs %d, iroot %d vs %d", nranks, info.nranks, nroots, info.nroots, iroot, info.iroot);
       goto out;
     }
 
-    if (memcmp(zero, &rankAddressesRoot[info.rank], sizeof(union ncclSocketAddress)) != 0) {
+    localId = localIdFromRoot(info.rank, iroot, nranks, nroots);
+    if (memcmp(&zeroAddress, &rankAddressesRoot[localId], sizeof(union ncclSocketAddress)) != 0 ||
+        memcmp(&zeroInfo, &rankInfo[localId], sizeof(union ringConnectInfo)) != 0) {
       WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks);
       goto out;
     }
-
-    // Save the connection handle for that rank
-    memcpy(rankAddressesRoot+info.rank, &info.extAddressListenRoot, sizeof(union ncclSocketAddress));
-    memcpy(rankAddresses+info.rank, &info.extAddressListen, sizeof(union ncclSocketAddress));
-
+    // if the previous has already checked in, send the newly received handle, if not save the handle for later
+    // if we have more than 1 root, I do not own the previous of local_id = 0
+    // if we have prev >= n2send, we do not send anything
+    int prev = (nroots > 1) ? (localId - 1) : BOOTSTRAP_PID(localId - 1, nrecv);
+    if (prev >= 0 && prev < n2send && memcmp(&zeroAddress, &rankAddressesRoot[prev], sizeof(union ncclSocketAddress)) != 0) {
+      NCCLCHECKGOTO(rootSend(&rankAddressesRoot[prev], magic, &info.connectInfo), res, out);
+    } else {
+      memcpy(&rankInfo[localId], &info.connectInfo, sizeof(union ringConnectInfo));
+    }
+    // if the next rank has checked in, send the newly received info, if not save the addr for later
+    // for nroots >=1, I will always own the information of the next connection
+    // the local_id must be in [0 ; n2send[ otherwise we do not answer
+    int next = BOOTSTRAP_PID(localId + 1, nrecv);
+    if (localId >= 0 && localId < n2send && memcmp(&zeroInfo, &rankInfo[next], sizeof(union ringConnectInfo)) != 0) {
+      NCCLCHECKGOTO(rootSend(&info.listenRootAddress, magic, &rankInfo[next]), res, out);
+    } else {
+      memcpy(rankAddressesRoot + localId, &info.listenRootAddress, sizeof(union ncclSocketAddress));
+    }
     ++c;
-    TRACE(NCCL_INIT, "Received connect from rank %d total %d/%d",  info.rank, c, nranks);
-  } while (c < nranks);
-  TRACE(NCCL_INIT, "COLLECTED ALL %d HANDLES", nranks);
+    TRACE(NCCL_BOOTSTRAP, "Received connect from rank %d total %d/%d", info.rank, c, nrecv);
+  } while (c < nrecv);
+  TRACE(NCCL_BOOTSTRAP, "COLLECTED ALL %d HANDLES", nrecv);
+  BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_ROOT_RECV]);
 
-  // Send the connect handle for the next rank in the AllGather ring
-  for (int r=0; r<nranks; ++r) {
-    int next = (r+1) % nranks;
-    struct ncclSocket sock;
-    NCCLCHECKGOTO(ncclSocketInit(&sock, rankAddressesRoot+r, magic, ncclSocketTypeBootstrap), res, out);
-    NCCLCHECKGOTO(ncclSocketConnect(&sock), res, out);
-    NCCLCHECKGOTO(bootstrapNetSend(&sock, rankAddresses+next, sizeof(union ncclSocketAddress)), res, out);
-    NCCLCHECKGOTO(ncclSocketClose(&sock), res, out);
+  // send the remaining info to the ranks who haven't received anything
+  BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_ROOT_SEND]);
+  // here we need to send info only to my own local process
+  for (int r = 0; r < n2send; ++r) {
+    // use nrecv to periodize: if 1 root, we will send the first one to the last one, if >1 roots we will send the additional one we have received
+    int next = BOOTSTRAP_PID(r + 1, nrecv);
+    if (memcmp(&zeroAddress, &rankAddressesRoot[r], sizeof(union ncclSocketAddress)) != 0 &&
+        memcmp(&zeroInfo, &rankInfo[next], sizeof(union ringConnectInfo)) != 0) {
+      NCCLCHECKGOTO(rootSend(&rankAddressesRoot[r], magic, &rankInfo[next]), res, out);
+    }
   }
-  TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks);
-
+  BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_ROOT_SEND]);
+  TRACE(NCCL_BOOTSTRAP | NCCL_PROFILE, "Root timings (wait %f, recv %f, send %f)", timers[BOOTSTRAP_INIT_ROOT_WAIT] / 1e9, timers[BOOTSTRAP_INIT_ROOT_RECV] / 1e9, timers[BOOTSTRAP_INIT_ROOT_SEND] / 1e9);
 out:
   if (listenSock != NULL) {
-    ncclSocketClose(listenSock);
+    (void)ncclSocketClose(listenSock);
     free(listenSock);
   }
-  if (rankAddresses) free(rankAddresses);
-  if (rankAddressesRoot) free(rankAddressesRoot);
-  if (zero) free(zero);
+  if (rankInfo)
+    free(rankInfo);
+  if (rankAddressesRoot)
+    free(rankAddressesRoot);
   free(rargs);
 
-  TRACE(NCCL_INIT, "DONE");
+  TRACE(NCCL_BOOTSTRAP, "DONE");
   return NULL;
 }
 
 ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv) {
-  struct ncclSocket* listenSock;
-  struct bootstrapRootArgs* args;
+  ncclResult_t ret = ncclSuccess;
+  struct ncclSocket* listenSock = NULL; // ownership passes to the bootstrapRoot thread, which closes and frees it
+  struct bootstrapRootArgs* args = NULL; // ownership passes to the bootstrapRoot thread, which frees it
   pthread_t thread;
 
   NCCLCHECK(ncclCalloc(&listenSock, 1));
-  NCCLCHECK(ncclSocketInit(listenSock, &handle->addr, handle->magic, ncclSocketTypeBootstrap, NULL, 0));
-  NCCLCHECK(ncclSocketListen(listenSock));
-  NCCLCHECK(ncclSocketGetAddr(listenSock, &handle->addr));
+  NCCLCHECKGOTO(ncclSocketInit(listenSock, &handle->addr, handle->magic, ncclSocketTypeBootstrap, NULL, 0), ret, fail);
+  NCCLCHECKGOTO(ncclSocketListen(listenSock), ret, fail);
+  NCCLCHECKGOTO(ncclSocketGetAddr(listenSock, &handle->addr), ret, fail);
 
-  NCCLCHECK(ncclCalloc(&args, 1));
+  NCCLCHECKGOTO(ncclCalloc(&args, 1), ret, fail);
   args->listenSock = listenSock;
   args->magic = handle->magic;
-  NEQCHECK(pthread_create(&thread, NULL, bootstrapRoot, (void*)args), 0);
+  PTHREADCHECKGOTO(pthread_create(&thread, NULL, bootstrapRoot, (void*)args), "pthread_create", ret, fail);
   ncclSetThreadName(thread, "NCCL BootstrapR");
-  NEQCHECK(pthread_detach(thread), 0); // will not be pthread_join()'d
-  return ncclSuccess;
+  PTHREADCHECKGOTO(pthread_detach(thread), "pthread_detach", ret, fail); // will not be pthread_join()'d
+exit:
+  return ret;
+fail:
+  if (listenSock) free(listenSock); // NOTE(review): freed without ncclSocketClose - confirm no fd stays open when listen/getaddr fails
+  if (args) free(args); // NOTE(review): if pthread_detach fails the running thread also frees args/listenSock - confirm this cannot double free
+  goto exit;
 }
 
 ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
@@ -226,218 +421,419 @@ struct unexConn {
   struct unexConn* next;
 };
 
+struct bootstrapRing_t { // connections to the ring neighbours; which union member is used depends on ncclParamBootstrapNetEnable()
+  union {
+    struct {
+      void *sendComm, *recvComm;
+      ncclNetDeviceHandle_t *sendDevHandle, *recvDevHandle;
+    } net; // ncclNet-based ring: filled by netRingConnect
+    struct {
+      struct ncclSocket recv;
+      struct ncclSocket send;
+    } socket; // socket-based ring: filled by socketRingConnect
+  };
+};
+struct bootstrapListen_t { // listening endpoints owned by a rank
+  struct ncclSocket peerSocket; // socket for peers to contact me in P2P
+  union {
+    struct {
+      int dev;                              // net device chosen by netGetDevice
+      void* comm;                           // listen comm produced by net->listen
+      char handle[NCCL_NET_HANDLE_MAXSIZE]; // handle produced by net->listen, copied into extInfo.connectInfo
+    } net;
+    struct ncclSocket socket; // socket to be used for the ring
+  };
+};
+
 struct bootstrapState {
-  struct ncclSocket listenSock;
-  struct ncclSocket ringRecvSocket;
-  struct ncclSocket ringSendSocket;
-  union ncclSocketAddress* peerCommAddresses;
-  union ncclSocketAddress* peerProxyAddresses;
+  struct bootstrapRing_t ring;     // connections to the ring neighbours
+  struct bootstrapListen_t listen; // listening endpoints of this rank
+  ncclNet_t* net;                  // set from comm->ncclNet in bootstrapInit
   uint64_t* peerProxyAddressesUDS;
+  union ncclSocketAddress* peerProxyAddresses; // one entry per rank, gathered by ringAllInfo
+  union ncclSocketAddress* peerP2pAddresses;   // one entry per rank: address of that rank's listen.peerSocket
   struct unexConn* unexpectedConnections;
   int cudaDev;
   int rank;
   int nranks;
   uint64_t magic;
-  volatile uint32_t *abortFlag;
+  volatile uint32_t* abortFlag;
 };
+#define STATE_RING(s, f) ((s)->ring.f) // field f of the ring member; s is parenthesized so pointer expressions are accepted
+#define STATE_LISTEN(s, f) ((s)->listen.f) // field f of the listen member; s is parenthesized so pointer expressions are accepted
 
-ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm) {
+// helper: create a socket on the bootstrap interface (bootstrapNetIfAddr), start listening and report its address in *addr
+static ncclResult_t createListenSocket(struct ncclComm* comm, uint64_t magic, struct ncclSocket* socket, union ncclSocketAddress* addr,
+                                       ncclSocketType type) {
+  NCCLCHECK(ncclSocketInit(socket, &bootstrapNetIfAddr, magic, type, comm->abortFlag));
+  NCCLCHECK(ncclSocketListen(socket));
+  NCCLCHECK(ncclSocketGetAddr(socket, addr)); // read back the socket's address after listen
+  return ncclSuccess;
+}
+static ncclResult_t getUDS(uint64_t* peerUDS) {
+  uint64_t salt; // random component of the identifier
+  NCCLCHECK(getRandomData(&salt, sizeof(salt)));
+  *peerUDS = salt + getPidHash(); // identifier = pid hash + random salt
+  return ncclSuccess;
+}
+#define MAX_OOB_DEVS 16
+static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
+  // Selects, once per process, the net device used for bootstrap traffic and returns it in *dev.
+  // NCCL_OOB_NET_IFNAME names the device ('^' prefix: negate the match, '=' prefix: exact match); default is device 0.
+  static int devOOB = -1;
+  ncclResult_t ret = ncclSuccess;
+  if (devOOB < 0) {
+    pthread_mutex_lock(&bootstrapNetLock);
+    if (devOOB < 0) {
+      int selected = 0; // default choice is device 0
+      char* userIfEnv = getenv("NCCL_OOB_NET_IFNAME");
+      if (userIfEnv && strlen(userIfEnv) > 0) {
+        INFO(NCCL_BOOTSTRAP | NCCL_ENV, "NCCL_OOB_NET_IFNAME set to %s", userIfEnv);
+        char* ifList = userIfEnv; // parse through a cursor so the warning below can print the full value
+        bool searchNot = ifList[0] == '^';
+        if (searchNot) ifList++;
+        bool searchExact = ifList[0] == '=';
+        if (searchExact) ifList++;
+        struct netIf userIfs[MAX_OOB_DEVS];
+        int nUserIfs = parseStringList(ifList, userIfs, MAX_OOB_DEVS);
+        // loop over the devices and keep the first one matching
+        bool found = false;
+        int nDev = 0;
+        // every failure leaves through the unlock label so that the mutex is never left held
+        NCCLCHECKGOTO(comm->ncclNet->devices(&nDev), ret, unlock);
+        for (int devId = 0; devId < nDev && !found; devId++) {
+          ncclNetProperties_t props;
+          NCCLCHECKGOTO(comm->ncclNet->getProperties(devId, &props), ret, unlock);
+          // check against user specified HCAs/ports
+          found = matchIfList(props.name, props.port, userIfs, nUserIfs, searchExact) ^ searchNot;
+          if (found) selected = devId;
+        }
+        if (!found) {
+          WARN("no device found matching NCCL_OOB_NET_IFNAME=%s, ignoring", userIfEnv);
+        }
+      }
+      devOOB = selected;
+    }
+  unlock:
+    pthread_mutex_unlock(&bootstrapNetLock);
+  }
+  if (devOOB < 0) return ret; // only reachable after a failed selection: ret holds the error, the next call retries
+  *dev = devOOB;
+  return ncclSuccess;
+}
+
+static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* listen, char peerHandle[NCCL_NET_HANDLE_MAXSIZE],
+                                   void** sendComm, ncclNetDeviceHandle_t** sendDevHandle,
+                                   void** recvComm, ncclNetDeviceHandle_t** recvDevHandle, volatile uint32_t* abortFlag) {
+  // connect to the peer behind peerHandle and accept on my own listen comm; each call is repeated until it yields a comm
+  int abortCounter = 0;
+  do {
+    NCCLCHECK(checkAbort(abortFlag, &abortCounter));
+    if (!*sendComm)
+      NCCLCHECK(net->connect(listen->net.dev, peerHandle, sendComm, sendDevHandle));
+    if (!*recvComm)
+      NCCLCHECK(net->accept(listen->net.comm, recvComm, recvDevHandle));
+  } while (!*sendComm || !*recvComm); // loop until both directions are established
+  return ncclSuccess;
+}
+static ncclResult_t socketRingConnect(ncclSocketAddress* addr, struct ncclSocket* sendSocket, struct ncclSocket* listenSock, struct ncclSocket* recvSocket, uint64_t magic, volatile uint32_t* abortFlag) {
+  NCCLCHECK(ncclSocketInit(sendSocket, addr, magic, ncclSocketTypeBootstrap, abortFlag));
+  NCCLCHECK(ncclSocketConnect(sendSocket)); // outgoing side: connect to the peer listening at addr
+  NCCLCHECK(ncclSocketInit(recvSocket));
+  NCCLCHECK(ncclSocketAccept(recvSocket, listenSock)); // incoming side: accept the connection made to my ring listen socket
+  return ncclSuccess;
+}
+static ncclResult_t ringAllInfo(struct ncclComm* comm, struct bootstrapState* state,
+                                union ncclSocketAddress* peerAddresss,
+                                union ncclSocketAddress* peerProxy, uint64_t* peerUDS) {
+  ncclResult_t res = ncclSuccess;
+  int rank = comm->rank;
+  int nRanks = comm->nRanks;
+  struct bootstrapRingData {
+    union ncclSocketAddress peerAddress;
+    union ncclSocketAddress peerProxy;
+    uint64_t peerUDS;
+  }* ringData = NULL;
+  // Gathers up to three per-rank tables with a single allgather; NULL tables are skipped and only entry [rank] is read on input.
+  NCCLCHECK(ncclCalloc(&ringData, nRanks));
+  // pack my own entry of every table into one record
+  if (peerAddresss)
+    memcpy(&(ringData[rank].peerAddress), peerAddresss + rank, sizeof(union ncclSocketAddress));
+  if (peerProxy)
+    memcpy(&(ringData[rank].peerProxy), peerProxy + rank, sizeof(union ncclSocketAddress));
+  if (peerUDS)
+    memcpy(&(ringData[rank].peerUDS), peerUDS + rank, sizeof(uint64_t));
+
+  // allgather the records; on failure res carries the error to the caller
+  NCCLCHECKGOTO(bootstrapAllGather(state, ringData, sizeof(struct bootstrapRingData)), res, exit);
+
+  // unpack every rank's record back into the caller's tables
+  for (int irank = 0; irank < nRanks; ++irank) {
+    if (peerAddresss)
+      memcpy(peerAddresss + irank, &(ringData[irank].peerAddress), sizeof(union ncclSocketAddress));
+    if (peerProxy)
+      memcpy(peerProxy + irank, &(ringData[irank].peerProxy), sizeof(union ncclSocketAddress));
+    if (peerUDS)
+      memcpy(peerUDS + irank, &(ringData[irank].peerUDS), sizeof(uint64_t));
+  }
+
+exit:
+  free(ringData);
+  return res; // report an allgather failure instead of always returning ncclSuccess
+}
+
+static ncclResult_t sendToRoot(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct extInfo* info) { // rank -> root: deliver info to the root at handle->addr
+  ncclResult_t ret = ncclSuccess;
+  struct ncclSocket sock;
+  NCCLCHECK(ncclSocketInit(&sock, &handle->addr, handle->magic, ncclSocketTypeBootstrap, comm->abortFlag)); // a failed init returns directly, nothing to close yet
+  NCCLCHECKGOTO(ncclSocketConnect(&sock), ret, fail);
+  NCCLCHECKGOTO(socketSend(&sock, info, sizeof(struct extInfo)), ret, fail);
+  NCCLCHECK(ncclSocketClose(&sock)); // success path: a failing close is reported to the caller
+  return ret;
+fail:
+  (void)ncclSocketClose(&sock); // error path: best-effort close, the error stored in ret is what gets returned
+  return ret;
+}
+
+NCCL_PARAM(StaggerRate, "UID_STAGGER_RATE", 7000);
+NCCL_PARAM(StaggerThreshold, "UID_STAGGER_THRESHOLD", 256);
+
+ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
   int rank = comm->rank;
   int nranks = comm->nRanks;
+  // char nextPeerHandle[NCCL_NET_HANDLE_MAXSIZE];
   struct bootstrapState* state;
   struct ncclSocket* proxySocket;
-  ncclSocketAddress nextAddr;
   struct ncclSocket sock, listenSockRoot;
-  struct extInfo info = { 0 };
+  struct extInfo info = {0};
+  union ringConnectInfo nextPeer;
+
+  uint64_t timers[BOOTSTRAP_INIT_TIME_N] = {0};
 
   NCCLCHECK(ncclCalloc(&state, 1));
   state->rank = rank;
   state->nranks = nranks;
+  state->cudaDev = comm->cudaDev;
   state->abortFlag = comm->abortFlag;
+  state->net = comm->ncclNet;
   comm->bootstrap = state;
-  comm->magic = state->magic = handle->magic;
+  comm->magic = state->magic = BOOTSTRAP_HANDLE(handles, 0)->magic; // state and comm magic set to the first magic ID
 
-  TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
+  TRACE(NCCL_BOOTSTRAP, "rank %d nranks %d", rank, nranks);
 
-  info.rank = rank;
+  BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_TIME_TOTAL]);
+  // fill up the info
   info.nranks = nranks;
-  // Create socket for other ranks to contact me
-  NCCLCHECK(ncclSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag));
-  NCCLCHECK(ncclSocketListen(&state->listenSock));
-  NCCLCHECK(ncclSocketGetAddr(&state->listenSock, &info.extAddressListen));
-
-  // Create socket for root to contact me
-  NCCLCHECK(ncclSocketInit(&listenSockRoot, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag));
-  NCCLCHECK(ncclSocketListen(&listenSockRoot));
-  NCCLCHECK(ncclSocketGetAddr(&listenSockRoot, &info.extAddressListenRoot));
+  info.nroots = nHandles;
+  // get the ring connection info
+  memset(&nextPeer, 0, sizeof(union ringConnectInfo));
+  BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_TIME_CREATE]);
+  if (ncclParamBootstrapNetEnable()) {
+    // Create net interface for other ranks to contact me (all gather)
+    NCCLCHECK(netGetDevice(rank, comm, &STATE_LISTEN(state, net.dev)));
+    NCCLCHECK(state->net->listen(STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)));
+    memcpy(info.connectInfo.handle, STATE_LISTEN(state, net.handle), NCCL_NET_HANDLE_MAXSIZE);
+  } else {
+    // create socket for ring neightbor to contact mee
+    NCCLCHECK(createListenSocket(comm, comm->magic, &STATE_LISTEN(state, socket), &info.connectInfo.addr, ncclSocketTypeBootstrap));
+  }
+  // Create socket for root to contact me using the root's magic
+  int curr_root = rootIdFromRank(rank, nranks, nHandles);
+  NCCLCHECK(createListenSocket(comm, BOOTSTRAP_HANDLE(handles, curr_root)->magic, &listenSockRoot, &info.listenRootAddress, ncclSocketTypeBootstrap));
+  BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_TIME_CREATE]);
 
   // stagger connection times to avoid an overload of the root
-  if (nranks > 128) {
-    long msec = rank;
+  BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_TIME_DELAY]);
+  int nRankRoot = nRankFromRoot(curr_root, nranks, nHandles);
+  if (nRankRoot > ncclParamStaggerThreshold()) {
+    // for socket the message rate in microsec
+    double msg_rate = ncclParamStaggerRate() / 1.0e6;
+    long musec = localIdFromRoot(rank, curr_root, nranks, nHandles) / msg_rate;
     struct timespec tv;
-    tv.tv_sec = msec / 1000;
-    tv.tv_nsec = 1000000 * (msec % 1000);
-    TRACE(NCCL_INIT, "rank %d delaying connection to root by %ld msec", rank, msec);
-    (void) nanosleep(&tv, NULL);
+    long c_1e6 = 1e6;
+    tv.tv_sec = musec / c_1e6;
+    tv.tv_nsec = 1e3 * (musec % c_1e6);
+    TRACE(NCCL_BOOTSTRAP, "rank %d delaying connection to root by %ld microsec", rank, musec);
+    (void)nanosleep(&tv, NULL);
   }
+  BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_TIME_DELAY]);
 
   // send info on my listening socket to root
-  NCCLCHECK(ncclSocketInit(&sock, &handle->addr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag));
-  NCCLCHECK(ncclSocketConnect(&sock));
-  NCCLCHECK(bootstrapNetSend(&sock, &info, sizeof(info)));
-  NCCLCHECK(ncclSocketClose(&sock));
+  BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_TIME_SEND]);
+  // send contact info to my own root
+  info.rank = rank;
+  info.iroot = curr_root;
+  NCCLCHECK(sendToRoot(BOOTSTRAP_HANDLE(handles, curr_root), comm, &info));
+  // if needed, send the connection info to the previous root
+  if (nHandles > 1 && isFirstFromRoot(rank, curr_root, nranks, nHandles)) {
+    int prev_rank = BOOTSTRAP_PID(rank - 1, nranks);
+    int prev_root = rootIdFromRank(prev_rank, nranks, nHandles);
+    info.rank = prev_rank + 1; // my rank as seen by the previous root
+    info.iroot = prev_root;
+    NCCLCHECK(sendToRoot(BOOTSTRAP_HANDLE(handles, prev_root), comm, &info));
+  }
+  BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_TIME_SEND]);
 
   // get info on my "next" rank in the bootstrap ring from root
+  BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_TIME_RECV]);
   NCCLCHECK(ncclSocketInit(&sock));
   NCCLCHECK(ncclSocketAccept(&sock, &listenSockRoot));
-  NCCLCHECK(bootstrapNetRecv(&sock, &nextAddr, sizeof(union ncclSocketAddress)));
+  NCCLCHECK(socketRecv(&sock, &nextPeer, sizeof(nextPeer)));
   NCCLCHECK(ncclSocketClose(&sock));
   NCCLCHECK(ncclSocketClose(&listenSockRoot));
+  BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_TIME_RECV]);
 
-  NCCLCHECK(ncclSocketInit(&state->ringSendSocket, &nextAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag));
-  NCCLCHECK(ncclSocketConnect(&state->ringSendSocket));
-  // Accept the connect request from the previous rank in the AllGather ring
-  NCCLCHECK(ncclSocketInit(&state->ringRecvSocket));
-  NCCLCHECK(ncclSocketAccept(&state->ringRecvSocket, &state->listenSock));
+  // accept and connect the ring network
+  if (ncclParamBootstrapNetEnable()) {
+    NCCLCHECK(netRingConnect(state->net, &state->listen, nextPeer.handle,
+                             &STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle),
+                             &STATE_RING(state, net.recvComm), &STATE_RING(state, net.recvDevHandle), state->abortFlag));
+  } else {
+    NCCLCHECK(socketRingConnect(&nextPeer.addr, &STATE_RING(state, socket.send), &STATE_LISTEN(state, socket), &STATE_RING(state, socket.recv), comm->magic, state->abortFlag));
+  }
 
   // AllGather all listen handlers
-  NCCLCHECK(ncclCalloc(&state->peerCommAddresses, nranks));
-  NCCLCHECK(ncclSocketGetAddr(&state->listenSock, state->peerCommAddresses+rank));
-  NCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union ncclSocketAddress)));
-
-  // Create the service proxy
+  // in case of failure, those resources will be freed when calling bootstrapDestroy, so we can return immediately
   NCCLCHECK(ncclCalloc(&state->peerProxyAddresses, nranks));
-  NCCLCHECK(ncclCalloc(&state->peerProxyAddressesUDS, nranks));
-
-  // proxy is aborted through a message; don't set abortFlag
   NCCLCHECK(ncclCalloc(&proxySocket, 1));
-  NCCLCHECK(ncclSocketInit(proxySocket, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeProxy, comm->abortFlag));
-  NCCLCHECK(ncclSocketListen(proxySocket));
-  NCCLCHECK(ncclSocketGetAddr(proxySocket, state->peerProxyAddresses+rank));
-  NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)));
-  // cuMem UDS support
-  // Make sure we create a unique UDS socket name
-  uint64_t randId;
-  NCCLCHECK(getRandomData(&randId, sizeof(randId)));
-  state->peerProxyAddressesUDS[rank] = getPidHash()+randId;
-  NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddressesUDS, sizeof(*state->peerProxyAddressesUDS)));
+  NCCLCHECK(createListenSocket(comm, comm->magic, proxySocket, state->peerProxyAddresses + rank, ncclSocketTypeProxy));
+
+  NCCLCHECK(ncclCalloc(&state->peerProxyAddressesUDS, nranks));
+  NCCLCHECK(getUDS(state->peerProxyAddressesUDS + rank));
+
+  // create a socket for others to reach out (P2P)
+  union ncclSocketAddress peerSocketAddress;
+  NCCLCHECK(createListenSocket(comm, comm->magic, &STATE_LISTEN(state, peerSocket), &peerSocketAddress, ncclSocketTypeBootstrap));
+  NCCLCHECK(ncclCalloc(&state->peerP2pAddresses, nranks * sizeof(union ncclSocketAddress)));
+  memcpy(state->peerP2pAddresses + rank, &peerSocketAddress, sizeof(union ncclSocketAddress));
+
+  BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_TIME_RING]);
+  NCCLCHECK(ringAllInfo(comm, state, state->peerP2pAddresses, state->peerProxyAddresses, state->peerProxyAddressesUDS));
+  BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_TIME_RING]);
+
+  // Create the service proxy and get the UDS
   NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS));
 
-  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
+  BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_TIME_TOTAL]);
+  TRACE(NCCL_BOOTSTRAP, "rank %d nranks %d - DONE", rank, nranks);
+  INFO(NCCL_BOOTSTRAP | NCCL_PROFILE, "Bootstrap timings total %f (create %f, send %f, recv %f, ring %f, delay %f)", timers[BOOTSTRAP_INIT_TIME_TOTAL] / 1e9,
+       timers[BOOTSTRAP_INIT_TIME_CREATE] / 1e9,
+       timers[BOOTSTRAP_INIT_TIME_SEND] / 1e9,
+       timers[BOOTSTRAP_INIT_TIME_RECV] / 1e9,
+       timers[BOOTSTRAP_INIT_TIME_RING] / 1e9,
+       timers[BOOTSTRAP_INIT_TIME_DELAY] / 1e9);
 
   return ncclSuccess;
 }
 
-ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks) {
+ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks) {
   ncclResult_t ret = ncclSuccess;
   int rank = comm->rank;
   int nranks = comm->nRanks;
   int prev, next;
-  ncclSocketAddress listenAddr, tmpAddr;
-  struct ncclSocket* proxySocket;
+  union ringConnectInfo info;
+  union ringConnectInfo nextPeer;
+  struct ncclSocket* proxySocket = NULL;
   struct bootstrapState* state;
 
   NCCLCHECKGOTO(ncclCalloc(&state, 1), ret, fail);
   state->rank = rank;
   state->nranks = nranks;
+  state->cudaDev = comm->cudaDev;
   state->abortFlag = comm->abortFlag;
+  state->net = comm->ncclNet;
   comm->bootstrap = state;
-  comm->magic = state->magic = handle->magic;
+  comm->magic = state->magic = magic;
 
-  prev = parentRanks[(rank-1+nranks)%nranks];
-  next = parentRanks[(rank+1)%nranks];
+  prev = parentRanks[(rank - 1 + nranks) % nranks];
+  next = parentRanks[(rank + 1) % nranks];
 
-  // Setup my sockets for the allgather ring and other p2p connections
-  NCCLCHECKGOTO(ncclSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag, 0), ret, fail);
-  NCCLCHECKGOTO(ncclSocketInit(&state->ringRecvSocket, NULL, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag, 0), ret, fail);
+  // create a handle for the others to reach out to me
+  if (ncclParamBootstrapNetEnable()) {
+    NCCLCHECKGOTO(netGetDevice(rank, comm, &STATE_LISTEN(state, net.dev)), ret, fail);
+    NCCLCHECKGOTO(state->net->listen(STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)), ret, fail);
+    memcpy(info.handle, STATE_LISTEN(state, net.handle), NCCL_NET_HANDLE_MAXSIZE);
+  } else {
+    // create socket for ring neighbor to contact me
+    NCCLCHECK(createListenSocket(comm, comm->magic, &STATE_LISTEN(state, socket), &info.addr, ncclSocketTypeBootstrap));
+  }
+  // create a socket for others to reach out (P2P)
+  union ncclSocketAddress peerSocketAddress;
+  NCCLCHECK(createListenSocket(comm, comm->magic, &STATE_LISTEN(state, peerSocket), &peerSocketAddress, ncclSocketTypeBootstrap));
 
-  // Create socket for other ranks to contact me
-  NCCLCHECKGOTO(ncclSocketListen(&state->listenSock), ret, fail);
-
-  // Get addr from next rank
-  NCCLCHECKGOTO(ncclSocketGetAddr(&state->listenSock, &listenAddr), ret, fail);
-  NCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, -2, &listenAddr, sizeof(union ncclSocketAddress)), ret, fail);
-  NCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, -2, &tmpAddr, sizeof(union ncclSocketAddress)), ret, fail);
-
-  NCCLCHECKGOTO(ncclSocketInit(&state->ringSendSocket, &tmpAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag, 0), ret, fail);
-  NCCLCHECKGOTO(ncclSocketConnect(&state->ringSendSocket), ret, fail);
-  // Accept the connect request from the previous rank in the AllGather ring
-  NCCLCHECKGOTO(ncclSocketAccept(&state->ringRecvSocket, &state->listenSock), ret, fail);
-
-  // AllGather all listen handlers
-  NCCLCHECKGOTO(ncclCalloc(&state->peerCommAddresses, nranks), ret, fail);
-  memcpy(state->peerCommAddresses+rank, &listenAddr, sizeof(union ncclSocketAddress));
-  NCCLCHECKGOTO(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union ncclSocketAddress)), ret, fail);
+  // Get addr from next rank using the parent's connections
+  NCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, BOOTSTRAP_TAG_COMMSPLIT, &info, sizeof(union ringConnectInfo)), ret, fail);
+  NCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, BOOTSTRAP_TAG_COMMSPLIT, &nextPeer, sizeof(union ringConnectInfo)), ret, fail);
+  if (ncclParamBootstrapNetEnable()) {
+    NCCLCHECKGOTO(netRingConnect(state->net, &state->listen, nextPeer.handle,
+                                 &STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle),
+                                 &STATE_RING(state, net.recvComm), &STATE_RING(state, net.recvDevHandle), state->abortFlag),
+                  ret, fail);
+  } else {
+    NCCLCHECK(socketRingConnect(&nextPeer.addr, &STATE_RING(state, socket.send), &STATE_LISTEN(state, socket), &STATE_RING(state, socket.recv), comm->magic, state->abortFlag));
+  }
 
+  NCCLCHECKGOTO(ncclCalloc(&state->peerP2pAddresses, nranks * sizeof(union ncclSocketAddress)), ret, fail);
+  memcpy(state->peerP2pAddresses + rank, &peerSocketAddress, sizeof(union ncclSocketAddress));
   if (parent->config.splitShare) {
     /* map local rank to top parent local rank. */
     for (int i = 0; i < nranks; ++i) {
       comm->topParentRanks[i] = parent->topParentRanks[parentRanks[i]];
     }
+    NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, NULL, NULL), ret, fail);
   } else {
-    // Create the service proxy
     NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddresses, nranks), ret, fail);
-    NCCLCHECKGOTO(ncclCalloc(&proxySocket, 1), ret, fail);
-    NCCLCHECKGOTO(ncclSocketInit(proxySocket, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeProxy, comm->abortFlag, 0), ret, fail);
-    NCCLCHECKGOTO(ncclSocketListen(proxySocket), ret, fail);
-    NCCLCHECKGOTO(ncclSocketGetAddr(proxySocket, &tmpAddr), ret, fail);
-    memcpy(state->peerProxyAddresses + rank, &tmpAddr, sizeof(union ncclSocketAddress));
-    NCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)), ret, fail);
-    // cuMem UDS support
     NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddressesUDS, nranks), ret, fail);
-    // Make sure we create a unique UDS socket name
-    uint64_t randId;
-    NCCLCHECKGOTO(getRandomData(&randId, sizeof(randId)), ret, fail);
-    state->peerProxyAddressesUDS[rank] = getPidHash()+randId;
-    NCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddressesUDS, sizeof(*state->peerProxyAddressesUDS)), ret, fail);
+    // Create the service proxy and get the UDS
+    NCCLCHECKGOTO(ncclCalloc(&proxySocket, 1), ret, fail);
+    NCCLCHECKGOTO(getUDS(state->peerProxyAddressesUDS + rank), ret, fail);
+    NCCLCHECKGOTO(createListenSocket(comm, comm->magic, proxySocket, state->peerProxyAddresses + rank, ncclSocketTypeProxy), ret, fail);
+    NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, state->peerProxyAddresses, state->peerProxyAddressesUDS), ret, fail);
     NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS), ret, fail);
   }
 
-  INFO(NCCL_INIT, "bootstrapSplit: comm %p parent %p rank %d nranks %d color %d key %d prev %d next %d - DONE", comm, parent, rank, nranks, color, key, prev, next);
+  TRACE(NCCL_BOOTSTRAP, "bootstrapSplit: comm %p parent %p rank %d nranks %d color %d key %d prev %d next %d - DONE", comm, parent, rank, nranks,
+        color, key, prev, next);
 
 exit:
   return ret;
 fail:
+  free(proxySocket);
   goto exit;
 }
 
-// Bootstrap send/receive functions
-//
-// We do not keep connections opened with all ranks at all times, and we have no guarantee
-// that connections to our unique listen socket will arrive in the same order as we need
-// them. Therefore, when establishing a connection, the sender sends a (peer, tag) tuple to
-// allow the receiver to identify the flow, and keep it in an unexpected queue if needed.
-
-ncclResult_t bootstrapConnect(void* commState, int peer, int tag, struct ncclSocket* sock) {
+struct socketAckInfo {
+  int rank;
+  int tag;
+};
+static ncclResult_t socketConnect(void* commState, int peer, int tag, struct ncclSocket* sock) {
   ncclResult_t ret = ncclSuccess;
   struct bootstrapState* state = (struct bootstrapState*)commState;
 
-  NCCLCHECKGOTO(ncclSocketInit(sock, state->peerCommAddresses+peer, state->magic, ncclSocketTypeBootstrap), ret, fail);
+  struct socketAckInfo ack = (struct socketAckInfo){.rank = state->rank, .tag = tag};
+  NCCLCHECKGOTO(ncclSocketInit(sock, state->peerP2pAddresses + peer, state->magic, ncclSocketTypeBootstrap), ret, fail);
   NCCLCHECKGOTO(ncclSocketConnect(sock), ret, fail);
-  NCCLCHECKGOTO(bootstrapNetSend(sock, &state->rank, sizeof(int)), ret, fail);
-  NCCLCHECKGOTO(bootstrapNetSend(sock, &tag, sizeof(int)), ret, fail);
+  NCCLCHECKGOTO(socketSend(sock, &ack, sizeof(struct socketAckInfo)), ret, fail);
   return ncclSuccess;
 fail:
-  NCCLCHECK(ncclSocketClose(sock));
+  (void)ncclSocketClose(sock);
   return ret;
 }
-
 ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) {
   ncclResult_t ret = ncclSuccess;
   struct ncclSocket sock;
-
   TRACE(NCCL_BOOTSTRAP, "Sending to peer=%d tag=%d size=%d", peer, tag, size);
-  NCCLCHECK(bootstrapConnect(commState, peer, tag, &sock));
-  NCCLCHECKGOTO(bootstrapNetSend(&sock, data, size), ret, exit);
-
+  NCCLCHECK(socketConnect(commState, peer, tag, &sock));
+  NCCLCHECKGOTO(socketSend(&sock, data, size), ret, fail);
   TRACE(NCCL_BOOTSTRAP, "Sent to peer=%d tag=%d size=%d", peer, tag, size);
-
-exit:
   NCCLCHECK(ncclSocketClose(&sock));
   return ret;
+fail:
+  (void)ncclSocketClose(&sock);
+  return ret;
 }
-
-ncclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock) {
+// Bootstrap send/receive functions
+static ncclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock) {
   // New unex
   struct unexConn* unex;
   NCCLCHECK(ncclCalloc(&unex, 1));
@@ -455,8 +851,7 @@ ncclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag,
   list->next = unex;
   return ncclSuccess;
 }
-
-ncclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock, int* found) {
+static ncclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock, int* found) {
   struct unexConn* elem = state->unexpectedConnections;
   struct unexConn* prev = NULL;
   *found = 0;
@@ -491,10 +886,9 @@ static void unexpectedFree(struct bootstrapState* state) {
 }
 
 // We can't know who we'll receive from, so we need to receive everything at once
-ncclResult_t bootstrapAccept(void* commState, int peer, int tag, struct ncclSocket* sock) {
+static ncclResult_t socketAccept(void* commState, int peer, int tag, struct ncclSocket* sock) {
   ncclResult_t ret = ncclSuccess;
   struct bootstrapState* state = (struct bootstrapState*)commState;
-  int newPeer, newTag;
 
   // Search unexpected connections first
   int found;
@@ -503,128 +897,203 @@ ncclResult_t bootstrapAccept(void* commState, int peer, int tag, struct ncclSock
 
   // Then look for new connections
   while (1) {
+    struct socketAckInfo ack = {0};
     NCCLCHECKGOTO(ncclSocketInit(sock), ret, fail);
-    NCCLCHECKGOTO(ncclSocketAccept(sock, &state->listenSock), ret, fail);
-    NCCLCHECKGOTO(bootstrapNetRecv(sock, &newPeer, sizeof(int)), ret, fail);
-    NCCLCHECKGOTO(bootstrapNetRecv(sock, &newTag, sizeof(int)), ret, fail);
-    if (newPeer == peer && newTag == tag) return ncclSuccess;
-    NCCLCHECKGOTO(unexpectedEnqueue(state, newPeer, newTag, sock), ret, fail);
+    NCCLCHECKGOTO(ncclSocketAccept(sock, &STATE_LISTEN(state, peerSocket)), ret, fail);
+    NCCLCHECKGOTO(socketRecv(sock, &ack, sizeof(struct socketAckInfo)), ret, fail);
+    if (ack.rank == peer && ack.tag == tag) return ncclSuccess;
+    NCCLCHECKGOTO(unexpectedEnqueue(state, ack.rank, ack.tag, sock), ret, fail);
   }
   return ncclSuccess;
 fail:
-  NCCLCHECK(ncclSocketClose(sock));
+  (void)ncclSocketClose(sock);
   return ret;
 }
-
 // We can't know who we'll receive from, so we need to receive everything at once
 ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) {
   ncclResult_t ret;
   struct ncclSocket sock;
-  NCCLCHECK(bootstrapAccept(commState, peer, tag, &sock));
+  NCCLCHECK(socketAccept(commState, peer, tag, &sock));
   TRACE(NCCL_BOOTSTRAP, "Receiving tag=%d peer=%d size=%d", tag, peer, size);
-  NCCLCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, exit);
-exit:
+  NCCLCHECKGOTO(socketRecv(&sock, ((char*)data), size), ret, fail);
   NCCLCHECK(ncclSocketClose(&sock));
   return ret;
+fail:
+  (void)ncclSocketClose(&sock);
+  return ret;
 }
 
-// Collective algorithms, based on bootstrapSend/Recv, and sometimes bootstrapConnect/Accept
-
-ncclResult_t bootstrapRingAllGather(struct ncclSocket* prevSocket, struct ncclSocket* nextSocket, int rank, int nranks, char* data, int size) {
+static ncclResult_t netRingAllGather(ncclNet_t* net, void* sendComm, void* recvComm, int rank, int nranks, char* data, int size, volatile uint32_t* abortFlag) {
+  ncclResult_t res;
+  uint64_t tFirst = 0, tRest = 0;
+  void* sendDataHandle = NULL;
+  void* recvDataHandle = NULL;
+  NCCLCHECKGOTO(netReg(net, sendComm, data, nranks * size, &sendDataHandle), res, exit);
+  NCCLCHECKGOTO(netReg(net, recvComm, data, nranks * size, &recvDataHandle), res, exit);
   /* Simple ring based AllGather
    * At each step i receive data from (rank-i-1) from prev
    * and send previous step's data from (rank-i) to next
    */
-  for (int i=0; i<nranks-1; i++) {
+  TRACE(NCCL_BOOTSTRAP, "NetRingAllGather started");
+  BOOTSTRAP_PROF_OPEN(tFirst);
+  for (int i = 0; i < nranks - 1; i++) {
+    int tag = i;
     size_t rslice = (rank - i - 1 + nranks) % nranks;
     size_t sslice = (rank - i + nranks) % nranks;
-
-    // Send slice to the right, recv slice from the left
-    NCCLCHECK(bootstrapNetSendRecv(nextSocket, data+sslice*size, size, prevSocket, data+rslice*size, size));
+    void* recv_data = data + rslice * size;
+    void* send_data = data + sslice * size;
+    NCCLCHECKGOTO(netSendRecv(net, sendComm, send_data, size, sendDataHandle, recvComm, recv_data, size, recvDataHandle, tag, abortFlag), res, exit);
+    if (i == 0) {
+      BOOTSTRAP_PROF_CLOSE(tFirst);
+      BOOTSTRAP_PROF_OPEN(tRest);
+    }
   }
-  return ncclSuccess;
+  BOOTSTRAP_PROF_CLOSE(tRest);
+  TRACE(NCCL_BOOTSTRAP | NCCL_PROFILE, "netRingAllGather first message in %f (%f MB/sec), rest in %f (%f MB/sec)", tFirst / 1e9, (size / 1e6) / (tFirst / 1e9), tRest / 1e9, (nranks - 1) * (size / 1e6) / (tRest / 1e9));
+exit:
+  // do not fail in case of error, try to deregister as much as possible
+  if (sendDataHandle) netDereg(net, sendComm, &sendDataHandle);
+  if (recvDataHandle) netDereg(net, recvComm, &recvDataHandle);
+  return res;
+}
+static ncclResult_t socketRingAllGather(struct ncclSocket* sendSock, struct ncclSocket* recvSock, int rank, int nranks, char* data, int size) {
+  ncclResult_t res = ncclSuccess;
+  uint64_t tFirst = 0, tRest = 0;
+  /* Simple ring based AllGather
+   * At each step i receive data from (rank-i-1) from prev
+   * and send previous step's data from (rank-i) to next
+   */
+  TRACE(NCCL_BOOTSTRAP, "socketRingAllGather started");
+  BOOTSTRAP_PROF_OPEN(tFirst);
+  for (int i = 0; i < nranks - 1; i++) {
+    size_t rslice = (rank - i - 1 + nranks) % nranks;
+    size_t sslice = (rank - i + nranks) % nranks;
+    void* recv_data = data + rslice * size;
+    void* send_data = data + sslice * size;
+    NCCLCHECKGOTO(socketSendRecv(sendSock, send_data, size, recvSock, recv_data, size), res, exit);
+    if (i == 0) {
+      BOOTSTRAP_PROF_CLOSE(tFirst);
+      BOOTSTRAP_PROF_OPEN(tRest);
+    }
+  }
+  BOOTSTRAP_PROF_CLOSE(tRest);
+  TRACE(NCCL_BOOTSTRAP | NCCL_PROFILE, "socketRingAllGather first message in %f (%f MB/sec), rest in %f (%f MB/sec)", tFirst / 1e9, (size / 1e6) / (tFirst / 1e9), tRest / 1e9, (nranks - 1) * (size / 1e6) / (tRest / 1e9));
+exit:
+  return res;
 }
 ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
+  ncclResult_t res = ncclSuccess;
   struct bootstrapState* state = (struct bootstrapState*)commState;
   int rank = state->rank;
   int nranks = state->nranks;
 
-  TRACE(NCCL_INIT, "rank %d nranks %d size %d", rank, nranks, size);
+  TRACE(NCCL_BOOTSTRAP, "rank %d nranks %d size %d - AllGather", rank, nranks, size);
 
-  NCCLCHECK(bootstrapRingAllGather(&state->ringRecvSocket, &state->ringSendSocket, rank, nranks, (char*)allData, size));
-
-  TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
-  return ncclSuccess;
+  uint64_t time = 0;
+  BOOTSTRAP_PROF_OPEN(time);
+  if (ncclParamBootstrapNetEnable()) {
+    NCCLCHECKGOTO(netRingAllGather(state->net, STATE_RING(state, net.sendComm), STATE_RING(state, net.recvComm), rank, nranks, (char*)allData, size, state->abortFlag), res, exit);
+  } else {
+    NCCLCHECKGOTO(socketRingAllGather(&STATE_RING(state, socket.send), &STATE_RING(state, socket.recv), rank, nranks, (char*)allData, size), res, exit);
+  }
+exit:
+  BOOTSTRAP_PROF_CLOSE(time);
+  TRACE(NCCL_BOOTSTRAP | NCCL_PROFILE, "bootstrapAllGather for %d B done in %f sec: %f MB/sec", size, time / 1e9, (nranks * size / 1e6) / (time / 1e9));
+  TRACE(NCCL_BOOTSTRAP, "rank %d nranks %d size %d - AllGather DONE", rank, nranks, size);
+  return res;
 }
 
-ncclResult_t bootstrapIntraNodeBarrier(void* commState, int *ranks, int rank, int nranks, int tag) {
-  if (nranks == 1) return ncclSuccess;
-  TRACE(NCCL_INIT, "rank %d nranks %d tag %x - ENTER", rank, nranks, tag);
-
+static ncclResult_t bootstrapP2PBarrier(void* commState, int* ranks, int rank, int nranks, int tag) {
+  if (nranks == 1)
+    return ncclSuccess;
   /* Simple [intra] process barrier
    *
    * Based on the dissemination algorithm by Debra Hensgen, Raphael Finkel, and Udi Manbet,
    * "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988"
    */
   int data[1];
-  for (int mask=1; mask<nranks; mask<<=1) {
+  for (int mask = 1; mask < nranks; mask <<= 1) {
     int src = (rank - mask + nranks) % nranks;
     int dst = (rank + mask) % nranks;
     NCCLCHECK(bootstrapSend(commState, ranks ? ranks[dst] : dst, tag, data, sizeof(data)));
     NCCLCHECK(bootstrapRecv(commState, ranks ? ranks[src] : src, tag, data, sizeof(data)));
   }
+  return ncclSuccess;
+}
 
-  TRACE(NCCL_INIT, "rank %d nranks %d tag %x - DONE", rank, nranks, tag);
+ncclResult_t bootstrapIntraNodeBarrier(void* commState, int* ranks, int rank, int nranks, int tag) {
+  uint64_t time = 0;
+  BOOTSTRAP_PROF_OPEN(time);
+  NCCLCHECK(bootstrapP2PBarrier(commState, ranks, rank, nranks, tag));
+  BOOTSTRAP_PROF_CLOSE(time);
+  TRACE(NCCL_BOOTSTRAP | NCCL_PROFILE, "bootstrapIntraNodeBarrier done in %f sec", time / 1e9);
   return ncclSuccess;
 }
 
 ncclResult_t bootstrapBarrier(void* commState, int rank, int nranks, int tag) {
-  return bootstrapIntraNodeBarrier(commState, NULL, rank, nranks, tag);
+  uint64_t time = 0;
+  BOOTSTRAP_PROF_OPEN(time);
+  NCCLCHECK(bootstrapP2PBarrier(commState, NULL, rank, nranks, tag));
+  BOOTSTRAP_PROF_CLOSE(time);
+  TRACE(NCCL_BOOTSTRAP | NCCL_PROFILE, "bootstrapBarrier done in %f sec", time / 1e9);
+  return ncclSuccess;
 }
 
-ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size) {
+ncclResult_t bootstrapIntraNodeAllGather(void* commState, int* ranks, int rank, int nranks, void* allData, int size) {
   if (nranks == 1) return ncclSuccess;
   TRACE(NCCL_INIT, "rank %d nranks %d size %d - ENTER", rank, nranks, size);
 
-  int prevRank = ranks[(rank - 1 + nranks)%nranks];
+  int prevRank = ranks[(rank - 1 + nranks) % nranks];
   int nextRank = ranks[(rank + 1) % nranks];
-  struct ncclSocket prevSocket, nextSocket;
-  NCCLCHECK(bootstrapConnect(commState, nextRank, 0, &nextSocket));
-  NCCLCHECK(bootstrapAccept(commState, prevRank, 0, &prevSocket));
+  // intraNode bootstrap is done de facto using the socket-based implementation
+  struct ncclSocket recvSocket, sendSocket;
+  NCCLCHECK(socketConnect(commState, nextRank, BOOTSTRAP_TAG_INTRANODE_ALLGATHER, &sendSocket));
+  NCCLCHECK(socketAccept(commState, prevRank, BOOTSTRAP_TAG_INTRANODE_ALLGATHER, &recvSocket));
 
-  NCCLCHECK(bootstrapRingAllGather(&prevSocket, &nextSocket, rank, nranks, (char*)allData, size));
+  NCCLCHECK(socketRingAllGather(&sendSocket, &recvSocket, rank, nranks, (char*)allData, size));
 
-  NCCLCHECK(ncclSocketClose(&nextSocket));
-  NCCLCHECK(ncclSocketClose(&prevSocket));
+  NCCLCHECK(ncclSocketClose(&sendSocket));
+  NCCLCHECK(ncclSocketClose(&recvSocket));
 
   TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
   return ncclSuccess;
 }
 
 // [IntraNode] in-place Broadcast
-ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size) {
+static ncclResult_t bootstrapP2PBroadcast(void* commState, int* ranks, int rank, int nranks, int root, void* bcastData, int size) {
   if (nranks == 1) return ncclSuccess;
-  TRACE(NCCL_INIT, "rank %d nranks %d root %d size %d - ENTER", rank, nranks, root, size);
-
   if (rank == root) {
-    for (int i=0; i<nranks; i++) {
+    for (int i = 0; i < nranks; i++) {
       if (i != root) NCCLCHECK(bootstrapSend(commState, ranks ? ranks[i] : i, /*tag=*/ranks ? ranks[i] : i, bcastData, size));
     }
-  }
-  else {
+  } else {
     NCCLCHECK(bootstrapRecv(commState, ranks ? ranks[root] : root, /*tag=*/ranks ? ranks[rank] : rank, bcastData, size));
   }
-
-  TRACE(NCCL_INIT, "rank %d nranks %d root %d size %d - DONE", rank, nranks, root, size);
   return ncclSuccess;
 }
 
+ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int* ranks, int rank, int nranks, int root, void* bcastData, int size) {
+  uint64_t time = 0;
+  BOOTSTRAP_PROF_OPEN(time);
+  NCCLCHECK(bootstrapP2PBroadcast(commState, ranks, rank, nranks, root, bcastData, size));
+  BOOTSTRAP_PROF_CLOSE(time);
+  TRACE(NCCL_BOOTSTRAP | NCCL_PROFILE, "bootstrapIntraNodeBroadcast for %d B done in %f sec: %f MB/sec", size, time / 1e9, (nranks * size / 1e6) / (time / 1e9));
+  return ncclSuccess;
+}
 ncclResult_t bootstrapBroadcast(void* commState, int rank, int nranks, int root, void* bcastData, int size) {
-  return bootstrapIntraNodeBroadcast(commState, NULL, rank, nranks, root, bcastData, size);
+  uint64_t time = 0;
+  BOOTSTRAP_PROF_OPEN(time);
+  NCCLCHECK(bootstrapP2PBroadcast(commState, NULL, rank, nranks, root, bcastData, size));
+  BOOTSTRAP_PROF_CLOSE(time);
+  TRACE(NCCL_BOOTSTRAP | NCCL_PROFILE, "bootstrapBroadcast done in %f sec", time / 1e9);
+  return ncclSuccess;
 }
 
 ncclResult_t bootstrapClose(void* commState) {
+  if (commState == NULL)
+    return ncclSuccess;
   struct bootstrapState* state = (struct bootstrapState*)commState;
+  // close unexpected connections and return an error if we are not aborting and operations are still pending
   if (state->unexpectedConnections != NULL) {
     unexpectedFree(state);
     if (__atomic_load_n(state->abortFlag, __ATOMIC_ACQUIRE) == 0) {
@@ -632,26 +1101,31 @@ ncclResult_t bootstrapClose(void* commState) {
       return ncclInternalError;
     }
   }
+  if (ncclParamBootstrapNetEnable()) {
+    NCCLCHECK(state->net->closeSend(STATE_RING(state, net.sendComm)));
+    NCCLCHECK(state->net->closeRecv(STATE_RING(state, net.recvComm)));
+    NCCLCHECK(state->net->closeListen(STATE_LISTEN(state, net.comm)));
+  } else {
+    NCCLCHECK(ncclSocketClose(&STATE_RING(state, socket.send)));
+    NCCLCHECK(ncclSocketClose(&STATE_RING(state, socket.recv)));
+    NCCLCHECK(ncclSocketClose(&STATE_LISTEN(state, socket)));
+  }
+  // close the p2p socket
+  NCCLCHECK(ncclSocketClose(&STATE_LISTEN(state, peerSocket)));
 
-  NCCLCHECK(ncclSocketClose(&state->listenSock));
-  NCCLCHECK(ncclSocketClose(&state->ringSendSocket));
-  NCCLCHECK(ncclSocketClose(&state->ringRecvSocket));
-
-  free(state->peerCommAddresses);
+  // proxy things are freed elsewhere
+  free(state->peerP2pAddresses);
   free(state);
-
   return ncclSuccess;
 }
 
 ncclResult_t bootstrapAbort(void* commState) {
+  if (commState == NULL)
+    return ncclSuccess;
   struct bootstrapState* state = (struct bootstrapState*)commState;
-  if (commState == NULL) return ncclSuccess;
-  NCCLCHECK(ncclSocketClose(&state->listenSock));
-  NCCLCHECK(ncclSocketClose(&state->ringSendSocket));
-  NCCLCHECK(ncclSocketClose(&state->ringRecvSocket));
-  free(state->peerCommAddresses);
+  // when aborting we need to close the proxy here (maybe?)
   free(state->peerProxyAddresses);
   free(state->peerProxyAddressesUDS);
-  free(state);
+  NCCLCHECK(bootstrapClose(commState));
   return ncclSuccess;
 }
diff --git a/src/collectives.cc b/src/collectives.cc
index e21807e042..be9468d49b 100644
--- a/src/collectives.cc
+++ b/src/collectives.cc
@@ -59,6 +59,7 @@ const char* ncclAlgoToString(int algo) {
   case NCCL_ALGO_COLLNET_CHAIN: return "COLLNET_CHAIN";
   case NCCL_ALGO_NVLS: return "NVLS";
   case NCCL_ALGO_NVLS_TREE: return "NVLS_TREE";
+  case NCCL_ALGO_PAT: return "PAT";
   default: return "Unknown";
   }
 }
diff --git a/src/debug.cc b/src/debug.cc
index dde8e8fcb5..d21ea3d12e 100644
--- a/src/debug.cc
+++ b/src/debug.cc
@@ -19,7 +19,7 @@ static int pid = -1;
 static char hostname[1024];
 thread_local int ncclDebugNoWarn = 0;
 char ncclLastError[1024] = ""; // Global string for the last error in human readable form
-static uint64_t ncclDebugMask = NCCL_INIT|NCCL_ENV; // Default debug sub-system mask is INIT and ENV
+static uint64_t ncclDebugMask = NCCL_INIT | NCCL_BOOTSTRAP | NCCL_ENV; // Default debug sub-system mask is INIT, BOOTSTRAP and ENV
 FILE *ncclDebugFile = stdout;
 static pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
 static std::chrono::steady_clock::time_point ncclEpoch;
@@ -122,7 +122,7 @@ static void ncclDebugInit() {
     int c = 0;
     char debugFn[PATH_MAX+1] = "";
     char *dfn = debugFn;
-    while (ncclDebugFileEnv[c] != '\0' && c < PATH_MAX) {
+    while (ncclDebugFileEnv[c] != '\0' && (dfn - debugFn) < PATH_MAX) {
       if (ncclDebugFileEnv[c++] != '%') {
         *dfn++ = ncclDebugFileEnv[c-1];
         continue;
@@ -132,16 +132,24 @@ static void ncclDebugInit() {
           *dfn++ = '%';
           break;
         case 'h': // %h = hostname
-          dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
+          dfn += snprintf(dfn, PATH_MAX + 1 - (dfn - debugFn), "%s", hostname);
           break;
         case 'p': // %p = pid
-          dfn += snprintf(dfn, PATH_MAX, "%d", pid);
+          dfn += snprintf(dfn, PATH_MAX + 1 - (dfn - debugFn), "%d", pid);
           break;
         default: // Echo everything we don't understand
           *dfn++ = '%';
-          *dfn++ = ncclDebugFileEnv[c-1];
+          if ((dfn - debugFn) < PATH_MAX) {
+            *dfn++ = ncclDebugFileEnv[c-1];
+          }
           break;
       }
+      if ((dfn - debugFn) > PATH_MAX) {
+        // snprintf wanted to overfill the buffer: set dfn to the end
+        // of the buffer (for null char) and it will naturally exit
+        // the loop.
+        dfn = debugFn + PATH_MAX;
+      }
     }
     *dfn = '\0';
     if (debugFn[0] != '\0') {
@@ -181,9 +189,9 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
     tid = syscall(SYS_gettid);
   }
 
-  int cudaDev;
+  int cudaDev = 0;
   if (!(level == NCCL_LOG_TRACE && flags == NCCL_CALL)) {
-    cudaGetDevice(&cudaDev);
+    (void)cudaGetDevice(&cudaDev);
   }
 
   char buffer[1024];
@@ -207,11 +215,13 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
   va_start(vargs, fmt);
   len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
   va_end(vargs);
-  // vsnprintf may return len > sizeof(buffer) in the case of a truncated output.
+  // vsnprintf may return len >= sizeof(buffer) in the case of a truncated output.
   // Rewind len so that we can replace the final \0 by \n
-  if (len > sizeof(buffer)) len = sizeof(buffer)-1;
-  buffer[len++] = '\n';
-  if (len) fwrite(buffer, 1, len, ncclDebugFile);
+  if (len >= sizeof(buffer)) len = sizeof(buffer)-1;
+  if (len) {
+    buffer[len++] = '\n';
+    fwrite(buffer, 1, len, ncclDebugFile);
+  }
 }
 
 NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0);
diff --git a/src/device/all_gather.h b/src/device/all_gather.h
index 8fe2248484..fb56e483b6 100644
--- a/src/device/all_gather.h
+++ b/src/device/all_gather.h
@@ -23,8 +23,11 @@ namespace {
 
     T *inputBuf = (T*)work->sendbuff;
     T *outputBuf = (T*)work->recvbuff;
+    // Coverity reports that the callee treats &ring->next as an array.  However, due to the use of
+    // FanSymmetric<1>, only the first element is ever accessed, so it's fine.
+    // coverity[callee_ptr_arith:FALSE]
     Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0> prims
-      (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg);
+      (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work);
 
     for (size_t elemOffset = 0; elemOffset < partCount; elemOffset += chunkCount) {
       /////////////// begin AllGather steps ///////////////
@@ -46,7 +49,7 @@ namespace {
         rankDest = ringRanks[nranks-j];
         offset = dataOffset + rankDest * count;
 
-        prims.directRecvCopySend(offset, nelem);
+        prims.directRecvCopyDirectSend(offset, nelem);
       }
 
       // Make final copy from buffer to dest.
@@ -54,7 +57,7 @@ namespace {
       offset = dataOffset + rankDest * count;
 
       // Final wait/copy.
-      prims.directRecv(offset, nelem);
+      prims.directRecv(offset, offset, nelem);
     }
   }
 }
@@ -81,6 +84,31 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128
   }
 };
 
+template<typename T, typename RedOp>  // AllGather specialization for NCCL_ALGO_PAT with NCCL_PROTO_SIMPLE.
+struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE> {
+  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
+    using Proto = ProtoSimple<1, 1>;
+    const int nranks = ncclShmem.comm.nRanks;
+    const int rank = ncclShmem.comm.rank;
+    size_t count, channelOffset, channelCount, chunkCount;  // All four are filled in by ncclCollCbdPart on the next line.
+    ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &channelOffset, &channelCount, &chunkCount);
+
+    T *inputBuf = (T*)work->sendbuff;
+    T *outputBuf = (T*)work->recvbuff;
+    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims  // 3rd/4th ctor args are NULL here (the ring path passes &ring->prev / &ring->next there).
+      (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, false, false, 0, primsModePatAg);
+
+    PatAGAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);  // Drives the loop below through getNextOp().
+    int last = 0;  // Handed to getNextOp on every iteration; the loop exits once it comes back non-zero.
+    while (!last) {
+      int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem;  // Per-step parameters, uninitialized until getNextOp runs.
+      size_t inpIx, outIx;
+      patAlgo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend, last);  // Produces the next step's parameters and updates `last`.
+      prims.patCopy(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend);  // Same values forwarded unchanged (everything except `last`).
+    }
+  }
+};
+
 template<typename T, typename RedOp>
 struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
   __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
@@ -165,7 +193,7 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
     template<int SlicePerChunk, int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts>
     __device__ __forceinline__ void operator()(
         int tid, int tn, int slice, int maxSliceSize,
-        int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes
+        int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes, uint32_t sendDirectFlag, uint32_t recvDirectFlag
       ) {
       static_assert(SlicePerChunk==1, "require: SlicePerChunk==1");
       static_assert(MaxDsts<=1 || MaxSrcs<=1, "require: MaxDsts<=1 || MaxSrcs<=1");
@@ -203,19 +231,22 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
           int rank = ncclShmem.comm.collNetDenseToUserRank[node*nRails + rail];
           ssize_t userOneBeg = rank*sizePerRank + railOneOffset;
           int outIsDst = (inPlace && rank == ncclShmem.comm.rank) ? 0 : 1;
-          reduceCopy<ncclCollUnroll(), RedOp, T,
+          if (nSrcs != 0 && outIsDst+nDsts != 0) {
+            reduceCopy<ncclCollUnroll(), RedOp, T,
                      /*MultimemSrcs,MinSrcs,MaxSrcs=*/0,1,1,
                      /*MultimemDsts=*/0, 0+MinDsts, 1+MaxDsts,
                      /*PreOpSrcs=*/0>
             (tid, tn, 0, nullptr, false,
              /*nSrcs=*/1, [=]__device__(int s/*==0*/) -> void* {
-               return (char*)srcPtrs[src] + railAllOffset;
+               return work->regUsed && (recvDirectFlag & NCCL_DIRECT_READ) ? (char*)srcPtrs[src] + userOneBeg : (char*)srcPtrs[src] + railAllOffset;
              },
              /*nDsts=*/outIsDst+nDsts, [=]__device__(int d) -> void* {
                return d < outIsDst ? outbuf + userOneBeg
+                                   : work->regUsed && (sendDirectFlag & NCCL_DIRECT_WRITE) ? (char*)dstPtrs[d-outIsDst] + userOneBeg
                                    : (char*)dstPtrs[d-outIsDst] + railAllOffset;
              },
              delta);
+          }
           railAllOffset += delta;
           node += 1;
         }
@@ -281,15 +312,15 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
         __syncwarp();
       } else {
         // Phase 2: Recv network -> deposit output + send to bcast
-        Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
-          prims(tid, tn, &direct->out, direct->heads + 1, nullptr, nullptr,
-            /*redOpArg=*/0, 1 * Proto::MaxGroupWidth, 0, 0);
+        Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
+          prims(tid, tn, &direct->out, direct->heads + 1, nullptr, work->recvbuff,
+            /*redOpArg=*/0, 1 * Proto::MaxGroupWidth, 0, 0, work);
         for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
           Scatterer</*BcastSendNotRecv=*/true> scat;
           scat.work = work;
           scat.chunkSize = chunkSize;
           scat.railGridOffset = railGridOffset;
-          prims.template process</*Recv=*/1, /*Send=*/1>(scat);
+          prims.template process</*Recv=*/1, /*Send=*/1>(scat, work->direct, 0);
         }
       }
       return;
@@ -299,15 +330,15 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
     tn = nWarps3*WARP_SIZE;
     if (tid < tn) {
       // Phase 3: Recv bcast -> deposit output
-      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/0, Proto, 0>
-        prims(tid, tn, direct->heads+1, nullptr, nullptr, nullptr,
-              /*redOpArg=*/0, 2*Proto::MaxGroupWidth, 0, 0);
+      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/1, Proto, 0>
+        prims(tid, tn, direct->heads+1, nullptr, nullptr, work->recvbuff,
+              /*redOpArg=*/0, 2*Proto::MaxGroupWidth, 0, 0, work);
       for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
         Scatterer</*BcastSendNotRecv=*/false> scat;
         scat.work = work;
         scat.chunkSize = chunkSize;
         scat.railGridOffset = railGridOffset;
-        prims.template process</*Recv=*/1, /*Send=*/0>(scat);
+        prims.template process</*Recv=*/1, /*Send=*/0>(scat, 0, work->direct);
       }
       return;
     }
diff --git a/src/device/all_reduce.h b/src/device/all_reduce.h
index 293138f4d7..36b8d32066 100644
--- a/src/device/all_reduce.h
+++ b/src/device/all_reduce.h
@@ -23,8 +23,11 @@ namespace {
     int nelem;
     int chunk;
 
+    // Coverity reports that the callee treats &ring->next as an array.  However, due to the use of
+    // FanSymmetric<1>, only the first element is ever accessed, so it's fine.
+    // coverity[callee_ptr_arith:FALSE]
     Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0> prims
-      (tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg);
+      (tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg, 0, 0, 0, work);
 
     for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
       ssize_t remCount = channelCount - elemOffset;
@@ -41,7 +44,7 @@ namespace {
       chunkOffset = chunk * chunkCount;
       offset = gridOffset + elemOffset + chunkOffset;
       nelem = (int)min(chunkCount, remCount - chunkOffset);
-      prims.send(offset, nelem);
+      prims.directSend(offset, offset, nelem);
 
       // k-2 steps: reduce and copy to next GPU
       for (int j = 2; j < nranks; ++j) {
@@ -49,7 +52,7 @@ namespace {
         chunkOffset = chunk * chunkCount;
         offset = gridOffset + elemOffset + chunkOffset;
         nelem = (int)min(chunkCount, remCount - chunkOffset);
-        prims.recvReduceSend(offset, nelem);
+        prims.directRecvReduceDirectSend(offset, offset, nelem);
       }
 
       // step k-1: reduce this buffer and data, which will produce the final
@@ -58,7 +61,7 @@ namespace {
       chunkOffset = chunk * chunkCount;
       offset = gridOffset + elemOffset + chunkOffset;
       nelem = (int)min(chunkCount, remCount - chunkOffset);
-      prims.directRecvReduceCopySend(offset, offset, nelem, /*postOp=*/true);
+      prims.directRecvReduceCopyDirectSend(offset, offset, nelem, /*postOp=*/true);
 
       // k-2 steps: copy to next GPU
       for (int j = 1; j < nranks - 1; ++j) {
@@ -66,7 +69,7 @@ namespace {
         chunkOffset = chunk * chunkCount;
         offset = gridOffset + elemOffset + chunkOffset;
         nelem = (int)min(chunkCount, remCount - chunkOffset);
-        prims.directRecvCopySend(offset, nelem);
+        prims.directRecvCopyDirectSend(offset, nelem);
       }
 
       // Make final copy from buffer to dest.
@@ -75,7 +78,7 @@ namespace {
       offset = gridOffset + elemOffset + chunkOffset;
       nelem = (int)min(chunkCount, remCount - chunkOffset);
 
-      prims.directRecv(offset, nelem);
+      prims.directRecv(offset, offset, nelem);
     }
   }
 
@@ -90,34 +93,34 @@ namespace {
     int nelem;
 
     { // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
-      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_TREE_ARITY, 1>, /*Direct=*/0, Proto, 0> prims
-        (tid, nthreads, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg);
+      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_TREE_ARITY, 1>, /*Direct=*/1, Proto, 0> prims
+        (tid, nthreads, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg, 0, 0, 0, work);
       if (tree->up == -1) {
         for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
           offset = gridOffset + elemOffset;
           nelem = min(chunkCount, channelCount - elemOffset);
-          prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true);
+          prims.directRecvReduceCopy(offset, offset, nelem, /*postOp=*/true);
         }
       }
       else if (tree->down[0] == -1) {
         for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
           offset = gridOffset + elemOffset;
           nelem = min(chunkCount, channelCount - elemOffset);
-          prims.send(offset, nelem);
+          prims.directSend(offset, offset, nelem);  // direct* variant takes both an input and an output offset, as at every other directSend call site in this patch
         }
       }
       else {
         for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
           offset = gridOffset + elemOffset;
           nelem = min(chunkCount, channelCount - elemOffset);
-          prims.recvReduceSend(offset, nelem);
+          prims.directRecvReduceDirectSend(offset, offset, nelem);
         }
       }
     }
 
     { // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
       Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_TREE_ARITY>, /*Direct=*/1, Proto, 0> prims
-        (tid, nthreads, &tree->up, tree->down, work->sendbuff, work->recvbuff, work->redOpArg);
+        (tid, nthreads, &tree->up, tree->down, work->sendbuff, work->recvbuff, work->redOpArg, 0, 0, 0, work);
       if (tree->up == -1) {
         for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
           offset = gridOffset + elemOffset;
@@ -129,14 +132,14 @@ namespace {
         for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
           offset = gridOffset + elemOffset;
           nelem = min(chunkCount, channelCount - elemOffset);
-          prims.directRecv(offset, nelem);
+          prims.directRecv(offset, offset, nelem);
         }
       }
       else {
         for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
           offset = gridOffset + elemOffset;
           nelem = min(chunkCount, channelCount - elemOffset);
-          prims.directRecvCopySend(offset, nelem);
+          prims.directRecvCopyDirectSend(offset, nelem);
         }
       }
     }
@@ -164,11 +167,11 @@ namespace {
     if (tree->up == -1) {
       // Reduce and broadcast. Max number of recv is 2, max number of send is 2
       Primitives<T, RedOp, FanSymmetric<NCCL_MAX_TREE_ARITY_TOP>, /*Direct=*/1, Proto, 0>
-        prims(tid, nthreads, tree->down, tree->down, work->sendbuff, work->recvbuff, work->redOpArg);
+        prims(tid, nthreads, tree->down, tree->down, work->sendbuff, work->recvbuff, work->redOpArg, 0, 0, 0, work);
       for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
         offset = gridOffset + elemOffset;
         nelem = min(chunkCount, channelCount - elemOffset);
-        prims.directRecvReduceCopySend(offset, offset, nelem, /*doPost=*/true);
+        prims.directRecvReduceCopyDirectSend(offset, offset, nelem, /*doPost=*/true);
       }
     }
     else if (tid < nthreadsSplit) {
@@ -180,40 +183,46 @@ namespace {
        * into DirectRecv and DirectSend capabilities, this ctor would have both=0,
        * but the ctor above for tree roots would be DirectRecv=0 DirectSend=1.
        */
+      // Coverity reports that the callee treats &tree->up as an array.  However, due to the use of
+      // FanAsymmetric<n, 1>, only the first element is ever accessed, so it's fine.
+      // coverity[callee_ptr_arith:FALSE]
       Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_TREE_ARITY, 1>, /*Direct=*/1, Proto, 0>
-        prims(tid, nthreadsSplit, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg, 0*Proto::MaxGroupWidth);
+        prims(tid, nthreadsSplit, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, work);
       if (tree->down[0] == -1) {
         for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
           offset = gridOffset + elemOffset;
           nelem = min(chunkCount, channelCount - elemOffset);
-          prims.send(offset, nelem);
+          prims.directSend(offset, offset, nelem);
         }
       }
       else {
         for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
           offset = gridOffset + elemOffset;
           nelem = min(chunkCount, channelCount - elemOffset);
-          prims.recvReduceSend(offset, nelem);
+          prims.directRecvReduceDirectSend(offset, offset, nelem);
         }
       }
     }
     else {
       // Broadcast down. Max number of recv is 1, max number of send is 3 (binary tree + local)
+      // Coverity reports that the callee treats &tree->up as an array.  However, due to the use of
+      // FanAsymmetric<1, n>, only the first element is ever accessed, so it's fine.
+      // coverity[callee_ptr_arith:FALSE]
       Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_TREE_ARITY>, /*Direct=*/1, Proto, 0>
         prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, work->sendbuff, work->recvbuff,
-            work->redOpArg, 1*Proto::MaxGroupWidth);
+            work->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, work);
       if (tree->down[0] == -1) {
         for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
           offset = gridOffset + elemOffset;
           nelem = min(chunkCount, channelCount - elemOffset);
-          prims.directRecv(offset, nelem);
+          prims.directRecv(offset, offset, nelem);
         }
       }
       else {
         for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
           offset = gridOffset + elemOffset;
           nelem = min(chunkCount, channelCount - elemOffset);
-          prims.directRecvCopySend(offset, nelem);
+          prims.directRecvCopyDirectSend(offset, nelem);
         }
       }
     }
@@ -264,9 +273,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
 
     if (tid >= tidStartScatter && tid < tidStartReduce && hasUp) {
       // Scatter
-      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
+      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
         prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, work->sendbuff, work->recvbuff,
-           work->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, work);
+           work->redOpArg, 2*Proto::MaxGroupWidth, 1, 1);
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
         ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
         int nelem = min(direct->nHeads*chunkSize, size-offset);
@@ -276,12 +285,15 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
           prims.scatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
         }
       }
+      // Coverity complains about a possible overrun inside the destructor of "prims", but that's actually
+      // a false positive.
+      // coverity[overrun-call:FALSE]
     } else if (tid >= tidStartReduce && direct->out != -1) {
       if (hasDn) {
         // Reduce, send to network
-        Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/1, Proto, 0>
+        Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
           prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, work->sendbuff, work->recvbuff,
-             work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, work);
+             work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
           ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
           int nelem = min(chunkSize, size-offset);
@@ -323,6 +335,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
     } else if (tid >= tidStartBcast && tid < tidStartScatter && direct->out != -1) {
       if (hasDn) {
         // Recv from network, broadcast
+        // Coverity complains about a possible overrun inside the class below, but that's actually
+        // a false positive.
+        // coverity[identity_transfer:FALSE]
         Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
           prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, work->sendbuff, work->recvbuff,
              work->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, work);
@@ -382,7 +397,7 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
       ssize_t offset;
       int nelem;
       int remCount = channelCount%(nvls->nHeads*chunkSize);
-      int lastChunkSize = alignUp(divUp(remCount, nvls->nHeads), 16/sizeof(T));
+      int lastChunkSize = alignUp(divUp(remCount, nvls->nHeads), 16384/sizeof(T));
 
       if (tid < tidEndScatter) {
         // Scatter
@@ -456,6 +471,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
         if (!hasOut) {
           // Reduce, broadcast through NVLS
           using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
+          // Coverity complains about a possible overrun inside the class below, but that's actually
+          // a false positive.
+          // coverity[identity_transfer:FALSE]
           Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
             prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
               work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work);
@@ -467,6 +485,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
         } else {
           // Reduce, send to network
           using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
+          // Coverity complains about a possible overrun inside the class below, but that's actually
+          // a false positive.
+          // coverity[identity_transfer:FALSE]
           Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
             prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL,
               work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, work);
@@ -479,6 +500,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
       } else if (tid < tidEndBcast && nvls->headRank != -1) {
         // Recv from network, broadcast
         using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
+        // Coverity complains about a possible overrun inside the class below, but that's actually
+        // a false positive.
+        // coverity[identity_transfer:FALSE]
         Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
           prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL,
             work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work);
@@ -564,6 +588,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PROTO_
       } else {
         // Reduce, send to network
         using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
+        // Coverity reports that the callee treats &treeUp as an array.  However, due to the use of
+        // FanAsymmetric<3, 1>, only the first element is ever accessed, so it's fine.
+        // coverity[callee_ptr_arith:FALSE]
         Primitives<T, RedOp, FanAsymmetric<3, 1>, /*Direct=*/1, Proto, 0>
           prims(tid - tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL,
             work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work);
@@ -579,6 +606,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PROTO_
     } else if (tid < tidEndBcast && nvls->headRank != -1) {
       // Recv from network, broadcast
       using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
+      // Coverity reports that the callee treats &treeUp as an array.  However, due to the use of
+      // FanAsymmetric<1, 3>, only the first element is ever accessed, so it's fine.
+      // coverity[callee_ptr_arith:FALSE]
       Primitives<T, RedOp, FanAsymmetric<1, 3>, /*Direct=*/1, Proto, 0>
         prims(tid - tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL,
           work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work);
@@ -639,21 +669,21 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PR
         } else {
           Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
             prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
-              work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
+              work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex, work);
           for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
             ssize_t offset = gridOffset + bid * int(chunkSize);
             int nelem = min(chunkSize, size - offset);
-            prims.send(offset, nelem);
+            prims.directSend(offset, offset, nelem);
           }
         }
       } else {
         Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
           prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
-            work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
+            work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex, work);
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
           ssize_t offset = gridOffset + bid * int(chunkSize);
           int nelem = min(chunkSize, size - offset);
-          prims.recvReduceSend(offset, nelem);
+          prims.directRecvReduceDirectSend(offset, offset, nelem);
         }
       }
     }
@@ -668,40 +698,49 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PR
             }
             __syncwarp();
           } else {
+            // Coverity reports that the callee treats &send as an array.  However, due to the use of
+            // FanSymmetric<1>, only the first element is ever accessed, so it's fine.
+            // coverity[callee_ptr_arith:FALSE]
             Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
               prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
-                work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
+                work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex, work);
             for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
               ssize_t offset = gridOffset + bid * int(chunkSize);
               int nelem = min(chunkSize, size - offset);
-              prims.recv(offset, nelem, /*postOp*/true);
+              prims.directRecv(offset, offset, nelem, /*postOp*/true);
             }
           }
         } else {
+          // Coverity reports that the callee treats &send as an array.  However, due to the use of
+          // FanSymmetric<1>, only the first element is ever accessed, so it's fine.
+          // coverity[callee_ptr_arith:FALSE]
           Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
             prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
-              work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
+              work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex, work);
           for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
             ssize_t offset = gridOffset + bid * int(chunkSize);
             int nelem = min(chunkSize, size - offset);
-            prims.recvCopyDirectSend(offset, nelem, /*postOp*/true);
+            prims.directRecvCopyDirectSend(offset, nelem, /*postOp*/true);
           }
         }
       } else {
+        // Coverity reports that the callee treats &send as an array.  However, due to the use of
+        // FanSymmetric<1>, only the first element is ever accessed, so it's fine.
+        // coverity[callee_ptr_arith:FALSE]
         Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
           prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
-            work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
+            work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex, work);
         if (send == -1) {
           for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
             ssize_t offset = gridOffset + bid*int(chunkSize);
             int nelem = min(chunkSize, size-offset);
-            prims.directRecv(offset, nelem);
+            prims.directRecv(offset, offset, nelem);
           }
         } else {
           for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
             ssize_t offset = gridOffset + bid*int(chunkSize);
             int nelem = min(chunkSize, size-offset);
-            prims.directRecvCopySend(offset, nelem);
+            prims.directRecvCopyDirectSend(offset, nelem);
           }
         }
       }
diff --git a/src/device/broadcast.h b/src/device/broadcast.h
index 7026adc3d3..851b01d948 100644
--- a/src/device/broadcast.h
+++ b/src/device/broadcast.h
@@ -24,8 +24,11 @@ namespace {
 
     T *inputBuf = (T*)work->sendbuff;
     T *outputBuf = (T*)work->recvbuff;
-    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
-      prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg);
+    // Coverity reports that the callee treats &ring->next as an array.  However, due to the use of
+    // FanSymmetric<1>, only the first element is ever accessed, so it's fine.
+    // coverity[callee_ptr_arith:FALSE]
+    Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0>
+      prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work);
 
     for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
       offset = gridOffset + elemOffset;
@@ -33,14 +36,14 @@ namespace {
 
       if (rank == root) {
         if (inputBuf == outputBuf) {
-          prims.send(offset, nelem);
+          prims.directSend(offset, offset, nelem);
         } else {
-          prims.copySend(offset, offset, nelem);
+          prims.directCopySend(offset, offset, nelem);
         }
       } else if (nextRank == root) {
-        prims.recv(offset, nelem);
+        prims.directRecv(offset, offset, nelem);
       } else {
-        prims.recvCopySend(offset, nelem);
+        prims.directRecvCopyDirectSend(offset, nelem);
       }
     }
   }
diff --git a/src/device/common.h b/src/device/common.h
index 5fa7be9ce2..967421b7dc 100644
--- a/src/device/common.h
+++ b/src/device/common.h
@@ -97,7 +97,7 @@ __device__ inline void barrier_sync_aligned(int name, int nThreads) {
 
 __device__ inline bool barrier_red_or(bool vote, int name) {
   int ans;
-  asm("{ .reg .pred p;"
+  asm volatile("{ .reg .pred p;"
       "  setp.ne.s32 p, %1, 0;"
       "  barrier.red.or.pred p, %2, p; "
       "  selp.s32 %0, 1, 0, p; }"
@@ -106,7 +106,7 @@ __device__ inline bool barrier_red_or(bool vote, int name) {
 }
 __device__ inline bool barrier_red_or(bool vote, int name, int nThreads) {
   int ans;
-  asm("{ .reg .pred p;"
+  asm volatile("{ .reg .pred p;"
       "  setp.ne.s32 p, %1, 0;"
       "  barrier.red.or.pred p, %2, %3, p; "
       "  selp.s32 %0, 1, 0, p; }"
@@ -115,7 +115,7 @@ __device__ inline bool barrier_red_or(bool vote, int name, int nThreads) {
 }
 __device__ inline bool barrier_red_or_aligned(bool vote, int name) {
   int ans;
-  asm("{ .reg .pred p;"
+  asm volatile("{ .reg .pred p;"
       "  setp.ne.s32 p, %1, 0;"
       "  barrier.red.or.pred.aligned p, %2, p; "
       "  selp.s32 %0, 1, 0, p; }"
@@ -137,9 +137,9 @@ inline __device__ void copyToShmem16(int tid, void* dst, void const* src, int by
   int offset = 16*tid;
   if (offset < bytes) {
     uint64_t a=0, b=0;
-    asm("ld.v2.u64 {%0,%1},[%2];" : "=l"(a),"=l"(b) : "l"((char const*)src + offset));
+    asm volatile("ld.v2.u64 {%0,%1},[%2];" : "=l"(a),"=l"(b) : "l"((char const*)src + offset) : "memory");
     uint32_t udst = (uint32_t)__cvta_generic_to_shared(dst);
-    asm volatile("st.shared.v2.u64 [%0],{%1,%2};" :: "r"(udst + offset), "l"(a), "l"(b));
+    asm volatile("st.shared.v2.u64 [%0],{%1,%2};" :: "r"(udst + offset), "l"(a), "l"(b) : "memory");
   }
 }
 
@@ -300,6 +300,9 @@ struct RunWorkBatch {
         if (work->nWarps != workPrev->nWarps) __syncthreads();
       }
       int subtn = work->nWarps*WARP_SIZE;
+      // Coverity reports a possible thread divergence due to not all threads participating in the collective.
+      // However, the code ensures that the participation is on a per-warp basis.
+      // coverity[device_thread_diverged:FALSE]
       if (tid < subtn) RunWorkColl<Fn, T, RedOp, Algo, Proto>().run(tid, subtn, work);
     }
   }
@@ -348,6 +351,9 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
   default:
     { int subtid = tid - 2*WARP_SIZE;
       int subtn = tn - 2*WARP_SIZE;
+      // Coverity reports a possible thread divergence due to not all threads participating in the collective.
+      // However, the code ensures that the participation is on a per-warp basis.
+      // coverity[device_thread_diverged:FALSE]
       loadWorkBatchToShmem(subtid, subtn, args, /*batchIx=*/blockIdx.x);
     } break;
   }
diff --git a/src/device/common_kernel.h b/src/device/common_kernel.h
index e82c94714e..f932f51f00 100644
--- a/src/device/common_kernel.h
+++ b/src/device/common_kernel.h
@@ -69,6 +69,8 @@ __device__ __forceinline__ void reduceCopyPacks(
     minSrcs[s] = cvta_to_global(srcPtrFn(s)) + threadBytesBehind;
   #pragma unroll
   for (int d=0; d < MinDsts; d++)
+    // Yes, for some template arguments this code will be unreachable.  That's fine.
+    // coverity[dead_error_line]
     minDsts[d] = cvta_to_global(dstPtrFn(d)) + threadBytesBehind;
 
   // We dictate loop termination condition according to whether partial hunks
@@ -93,13 +95,17 @@ __device__ __forceinline__ void reduceCopyPacks(
 
     #pragma unroll (MinSrcs-1 + !(MinSrcs-1))
     for (int s=1; s < MinSrcs; s++) {
+      // Yes, for some template arguments this code will be unreachable.  That's fine.
+      // coverity[dead_error_begin]
       BytePack<BytePerPack> tmp[Unroll];
+      // coverity[dead_error_line]
       RedFn preFn(s < PreOpSrcs ? preOpArgs[s] : 0);
       #pragma unroll Unroll
       for (int u=0; u < Unroll; u++) {
         if (s < MultimemSrcs) {
           // applyLoadMultimem uses relaxed semantics for same reason we use volatile below.
-          acc[u] = applyLoadMultimem<RedFn, BytePerPack>(redFn, minSrcs[s]);
+          // coverity[dead_error_line]
+          tmp[u] = applyLoadMultimem<RedFn, BytePerPack>(redFn, minSrcs[s]);
         } else {
           // Use volatile loads in case credits are polled for with volatile (instead of acquire).
           tmp[u] = ld_volatile_global<BytePerPack>(minSrcs[s]);
@@ -108,6 +114,7 @@ __device__ __forceinline__ void reduceCopyPacks(
       }
       #pragma unroll Unroll
       for (int u=0; u < Unroll; u++) {
+        // coverity[dead_error_line]
         if (s < PreOpSrcs) tmp[u] = applyPreOp(preFn, tmp[u]);
         acc[u] = applyReduce(redFn, acc[u], tmp[u]);
       }
@@ -116,6 +123,8 @@ __device__ __forceinline__ void reduceCopyPacks(
     for (int s=MinSrcs; (MinSrcs < MaxSrcs) && (s < MaxSrcs) && (s < nSrcs); s++) {
       uintptr_t src = cvta_to_global(srcPtrFn(s)) + threadBytesBehind;
       BytePack<BytePerPack> tmp[Unroll];
+      // Yes, for some template arguments this code will be unreachable.  That's fine.
+      // coverity[dead_error_line]
       RedFn preFn(s < PreOpSrcs ? preOpArgs[s] : 0);
       #pragma unroll Unroll
       for (int u=0; u < Unroll; u++) {
@@ -125,6 +134,8 @@ __device__ __forceinline__ void reduceCopyPacks(
       }
       #pragma unroll Unroll
       for (int u=0; u < Unroll; u++) {
+        // Yes, for some template arguments this code will be unreachable.  That's fine.
+        // coverity[dead_error_line]
         if (s < PreOpSrcs) tmp[u] = applyPreOp(preFn, tmp[u]);
         acc[u] = applyReduce(redFn, acc[u], tmp[u]);
       }
@@ -139,7 +150,10 @@ __device__ __forceinline__ void reduceCopyPacks(
     #pragma unroll (MinDsts + !MinDsts)
     for (int d=0; d < MinDsts; d++) {
       #pragma unroll Unroll
+      // Yes, for some template arguments this code will be unreachable.  That's fine.
+      // coverity[dead_error_begin]
       for (int u=0; u < Unroll; u++) {
+        // coverity[dead_error_condition]
         if (d < MultimemDsts) {
           multimem_st_global(minDsts[d], acc[u]);
         } else {
@@ -161,6 +175,8 @@ __device__ __forceinline__ void reduceCopyPacks(
     #pragma unroll
     for (int s=0; s < MinSrcs; s++) minSrcs[s] += (nWarps-1)*BytePerHunk;
     #pragma unroll
+    // Yes, for some template arguments this code will be unreachable.  That's fine.
+    // coverity[dead_error_line]
     for (int d=0; d < MinDsts; d++) minDsts[d] += (nWarps-1)*BytePerHunk;
     threadBytesBehind += nWarps*BytePerHunk;
     threadBytesAhead -= nWarps*BytePerHunk;
diff --git a/src/device/generate.py b/src/device/generate.py
index d0feee10fe..a0d2259466 100755
--- a/src/device/generate.py
+++ b/src/device/generate.py
@@ -7,7 +7,7 @@ all_colls =  ["Broadcast","Reduce","AllGather","ReduceScatter","AllReduce","Send
 all_redops = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"]
 all_tys =    ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16"]
 all_protos = ["LL","LL128","SIMPLE"]
-all_algos =  ["TREE","RING","COLLNET_DIRECT","COLLNET_CHAIN","NVLS","NVLS_TREE"]
+all_algos =  ["TREE","RING","COLLNET_DIRECT","COLLNET_CHAIN","NVLS","NVLS_TREE","PAT"]
 
 ################################################################################
 # The first command line argument is the path to the directory to generate and
@@ -74,11 +74,11 @@ else:
 ################################################################################
 
 algos_of_coll = {
-  "AllGather":     ["RING","COLLNET_DIRECT","NVLS"],
-  "AllReduce":     all_algos,
+  "AllGather":     ["RING","COLLNET_DIRECT","NVLS","PAT"],
+  "AllReduce":     ["TREE","RING","COLLNET_DIRECT","COLLNET_CHAIN","NVLS","NVLS_TREE"],
   "Broadcast":     ["RING"],
   "Reduce":        ["RING"],
-  "ReduceScatter": ["RING","COLLNET_DIRECT","NVLS"],
+  "ReduceScatter": ["RING","COLLNET_DIRECT","NVLS","PAT"],
   "SendRecv":      [None]
 }
 
@@ -253,6 +253,9 @@ with open(os.path.join(gensrc, "host_table.cc"), "w") as f:
     cudart, _ = required_cuda(*kfn)
     sym = paste("_", "ncclDevKernel", *kfn)
     if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart)
+    # __global__ below gets removed by the host compiler, which results in
+    # Coverity diagnosing a specifier inconsistency.
+    out("// coverity[declaration]\n")
     out("__global__ void %s(ncclDevKernelArgs4K const);\n" % sym)
     if cudart != 0: out("#endif\n")
   out("\n")
diff --git a/src/device/network/unpack/unpack.h b/src/device/network/unpack/unpack.h
index b213fbe39b..e76099821c 100644
--- a/src/device/network/unpack/unpack.h
+++ b/src/device/network/unpack/unpack.h
@@ -19,10 +19,10 @@
 inline __device__ void load64gpu(const uint64_t* ptr, uint64_t &v) {
   #if __CUDA_ARCH__ >= 700
       asm volatile("ld.relaxed.gpu.u64 {%0}, [%1];"
-      : "=l"(v) : "l"(ptr));
+      : "=l"(v) : "l"(ptr) : "memory");
   #else
       asm volatile("ld.volatile.global.u64 {%0}, [%1];"
-      : "=l"(v) : "l"(ptr));
+      : "=l"(v) : "l"(ptr) : "memory");
   #endif
 }
 
@@ -226,6 +226,8 @@ inline __device__ void ncclNetDeviceUnpackInner(
 
   int PPW = ppw(nbytes, nw);
 
+  // Coverity reports a potential overflow but in reality PPW is tiny so there's no need to store it in a uint64_t.
+  // coverity[overflow_before_widen]
   for (uint64_t meta_s = w * PPW; meta_s < meta_cnt; meta_s += nw * PPW) {
 
     uint64_t iter_meta_cnt = meta_cnt - meta_s;
diff --git a/src/device/op128.h b/src/device/op128.h
index b2f8227b05..b2e519d8c8 100644
--- a/src/device/op128.h
+++ b/src/device/op128.h
@@ -11,28 +11,28 @@
 
 inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
   asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];"
-      : "=l"(v0), "=l"(v1) : "l"(ptr));
+      : "=l"(v0), "=l"(v1) : "l"(ptr) : "memory");
 }
 
 inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) {
   asm volatile("st.volatile.global.v2.u64 [%2], {%0,%1};"
-      :: "l"(v0), "l"(v1), "l"(ptr));
+      :: "l"(v0), "l"(v1), "l"(ptr) : "memory");
 }
 
 inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) {
   uint64_t* shmemAsmPtr;
-  asm volatile("cvta.to.shared.u64 %0, %1;" : "=l"(shmemAsmPtr) : "l"(shmemGenericPtr));
+  asm volatile("cvta.to.shared.u64 %0, %1;" : "=l"(shmemAsmPtr) : "l"(shmemGenericPtr) : "memory");
   return shmemAsmPtr;
 }
 
 inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) {
   asm volatile("ld.volatile.shared.v2.u64 {%0,%1}, [%2];"
-      : "=l"(v0), "=l"(v1) : "l"(shmemAsmPtr));
+      : "=l"(v0), "=l"(v1) : "l"(shmemAsmPtr) : "memory");
 }
 
 inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) {
   asm volatile("st.volatile.shared.v2.u64 [%2], {%0,%1};"
-      :: "l"(v0), "l"(v1), "l"(shmemAsmPtr));
+      :: "l"(v0), "l"(v1), "l"(shmemAsmPtr) : "memory");
 }
 
 template<typename T>
@@ -48,20 +48,20 @@ inline __device__ void loadShmemMisaligned128(T *ptr, uint64_t &v0, uint64_t &v1
       // Produce 4 bytes of sub-register type by reading 2 4-byte
       // aligned values and shifting.
       uint32_t lo, hi;
-      asm("ld.shared.b32 %0,[%1];" : "=r"(lo) : "l"(ptr4+e+0));
-      asm("ld.shared.b32 %0,[%1];" : "=r"(hi) : "l"(ptr4+e+1));
+      asm volatile("ld.shared.b32 %0,[%1];" : "=r"(lo) : "l"(ptr4+e+0) : "memory");
+      asm volatile("ld.shared.b32 %0,[%1];" : "=r"(hi) : "l"(ptr4+e+1) : "memory");
       tmp4[e] = __funnelshift_r(lo, hi, 8*(int(reinterpret_cast<uintptr_t>(ptr))%4));
     }
   }
   else if(sizeof(T) == 4) {
     #pragma unroll
     for(int e=0; e < 4; e++)
-      asm("ld.shared.b32 %0,[%1];" : "=r"(tmp4[e]) : "l"(ptr+e));
+      asm volatile("ld.shared.b32 %0,[%1];" : "=r"(tmp4[e]) : "l"(ptr+e) : "memory");
   }
   else /*sizeof(T)==8*/ {
     #pragma unroll
     for(int e=0; e < 2; e++)
-      asm("ld.shared.b64 %0,[%1];" : "=l"(tmp8[e]) : "l"(ptr+e));
+      asm volatile("ld.shared.b64 %0,[%1];" : "=l"(tmp8[e]) : "l"(ptr+e) : "memory");
   }
   v0 = tmp8[0];
   v1 = tmp8[1];
@@ -146,6 +146,9 @@ struct BytePackOf<BytePack<0>> {
 template<typename T>
 __device__ __forceinline__ typename BytePackOf<T>::Pack toPack(T value)  {
   union { typename BytePackOf<T>::Pack p; T v; };
+  // Coverity recommends the use of std::move here but, given that T is a POD
+  // scalar, a plain copy will be just as efficient.
+  // coverity[copy_assignment_call]
   v = value;
   return p;
 }
@@ -183,7 +186,7 @@ template<> __device__ __forceinline__ void st_relaxed_gpu_global<0>(uintptr_t ad
   template<> \
   __device__ __forceinline__ BytePack<bytes> ld_##space<bytes>(addr_cxx_ty addr) { \
     data_cxx_ty tmp; \
-    asm("ld." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr)); \
+    asm volatile("ld." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr) : "memory"); \
     BytePack<bytes> ans; \
     ans.native = tmp; \
     return ans; \
@@ -191,7 +194,7 @@ template<> __device__ __forceinline__ void st_relaxed_gpu_global<0>(uintptr_t ad
   template<> \
   __device__ __forceinline__ BytePack<bytes> ld_volatile_##space<bytes>(addr_cxx_ty addr) { \
     data_cxx_ty tmp; \
-    asm("ld.volatile." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr)); \
+    asm volatile("ld.volatile." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr) : "memory"); \
     BytePack<bytes> ans; \
     ans.native = tmp; \
     return ans; \
@@ -212,7 +215,7 @@ template<> __device__ __forceinline__ void st_relaxed_gpu_global<0>(uintptr_t ad
   template<> \
   __device__ __forceinline__ BytePack<bytes> ld_relaxed_gpu_global<bytes>(uintptr_t addr) { \
     data_cxx_ty tmp; \
-    asm("ld." PTX_relaxed_gpu ".global." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : "l"(addr)); \
+    asm volatile("ld." PTX_relaxed_gpu ".global." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : "l"(addr) : "memory"); \
     BytePack<bytes> ans; \
     ans.native = tmp; \
     return ans; \
@@ -242,18 +245,18 @@ DEFINE_ld_st__size(8, uint64_t, b64, l)
   template<> \
   __device__ __forceinline__ BytePack<16> ld_##space<16>(addr_cxx_ty addr) { \
     BytePack<16> ans; \
-    asm("ld." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr)); \
+    asm volatile("ld." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr) : "memory"); \
     return ans; \
   } \
   template<> \
   __device__ __forceinline__ BytePack<16> ld_volatile_##space<16>(addr_cxx_ty addr) { \
     BytePack<16> ans; \
-    asm("ld.volatile." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr)); \
+    asm volatile("ld.volatile." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr) : "memory"); \
     return ans; \
   } \
   template<> \
   __device__ __forceinline__ void st_##space<16>(addr_cxx_ty addr, BytePack<16> value) { \
-    asm("st." #space ".v2.b64 [%0], {%1,%2};" :: #addr_reg_ty(addr), "l"(value.u64[0]), "l"(value.u64[1]) : "memory"); \
+    asm volatile("st." #space ".v2.b64 [%0], {%1,%2};" :: #addr_reg_ty(addr), "l"(value.u64[0]), "l"(value.u64[1]) : "memory"); \
   }
 DEFINE_ld_st_16__space(global, uintptr_t, l)
 DEFINE_ld_st_16__space(shared, uint32_t, r)
@@ -262,7 +265,7 @@ DEFINE_ld_st_16__space(shared, uint32_t, r)
 template<>
 __device__ __forceinline__ BytePack<16> ld_relaxed_gpu_global<16>(uintptr_t addr) {
   BytePack<16> ans;
-  asm("ld." PTX_relaxed_gpu ".global.v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : "l"(addr));
+  asm volatile("ld." PTX_relaxed_gpu ".global.v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : "l"(addr) : "memory");
   return ans;
 }
 template<>
@@ -277,33 +280,33 @@ __device__ __forceinline__ void st_relaxed_gpu_global<16>(uintptr_t addr, BytePa
 
 __device__ __forceinline__ uint64_t ld_volatile_global(uint64_t *ptr) {
   uint64_t ans;
-  asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
+  asm volatile("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
   return ans;
 }
 __device__ __forceinline__ uint64_t ld_relaxed_sys_global(uint64_t *ptr) {
   uint64_t ans;
   #if __CUDA_ARCH__ >= 700
-    asm("ld.relaxed.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
+    asm volatile("ld.relaxed.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
   #else
-    asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
+    asm volatile("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
   #endif
   return ans;
 }
 __device__ __forceinline__ uint64_t ld_relaxed_gpu_global(uint64_t *ptr) {
   uint64_t ans;
   #if __CUDA_ARCH__ >= 700
-    asm("ld.relaxed.gpu.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
+    asm volatile("ld.relaxed.gpu.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
   #else
-    asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
+    asm volatile("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
   #endif
   return ans;
 }
 __device__ __forceinline__ uint64_t ld_acquire_sys_global(uint64_t *ptr) {
   uint64_t ans;
   #if __CUDA_ARCH__ >= 700
-    asm("ld.acquire.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
+    asm volatile("ld.acquire.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
   #else
-    asm("ld.volatile.sys.global.u64 %0, [%1]; membar.gl;" : "=l"(ans) : "l"(cvta_to_global(ptr)));
+    asm volatile("ld.volatile.sys.global.u64 %0, [%1]; membar.gl;" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
   #endif
   return ans;
 }
diff --git a/src/device/primitives.h b/src/device/primitives.h
index 01cad705ab..1913640e89 100644
--- a/src/device/primitives.h
+++ b/src/device/primitives.h
@@ -115,19 +115,25 @@ struct PrimitivesWithoutDirect {
   __device__ void directSendFromOutput(intptr_t outIx, int eltN) {
     static_cast<RealPrimitives*>(this)->sendFromOutput(outIx, eltN);
   }
-  __device__ void directRecv(intptr_t outIx, int eltN) {
+  __device__ void directRecv(intptr_t inpIx, intptr_t outIx, int eltN) {
     static_cast<RealPrimitives*>(this)->recv(outIx, eltN, /*postOp=*/false);
   }
   __device__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
     static_cast<RealPrimitives*>(this)->copySend(inpIx, outIx, eltN, postOp);
   }
-  __device__ void directRecvCopySend(intptr_t outIx, int eltN) {
+  __device__ void directRecvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) {
     static_cast<RealPrimitives*>(this)->recvCopySend(outIx, eltN, /*postOp=*/false);
   }
-  __device__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
+  __device__ void recvReduceCopyDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
     // Direct is only for the send part
     static_cast<RealPrimitives*>(this)->recvReduceCopySend(inpIx, outIx, eltN, postOp);
   }
+  __device__ __forceinline__ void directRecvReduceDirectSend(intptr_t inpIx, intptr_t outIx, ssize_t eltN, bool postOp=false) {
+    static_cast<RealPrimitives*>(this)->recvReduceSend(inpIx, eltN);
+  }
+  __device__ __forceinline__ void directRecvReduceCopyDirectSend(intptr_t inpIx, intptr_t outIx, ssize_t eltN, bool postOp=false) {
+    static_cast<RealPrimitives*>(this)->recvReduceCopySend(inpIx, outIx, eltN, postOp);
+  }
 };
 
 #include "prims_simple.h"
diff --git a/src/device/prims_ll.h b/src/device/prims_ll.h
index 4a6f9e267b..1a1307f5c8 100644
--- a/src/device/prims_ll.h
+++ b/src/device/prims_ll.h
@@ -101,7 +101,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
     uint32_t data1, flag1, data2, flag2;
     int spins = 0;
     do {
-      asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
+      asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4) : "memory");
       if (checkAbort(spins, 0)) break;
     } while ((flag1 != flag) || (flag2 != flag));
     uint64_t val64 = data1 + (((uint64_t)data2) << 32);
@@ -112,9 +112,11 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
   __device__ void readLLBeginAll(int offset, ncclLLFifoLine(&line)[MaxRecv]) {
     #pragma unroll
     for (int i=BeginIx; i < MaxRecv; i++) {
+      // Yes, for some template arguments this code will be unreachable.  That's fine.
+      // coverity[dead_error_line]
       if (i < fan.nrecv()) {
         union ncclLLFifoLine* src = recvPtr(i) + offset;
-        asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4));
+        asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4) : "memory");
       }
     }
   }
@@ -123,7 +125,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
     uint32_t flag = recvFlag(i);
     int spins = 0;
     while (line[i].flag1 != flag || line[i].flag2 != flag) {
-      asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4));
+      asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4) : "memory");
       if (checkAbort(spins, 0)) break;
     }
     uint64_t val64 = line[i].data1 + (((uint64_t)line[i].data2) << 32);
@@ -131,7 +133,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
   }
 
   __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
-    asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
+    asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag) : "memory");
   }
 
   static constexpr int EltPerLine = sizeof(uint64_t)/sizeof(T);
@@ -145,13 +147,13 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
       uint64_t u8;
     };
     if(sizeof(U) == 1)
-      asm("ld.volatile.global.b8 %0,[%1];" : "=r"(u4) : "l"(src));
+      asm volatile("ld.volatile.global.b8 %0,[%1];" : "=r"(u4) : "l"(src) : "memory");
     else if(sizeof(U) == 2)
-      asm("ld.volatile.global.b16 %0,[%1];" : "=h"(u2) : "l"(src));
+      asm volatile("ld.volatile.global.b16 %0,[%1];" : "=h"(u2) : "l"(src) : "memory");
     else if(sizeof(U) == 4)
-      asm("ld.volatile.global.b32 %0,[%1];" : "=r"(u4) : "l"(src));
+      asm volatile("ld.volatile.global.b32 %0,[%1];" : "=r"(u4) : "l"(src) : "memory");
     else
-      asm("ld.volatile.global.b64 %0,[%1];" : "=l"(u8) : "l"(src));
+      asm volatile("ld.volatile.global.b64 %0,[%1];" : "=l"(u8) : "l"(src) : "memory");
     return elt;
   }
 
@@ -165,13 +167,13 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
     };
     elt = val;
     if(sizeof(U) == 1)
-      asm("st.volatile.global.b8 [%0],%1;" :: "l"(dst), "r"(u4));
+      asm volatile("st.volatile.global.b8 [%0],%1;" :: "l"(dst), "r"(u4) : "memory");
     else if(sizeof(U) == 2)
-      asm("st.volatile.global.b16 [%0],%1;" :: "l"(dst), "h"(u2));
+      asm volatile("st.volatile.global.b16 [%0],%1;" :: "l"(dst), "h"(u2) : "memory");
     else if(sizeof(U) == 4)
-      asm("st.volatile.global.b32 [%0],%1;" :: "l"(dst), "r"(u4));
+      asm volatile("st.volatile.global.b32 [%0],%1;" :: "l"(dst), "r"(u4) : "memory");
     else
-      asm("st.volatile.global.b64 [%0],%1;" :: "l"(dst), "l"(u8));
+      asm volatile("st.volatile.global.b64 [%0],%1;" :: "l"(dst), "l"(u8) : "memory");
   }
 
   struct DataLoader {
@@ -194,6 +196,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
       else {
         #pragma unroll
         for(int i=0; i < EltPerLine; i++) {
+          // Yes, for some template arguments this code will be unreachable.  That's fine.
+          // coverity[dead_error_line]
           if(i==0 || i < eltN)
             elt[i] = load(src + i);
         }
@@ -218,6 +222,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
     u8 = val;
     #pragma unroll
     for(int i=0; i < EltPerLine; i++) {
+      // Yes, for some template arguments this code will be unreachable.  That's fine.
+      // coverity[dead_error_line]
       if (i==0 || i < eltN)
         //store(dst+i, elt[i]);
         dst[i] = elt[i];
@@ -261,6 +267,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
       if (RECV) {
         data = !SRC ? peerData : applyReduce(redOp, peerData, data);
         #pragma unroll MaxRecv
+        // Yes, for some template arguments this code will be unreachable.  That's fine.
+        // coverity[dead_error_line]
         for (int i=1; i < MaxRecv && i < fan.nrecv(); i++) {
           peerData = readLLFinish(offset, line, i);
           data = applyReduce(redOp, peerData, data);
@@ -271,6 +279,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
 
       // Send : inter-node, then intra-node, then local
       if (SEND) {
+        // Yes, for some template arguments this code will be unreachable.  That's fine.
+        // coverity[dead_error_line]
         for (int i=1; i < MaxSend && i < fan.nsend(); i++)
           storeLL(sendPtr(i)+offset, data, sendFlag(i));
         storeLL(sendPtr(0)+offset, data, sendFlag(0));
@@ -288,6 +298,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
       postRecv();
     }
     if (SEND) {
+      // Yes, for some template arguments this code will be unreachable.  That's fine.
+      // coverity[dead_error_line]
       for (int i=1; i < MaxSend && i < fan.nsend(); i++)
         incSend(i, offset);
       incSend(0, offset);
@@ -324,8 +336,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
   __device__  Primitives(
       const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
       void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
-      uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr,
-      bool userBufReg=false, int stepSize_=0
+      uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclDevWorkColl* e = nullptr,
+      bool ipcReg = false, bool netReg = false, int stepSize_ = 0
     ):
     redOp(redOpArg),
     tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group),
@@ -334,16 +346,23 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
     // If we are going to support oneshot collNet + LL, then we would need to add connector index here
     int nrecv=0, nsend=0;
     // We compare with Fan::MaxRecv here because this->MaxRecv is always at least 1
+    // Yes, for some template arguments this code will be unreachable.  That's fine.
+    // coverity[dead_error_line]
     while (nrecv < Fan::MaxRecv && recvPeers[nrecv] >= 0) {
       loadRecvConn(&channel->peers[recvPeers[nrecv]]->recv[connIndexRecv], nrecv);
       nrecv++;
     }
+    // coverity[dead_error_line]
     while (nsend < MaxSend && sendPeers[nsend] >= 0) {
       loadSendConn(&channel->peers[sendPeers[nsend]]->send[connIndexSend], nsend);
       nsend++;
     }
     this->fan = Fan(nrecv, nsend);
+    // Coverity reports recvConn and sendConn being possibly NULL at this point but that won't actually
+    // happen given the two "while" loops just above.
+    // coverity[var_deref_model:FALSE]
     loadRecvSync();
+    // coverity[var_deref_model:FALSE]
     loadSendSync();
     setDataPtrs(inputBuf, outputBuf);
   }
diff --git a/src/device/prims_ll128.h b/src/device/prims_ll128.h
index 9c7169545a..2cb10cc499 100644
--- a/src/device/prims_ll128.h
+++ b/src/device/prims_ll128.h
@@ -234,6 +234,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
         }
       }
 
+      // Yes, for some template arguments this code will be unreachable.  That's fine.
+      // coverity[dead_error_line]
       for (int i=1; i<MaxRecv && i<fan.nrecv(); i++) {
         uint64_t flag = recvFlag(i);
         uint64_t* ptr = recvPtr(i)+ll128Offset;
@@ -272,6 +274,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
 
     /************************ Send **************************/
     if (SEND) {
+      // Yes, for some template arguments this code will be unreachable.  That's fine.
+      // coverity[dead_error_line]
       for (int i=1; i<MaxSend && i<fan.nsend(); i++) {
         uint64_t flag = sendFlag(i);
         uint64_t* ptr = sendPtr(i)+ll128Offset;
@@ -365,7 +369,7 @@ public:
       const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
       void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
       uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclDevWorkColl* e = nullptr,
-      bool userBufReg=false, int stepSize_=0
+      bool ipcReg = false, bool netReg = false, int stepSize_ = 0
     ):
     redOp(redOpArg),
     tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE),
@@ -383,7 +387,11 @@ public:
       nsend++;
     }
     this->fan = Fan(nrecv, nsend);
+    // Coverity reports recvConn and sendConn being possibly NULL at this point but that won't actually
+    // happen given the two "while" loops just above.
+    // coverity[var_deref_model:FALSE]
     loadRecvSync();
+    // coverity[var_deref_model:FALSE]
     loadSendSync();
     setDataPtrs(inputBuf, outputBuf);
   }
diff --git a/src/device/prims_simple.h b/src/device/prims_simple.h
index c026570388..945878b762 100644
--- a/src/device/prims_simple.h
+++ b/src/device/prims_simple.h
@@ -7,6 +7,12 @@
 #include "network/unpack/unpack.h"
 #include <cassert>
 
+enum primsMode {
+  primsModeDefault = 0,
+  primsModePatRs = 1,
+  primsModePatAg = 2
+};
+
 template<typename T, typename RedOp, typename Fan, int Direct,
          int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, int MultimemSrcs, int MultimemDsts>
 class Primitives<
@@ -14,21 +20,25 @@ class Primitives<
   > {
   static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
   static constexpr int Input=0, Output=1;
-  static constexpr int RoleWaitRecv = 0x04, // 0x1 0x2 are free to use
+  static constexpr int RoleInput = 0x01,
+                       RoleOutput = 0x02,
+                       RoleWaitRecv = 0x04,
                        RoleWaitSend = 0x08,
                        RolePostSend = 0x10,
                        RolePostRecv = 0x20,
                        Aborted = 0x40,
-                       UserBufferMode = 0x80,
+                       NetRegMode = 0x80,
                        ConnFifoEnabled = 0x100,
                        DirectWrite = 0x200,
                        DirectRead = 0x400,
-                       // 0x800 is free to use
+                       PatMode = 0x800,
                        NvlsMinPolling = 0x1000,
                        NetDeviceUnpack = 0x2000,
                        AnyNetDeviceUnpack = 0x4000,
                        NvlsDirectRead = 0x8000,
-                       NvlsDirectWrite = 0x10000;
+                       NvlsDirectWrite = 0x10000,
+                       IpcWrite = 0x20000,
+                       IpcRead = 0x40000;
   const int tid, tidInBlock;
   const int nthreads;
   int nworkers;
@@ -38,13 +48,15 @@ class Primitives<
   int flags;
   int group;
   uint64_t step;
+  struct ncclConnInfo* conn = NULL;
   struct ncclConnFifo* connFifo = NULL;
   T* connEltsFifo;
-  T* directBuff;
+  T* directBuff = NULL;
   uint64_t *connStepPtr;
   uint64_t connStepCache; // Cache last seen value of (*connStepPtr)
   int      connStepSize; // Connection step size
   void*    netDeviceHandle;
+  uint64_t accSize; // Accumulated size. Used by PAT operations
 
   // Don't use barrier 0 as it's used by the final sync
   __device__ void barrier() {
@@ -95,7 +107,7 @@ class Primitives<
     #if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
     if (flags & NvlsMinPolling) {
       uint64_t ans;
-      asm("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
+      asm volatile("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
       return ans;
     }
     #endif
@@ -107,8 +119,10 @@ class Primitives<
   template <int DirectRecv, int DirectSend, int Recv, int Send, int Src, int Dst>
   __device__ __forceinline__ void waitPeer(intptr_t srcIx, intptr_t dstIx, int offset, int nelts) {
     const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;
-    const bool noRecvWait = DirectRecv && Src && (flags & DirectRead);        // no wait when directly reading from remote input
+    const bool noRecvWait = DirectRecv && Src && (flags & (DirectRead | IpcRead));        // no wait when directly reading from remote input
     const bool noSendWait = DirectSend && (flags & (DirectRead|DirectWrite)); // no wait in empty send (e.g. directScatter) or direct remote write
+    // Yes, for some template arguments this code will be unreachable.  That's fine.
+    // coverity[dead_error_line]
     if (((flags & (Recv*RoleWaitRecv)) && !noRecvWait) ||
         ((flags & (Send*RoleWaitSend)) && !noSendWait)) {
       int spins = 0;
@@ -125,28 +139,30 @@ class Primitives<
 
       void **ptrs = isSendNotRecv ? (ncclShmem.groups[group].dsts + Dst)
                                   : (ncclShmem.groups[group].srcs + Src);
-      if (flags & UserBufferMode) {
+      if (flags & NetRegMode) {
          // Do nothing
       } else if ((flags & ConnFifoEnabled) && connFifo[step%NCCL_STEPS].mode == NCCL_MODE_OFFSET) {
         ptrs[index] = connEltsFifo + loadInt(&connFifo[step%NCCL_STEPS].offset)/sizeof(T);
       } else if (isSendNotRecv && DirectSend) {
-        if (flags & (DirectWrite | NvlsDirectWrite)) {
+        if (flags & (DirectWrite | NvlsDirectWrite | IpcWrite)) {
           ptrs[index] = directBuff + dstIx + offset;
-        } else if (flags & DirectRead) {  // empty send
+        } else if ((flags & DirectRead) || (flags & IpcRead)) {  // empty send
           ptrs[index] = nullptr;
         } else {
           ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
         }
       } else if (!isSendNotRecv && DirectRecv) {
-        if (flags & (DirectRead | NvlsDirectRead)) {
+        if (flags & (DirectRead | NvlsDirectRead | IpcRead)) {
           ptrs[index] = directBuff + srcIx + offset;
-        } else if (flags & DirectWrite) {
+        } else if ((flags & DirectWrite) || (flags & IpcWrite)) {
           ptrs[index] = directBuff + dstIx + offset;  // send to next from my output buffer
         } else {
           ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
         }
       }
       else {
+        // Yes, for some template arguments this code will be unreachable.  That's fine.
+        // coverity[dead_error_line]
         ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
       }
       if (flags & NetDeviceUnpack) {
@@ -182,7 +198,7 @@ class Primitives<
     int slice = 0;
     int offset = 0;
 
-    if (tid < nworkers && offset < nelem && ((flags & UserBufferMode) == 0)) {
+    if (tid < nworkers && offset < nelem && ((flags & NetRegMode) == 0)) {  // threads with NetRegMode set never take this branch
       // Worker-only loop for non-empty slices. Non-workers and empty slices are
       // processed in the loop following this if block. The benefit of splitting
       // the loop like this is we pull two branches out of the critical path.
@@ -234,7 +250,7 @@ class Primitives<
         if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]
             /* NVLS can have srcs[0] == dsts[0], but we cannot enter this "if branch",
              * so we need to check whether MultimemSrcs and MultimemDsts are 0. */
-            && MultimemSrcs == 0 && MultimemDsts == 0) {
+            && MultimemSrcs == 0 && MultimemDsts == 0 && !Src) {  // the skip-one-copy path also requires Src == 0
           // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
           if (Send) {
             reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, MaxSend, /*PreOpSrcs*/0>
@@ -250,7 +266,7 @@ class Primitives<
              Recv, ncclShmem.groups[group].srcs,
              Dst, ncclShmem.groups[group].dsts,
              workSize);
-        } else {
+        } else if (ncclShmem.groups[group].srcs[0] && ncclShmem.groups[group].dsts[0]) {  // only reduce/copy when both first pointers are non-null
           constexpr int PreOpSrcs = SrcBuf != Input ? 0 :
                                     DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1;
           reduceCopy<Unroll, RedOp, T,
@@ -265,6 +281,8 @@ class Primitives<
         postPeer<Recv, Send>(0 < sliceSize);
         offset += sliceSize;
         slice += 1;
+        // Yes, for some template arguments the loop condition below will be unreachable.  That's fine.
+        // coverity[dead_error_line]
       } while (slice < SlicePerChunk && offset < nelem);
     }
 
@@ -310,12 +328,13 @@ public:
   }
 
   template<int Recv, int Send, typename Fn>
-  __device__ __forceinline__ void process(Fn &&fn) {
+  __device__ __forceinline__ void process(Fn &&fn, uint32_t sendDirectFlag, uint32_t recvDirectFlag) {  // both flags are forwarded to fn's call operator below
     #pragma unroll 1
     for (int slice=0; slice < SlicePerChunk; slice++) {
       if (tid < nworkers) {
+        int nsend, nrecv;  // peer counts passed to fn; assigned after subBarrier()
         if (flags & (Recv*RoleWaitRecv | Send*RoleWaitSend)) {
-          bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;
+          const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;  // when both are enabled, this thread's wait role decides
           int spins = 0;
           while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) {
             connStepCache = loadStepValue(connStepPtr);
@@ -326,19 +345,53 @@ public:
           if ((flags & ConnFifoEnabled) && connFifo[step%NCCL_STEPS].mode == NCCL_MODE_OFFSET) {
             int offset = loadInt(&connFifo[step%NCCL_STEPS].offset);
             ptrs[index] = connEltsFifo + offset/sizeof(T);
+          } else if (Direct && fn.work->regUsed) {  // regUsed set: choose the pointer from this thread's direct flags
+            if (isSendNotRecv) {
+              if (flags & (DirectWrite | IpcWrite)) {  // write-type flag: send straight into directBuff
+                ptrs[index] = directBuff;
+              } else if (flags & (DirectRead | IpcRead)) {  // empty send
+                ptrs[index] = nullptr;
+              } else {
+                ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
+              }
+            } else {
+              if (flags & (DirectRead | IpcRead)) {  // read-type flag: receive straight from directBuff
+                ptrs[index] = directBuff;
+              } else if (flags & (DirectWrite | IpcWrite)) {
+                if (Send)
+                  ptrs[index] = directBuff;  // send to next from my output buffer
+                else
+                  ptrs[index] = nullptr;  // Send == 0: a null pointer is stored instead
+              } else {
+                ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
+              }
+            }
           } else {
             ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
           }
         }
         subBarrier();
-        fn.template operator()<SlicePerChunk, 0, Recv*MaxRecv, 0, Send*MaxSend>
-          (tid, nworkers, slice, stepSize*StepPerSlice,
-           fan.nrecv(), ncclShmem.groups[group].srcs,
-           fan.nsend(), ncclShmem.groups[group].dsts, ncclShmem.groups[group].dstSizes);
+        if (Recv == 0 || ncclShmem.groups[group].srcs[0] == nullptr) {  // report zero recv peers when Recv is off or srcs[0] is null
+          nrecv = 0;
+        } else {
+          nrecv = fan.nrecv();
+        }
+
+        if (Send == 0 || ncclShmem.groups[group].dsts[0] == nullptr) {  // report zero send peers when Send is off or dsts[0] is null
+          nsend = 0;
+        } else {
+          nsend = fan.nsend();
+        }
+        fn.template operator() < SlicePerChunk, 0, Recv*MaxRecv, 0, Send*MaxSend >
+          (tid, nworkers, slice, stepSize * StepPerSlice,
+            nrecv, ncclShmem.groups[group].srcs,
+            nsend, ncclShmem.groups[group].dsts, ncclShmem.groups[group].dstSizes, sendDirectFlag, recvDirectFlag);
       }
       barrier();
       int32_t dstSize = 0;
       if (flags & Send*RolePostSend) {
+        // Yes, for some template arguments this code will be unreachable.  That's fine.
+        // coverity[dead_error_begin]
         dstSize = ncclShmem.groups[group].dstSizes[index];
         ncclShmem.groups[group].dstSizes[index] = 0;
         if (flags & ConnFifoEnabled) connFifo[step%NCCL_STEPS].size = dstSize*sizeof(T);
@@ -421,99 +474,97 @@ private:
     }
   }
 
-  __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclDevWorkColl* e) {
-    if (flags & (RoleWaitRecv|RolePostRecv)) {
-      auto *conn = &peer->recv[connIndex];
-      if (conn->netDeviceHandle.netDeviceType == NCCL_NET_DEVICE_UNPACK) {
-        // handle must be a device ptr
-        netDeviceHandle = conn->netDeviceHandle.handle;
-        // Cache the handle
-        ncclNetDeviceUnpackSetup(netDeviceHandle, group, index);
-        flags |= NetDeviceUnpack;
-      }
-      step = conn->step;
-      step = roundUp(step, SlicePerChunk*StepPerSlice);
-      if (flags & RolePostRecv) {
-        connStepPtr = conn->head;
-        *connStepPtr = step; // Return credits in case we rounded up.
-      }
-      if (flags & RoleWaitRecv) {
-        ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
-        flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
-        connStepPtr = conn->tail;
-        connStepCache = loadStepValue(connStepPtr);
-        connStepSize = conn->stepSize/sizeof(T);
-        connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
-        if (conn->connFifo != nullptr) {
-          flags |= ConnFifoEnabled;
-          connFifo = conn->connFifo;
-        } else if (Direct) {
-          // User buffers have been registered
-          if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
-            if (connIndex == 1 && P2p == 0) {
-              flags |= DirectRead;  // scatter-reduce use direct pull
-            } else {
-              flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
-                       (e->direct & NCCL_DIRECT_READ)  ? DirectRead  : 0;
-            }
-          } else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
-            if (connIndex == 1 && P2p == 0) {
-              flags |= DirectRead;  // scatter-reduce use direct pull
-            } else {
-              // direct read not allowed in non-register case
-              // otherwise, in one-to-multi send, we could mix empty send and intermediate send
-              flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
-            }
-          } else if ((conn->flags & NCCL_NVLS_MIN_POLL) && e != nullptr && e->regUsed) {
-            /* NVLS direct */
-            flags |= NvlsDirectRead;
+  __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int regFlag) {  // the constructor only calls this for threads holding a recv role
+    conn = &peer->recv[connIndex];  // kept for later use (the destructor writes step back through conn)
+    if (conn->netDeviceHandle.netDeviceType == NCCL_NET_DEVICE_UNPACK) {
+      // handle must be a device ptr
+      netDeviceHandle = conn->netDeviceHandle.handle;
+      // Cache the handle
+      ncclNetDeviceUnpackSetup(netDeviceHandle, group, index);
+      flags |= NetDeviceUnpack;
+    }
+    step = conn->step;
+    step = roundUp(step, SlicePerChunk*StepPerSlice);
+    if (flags & RolePostRecv) {
+      connStepPtr = conn->head;
+      *connStepPtr = step; // Return credits in case we rounded up.
+    }
+    if (flags & RoleWaitRecv) {
+      if ((flags & PatMode) == 0) ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs(); skipped in PatMode
+      flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
+      connStepPtr = conn->tail;
+      connStepCache = loadStepValue(connStepPtr);
+      connStepSize = conn->stepSize/sizeof(T);
+      connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
+      if (conn->connFifo != nullptr) {
+        flags |= ConnFifoEnabled;
+        connFifo = conn->connFifo;
+      } else if (Direct && regFlag) {  // no direct flag of any kind is set unless regFlag is non-zero
+        // User buffers have been registered
+        if (conn->flags & (NCCL_IPC_READ | NCCL_IPC_WRITE)) {
+          if (P2p) {  // P2p: the connection's own flags pick write vs. read
+            flags |= conn->flags & NCCL_IPC_WRITE ? IpcWrite : IpcRead;  // parses as flags |= ((conn->flags & NCCL_IPC_WRITE) ? IpcWrite : IpcRead)
+          } else if (connIndex == 1 && direct) {  // connIndex 1 with any direct bit set: always read
+            flags |= IpcRead;
+          } else {
+            flags |= direct & NCCL_DIRECT_READ ? IpcRead : IpcWrite;  // the direct argument decides; write when the read bit is clear
           }
+        } else if (conn->flags & (NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) {
+          if (P2p) {  // same three-way choice as the IPC case, with the Direct* flags
+            flags |= conn->flags & NCCL_DIRECT_WRITE ? DirectWrite : DirectRead;
+          } else if (connIndex == 1 && direct) {
+            flags |= DirectRead;  // scatter-reduce use direct pull
+          } else {
+            flags |= direct & NCCL_DIRECT_READ ? DirectRead : DirectWrite;
+          }
+        } else if ((conn->flags & NCCL_NVLS_MIN_POLL)) {
+          /* NVLS direct */
+          flags |= NvlsDirectRead;
         }
       }
     }
   }
 
-  __device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, struct ncclDevWorkColl* e) {
-    if (flags & (RoleWaitSend|RolePostSend)) {
-      auto *conn = &peer->send[connIndex];
-      step = conn->step;
-      step = roundUp(step, SlicePerChunk*StepPerSlice);
+  __device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int regFlag) {  // the constructor only calls this for threads holding a send role
+    conn = &peer->send[connIndex];  // kept for later use (the destructor writes step back through conn)
+    step = conn->step;
+    step = roundUp(step, SlicePerChunk*StepPerSlice);
 
-      connFifo = conn->connFifo;
-      if (connFifo != nullptr) flags |= ConnFifoEnabled;
+    connFifo = conn->connFifo;
+    if (connFifo != nullptr) flags |= ConnFifoEnabled;  // set for both send roles, not only RoleWaitSend
 
-      if (flags & RolePostSend) {
-        connStepPtr = conn->tail;
-        connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
-      }
-      if (flags & RoleWaitSend) {
-        ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
-        flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
-        connStepPtr = conn->head;
-        connStepCache = loadStepValue(connStepPtr);
-        connStepSize = conn->stepSize/sizeof(T);
-        connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
-        if (connFifo == nullptr && Direct) {
-          // User buffers have been registered
-          if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
-            if (connIndex == 1 && P2p == 0) {
-              flags |= DirectRead;  // scatter-reduce use direct pull
-            } else {
-              flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
-                       (e->direct & NCCL_DIRECT_READ)  ? DirectRead  : 0;
-            }
-          } else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
-            if (connIndex == 1 && P2p == 0) {
-              flags |= DirectRead;  // scatter-reduce use direct pull
-            } else {
-              // direct read not allowed in non-register case
-              // otherwise, in one-to-multi send, we could mix empty send and intermediate send
-              flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
-            }
-          } else if ((conn->flags & NCCL_NVLS_MIN_POLL) && e != nullptr && e->regUsed) {
-            /* NVLS direct */
-            flags |= NvlsDirectWrite;
+    if (flags & RolePostSend) {
+      connStepPtr = conn->tail;
+      connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
+    }
+    if (flags & RoleWaitSend) {
+      if ((flags & PatMode) == 0) ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs(); skipped in PatMode
+      flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
+      connStepPtr = conn->head;
+      connStepCache = loadStepValue(connStepPtr);
+      connStepSize = conn->stepSize/sizeof(T);
+      connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
+      if (connFifo == nullptr && Direct && regFlag) {  // no direct flag of any kind is set unless regFlag is non-zero
+        // User buffers have been registered
+        if (conn->flags & (NCCL_IPC_READ | NCCL_IPC_WRITE)) {
+          if (P2p) {  // P2p: the connection's own flags pick write vs. read
+            flags |= conn->flags & NCCL_IPC_WRITE ? IpcWrite : IpcRead;  // parses as flags |= ((conn->flags & NCCL_IPC_WRITE) ? IpcWrite : IpcRead)
+          } else if (connIndex == 1 && direct) {  // connIndex 1 with any direct bit set: always read
+            flags |= IpcRead;
+          } else {
+            flags |= direct & NCCL_DIRECT_READ ? IpcRead : IpcWrite;  // the direct argument decides; write when the read bit is clear
           }
+        } else if (conn->flags & (NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) {
+          if (P2p) {  // same three-way choice as the IPC case, with the Direct* flags
+            flags |= conn->flags & NCCL_DIRECT_WRITE ? DirectWrite : DirectRead;
+          } else if (connIndex == 1 && direct) {
+            flags |= DirectRead;  // scatter-reduce use direct pull
+          } else {
+            flags |= direct & NCCL_DIRECT_READ ? DirectRead : DirectWrite;
+          }
+        } else if ((conn->flags & NCCL_NVLS_MIN_POLL)) {
+          /* NVLS direct */
+          flags |= NvlsDirectWrite;
         }
       }
     }
@@ -523,7 +574,8 @@ private:
   __device__ Primitives(
       int tid, int nthreads, int const *recvPeers, int const *sendPeers,
       void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
-      uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* e = nullptr,bool userBufReg=false, int stepSize_=0
+      uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* e = nullptr,  // when e is non-null, e->direct and e->regUsed are used and ipcReg is ignored
+      bool ipcReg = false, bool netReg = false, int stepSize_ = 0, int mode = primsModeDefault  // netReg sets NetRegMode; mode selects default vs. PAT role assignment
     ):
     tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group),
     stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) {
@@ -531,33 +583,71 @@ private:
     // For send operations, we need an extra warp to overlap the threadfence and the copy
     this->nworkers = nthreads - (MaxSend > 0 && nthreads >= NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE ? WARP_SIZE : 0);
 
-    int nrecv=0, nsend=0;
-    while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++;
-    while (nsend < MaxSend && sendPeers[nsend] != -1) nsend++;
-    this->fan = Fan(nrecv, nsend);
-
-    constexpr int ThreadPerSync =
-      MaxSend >= 16 || MaxRecv >= 16 ? 32 : // NVLS may have an arity > 8. In that case increase the size of the groups
-      MaxSend >= 8 || MaxRecv >= 8 ? 16 :
-      8; // Allows for all roles (WaitRecv/WaitSend/PostRecv/PostSend) within a single warp
-    static_assert(MaxSend <= ThreadPerSync && MaxRecv <= ThreadPerSync, "Not enough threads to cover all peers");
-
-    index = -1;
+    int peer = -1;  // stays -1 for threads that end up with no role
     flags = 0;
-    assert(2*(nrecv+nsend) <= nthreads); // Ensure no thread is assigned more than one role.
-    if      (tid < nrecv)                 { flags |= RoleWaitRecv; index = tid; }
-    else if (tid < nrecv+nsend)           { flags |= RoleWaitSend; index = tid-nrecv; }
-    else if (nthreads-nsend <= tid)       { flags |= RolePostSend; index = tid-(nthreads-nsend); }
-    else if (nthreads-nrecv-nsend <= tid) { flags |= RolePostRecv; index = tid-(nthreads-nrecv-nsend); }
+    index = -1;
+    if (mode == primsModeDefault) { // Connect to ranks in sendPeers/recvPeers
+      int nrecv=0, nsend=0;
+      // Yes, for some template arguments this code will be unreachable.  That's fine.
+      // coverity[dead_error_line]
+      while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++;  // count peers up to the first -1 entry
+      // coverity[dead_error_line]
+      while (nsend < MaxSend && sendPeers[nsend] != -1) nsend++;  // count peers up to the first -1 entry
+      this->fan = Fan(nrecv, nsend);
 
-    int peer = 0;
-    if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index];
-    if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index];
+      constexpr int ThreadPerSync =
+        MaxSend >= 16 || MaxRecv >= 16 ? 32 : // NVLS may have an arity > 8. In that case increase the size of the groups
+        MaxSend >= 8 || MaxRecv >= 8 ? 16 :
+        8; // Allows for all roles (WaitRecv/WaitSend/PostRecv/PostSend) within a single warp
+      static_assert(MaxSend <= ThreadPerSync && MaxRecv <= ThreadPerSync, "Not enough threads to cover all peers");
 
-    loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e);
-    loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e);
+      assert(2*(nrecv+nsend) <= nthreads); // Ensure no thread is assigned more than one role.
+      // Coverity assumes that index will equal tid based on the line below, but it doesn't consider the setting
+      // of flags.  This results in multiple false positive overruns being reported here and in all_reduce.h.
+      // Unfortunately, we've been unsuccessful in trying to silence them with a single directive here so
+      // instead it's being done at the callers.
+      // coverity[assignment:FALSE]
+      if      (tid < nrecv)                 { flags |= RoleWaitRecv; index = tid; }
+      // Yes, for some template arguments this code will be unreachable.  That's fine.
+      // coverity[dead_error_begin]
+      else if (tid < nrecv+nsend)           { flags |= RoleWaitSend; index = tid-nrecv; }
+      else if (nthreads-nsend <= tid)       { flags |= RolePostSend; index = tid-(nthreads-nsend); }
+      else if (nthreads-nrecv-nsend <= tid) { flags |= RolePostRecv; index = tid-(nthreads-nrecv-nsend); }
 
-    if (userBufReg) flags |= UserBufferMode;
+      if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index];
+      if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index];
+    } else if (mode == primsModePatRs || mode == primsModePatAg) { // Connect to all ranks +/- 2^n
+      flags |= PatMode;
+      accSize = 0;
+      int nranks = ncclShmem.comm.nRanks;
+      int rank = ncclShmem.comm.rank;
+      // A thread is responsible for rank +/- 2 ^ (tid%32). That should be fine as long as rank is a 32-bit integer.
+      index = tid % 32;
+      uint32_t delta = 1u << index;  // unsigned literal: index can be 31, which must not shift into the sign bit of an int
+      const int roles[4] = { RoleWaitRecv, RoleWaitSend, RolePostSend, RolePostRecv};  // one role per block of 32 threads
+      int block = tid / 32;
+      if (block < 4 && delta < nranks) {  // threads whose delta reaches nranks get no role
+        int role = roles[block];
+        if (mode == primsModePatRs) {  // PatRs: receive from rank-delta, send to rank+delta
+          if (role & (RoleWaitRecv|RolePostRecv)) peer = (rank - delta + nranks) % nranks;
+          if (role & (RoleWaitSend|RolePostSend)) peer = (rank + delta) % nranks;
+        } else if (mode == primsModePatAg) {  // PatAg: the opposite directions
+          if (role & (RoleWaitSend|RolePostSend)) peer = (rank - delta + nranks) % nranks;
+          if (role & (RoleWaitRecv|RolePostRecv)) peer = (rank + delta) % nranks;
+        }
+        flags |= role;
+      } else if (tid == 128) {  // first thread after the four 32-thread role blocks
+        flags |= RoleInput | RoleOutput; // Only one will be used depending on the operation
+      }
+    }
+
+    // Coverity thinks that index could be -1 here but that's not actually the case.
+    // coverity[negative_returns:FALSE]
+    if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e ? e->direct : 0, e ? e->regUsed : ipcReg);
+    // coverity[negative_returns:FALSE]
+    if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e ? e->direct : 0, e ? e->regUsed : ipcReg);
+
+    if (netReg) flags |= NetRegMode;  // applied to every thread, whether or not it has a role
 
     if (barrierAny(flags & NetDeviceUnpack)) {
       flags |= AnyNetDeviceUnpack;
@@ -569,18 +659,14 @@ private:
       }
     }
 
-    setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)e);
+    // coverity[negative_returns:FALSE]
+    setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)e, (uint8_t)(e ? e->regUsed : ipcReg), peer);  // same registration value as passed to loadRecvConn/loadSendConn
   }
 
   __device__ ~Primitives() {
-    // Ensure ncclShmem.groups[].send/recvConns are available
-    barrier();
     // Save steps for the next operation
-    if (flags & (RolePostSend|RolePostRecv)) {
-      auto *conns = (flags & RolePostSend) ? ncclShmem.groups[group].sendConns : ncclShmem.groups[group].recvConns;
-      conns[index]->step = step;
-    }
-    if ((flags & UserBufferMode) && (flags & RoleWaitSend)) {
+    if (flags & (RolePostSend|RolePostRecv)) conn->step = step;  // conn was set by loadRecvConn/loadSendConn for this thread's role
+    if ((flags & NetRegMode) && (flags & RoleWaitSend)) {  // only the WaitSend role waits, and only in NetRegMode
       // Make sure we wait until the proxy has sent data before we return.
       // We don't want the next CUDA kernel to overwrite the send buffer which
       // was accessed directly.
@@ -599,97 +685,111 @@ private:
     barrier();
   }
 
-  __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclDevWorkCollReg* e) {
+  __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclDevWorkCollReg* work, uint8_t ipcReg, int peer) {
     if (tid==0) {
       ncclShmem.groups[group].userInput = (void*)inputBuf;
       ncclShmem.groups[group].userOutput = (void*)outputBuf;
       ncclShmem.redOpArgs[0] = redOpArg;  // scaler for local input
     }
-    bool recvProvider = flags == (flags|RoleWaitRecv|DirectWrite);
-    bool sendAcceptor = (flags == (flags|RoleWaitSend|DirectWrite)) || (flags == (flags|RoleWaitSend|NvlsDirectWrite));
-    bool sendProvider = flags == (flags|RoleWaitSend|DirectRead); // sender provides direct buffer (to be fetched)
-    bool recvAcceptor = flags == (flags|RoleWaitRecv|DirectRead) || (flags == (flags|RoleWaitRecv|NvlsDirectRead)); // receiver accepts direct buffer
-    int regUsed = e != nullptr ? e->coll.regUsed : 0;
 
-    if (Direct && recvProvider) {
-      int spins = 0;
-      void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
-      // Wait for consumer to consume previous value before trampling it.
-      if (slot) {
-        while (*slot != nullptr && !checkAbort(spins));
-        directBuff = (T*)outputBuf;
-        // Encode pointer by XOR'ing against some address they definitely wouldn't send
-        // since we want to allow them sending us nullptr while not colliding with
-        // the empty slot value.
-        *slot = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(directBuff) ^ reinterpret_cast<uintptr_t>(slot));
-      }
-    }
-    if (Direct && sendAcceptor) {
-      int spins = 0;
-      void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
-      void *ptr;
-      while (slot) {
-        ptr = *slot;
-        if (ptr != nullptr || checkAbort(spins)) break;
-      }
-
-      if (slot) {
-        directBuff = regUsed ? (T*)(e->dnOutputs[index]) :
-                   reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(ptr) ^ reinterpret_cast<uintptr_t>(slot));
-        *slot = nullptr;
-      } else {
-        /* slot is NULL, it must be regUsed == 1 */
-        directBuff = (T*)e->dnOutputs[index];
-      }
-    }
-    if (Direct && sendProvider) {
-      int spins = 0;
-      void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
-      volatile uint64_t* argSlot0 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange;
-      volatile uint64_t* argSlot1 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange+1;
-      // Wait for consumer to consume previous value before trampling it.
-      if (slot && argSlot0 && argSlot1) {
-        while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 !=0) && !checkAbort(spins));
-        // If there is no recv, then we are directly pulling from input buffer (e.g. directScatter)
-        // Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend)
-        directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf;
-        // Exchange pre-scalers for use in direct pull
-        *argSlot0 = (uint64_t(1)<<32) | (uint32_t)redOpArg;
-        *argSlot1 = (uint64_t(1)<<32) | (uint32_t)(redOpArg>>32);
-        // Encode pointer by XOR'ing against some address they definitely wouldn't send
-        // since we want to allow them sending us nullptr while not colliding with
-        // the empty slot value.
-        *slot = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(directBuff) ^ reinterpret_cast<uintptr_t>(slot));
-      }
-    }
-    if (Direct && recvAcceptor) {
-      int spins = 0;
-      void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
-      volatile uint64_t* argSlot0 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange;
-      volatile uint64_t* argSlot1 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange+1;
-      void *ptr;
-      while (slot) {
-        ptr = *slot;
-        if (ptr != nullptr || checkAbort(spins)) break;
-      }
-
-      if (slot && argSlot0 && argSlot1) {
-        directBuff = regUsed ? (T*)(MaxSend == 0 ? e->upOutputs[index] : e->dnInputs[index]) :
-          reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(ptr) ^ reinterpret_cast<uintptr_t>(slot));
-        if (MaxSend != 0) { // reduce group rather than gather group
-          // Store scalers for remote inputs
-          uint64_t arg0, arg1;
-          while (true) {
-            arg0 = *argSlot0;
-            arg1 = *argSlot1;
-            if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break;
+    if (Direct && ipcReg) {  // the whole pointer exchange below is skipped unless ipcReg is non-zero
+      bool recvProvider = (flags & RoleWaitRecv) && (flags & DirectWrite || flags & IpcWrite);  // receiver publishes a pointer through its recv connection
+      bool sendAcceptor = (flags & RoleWaitSend) && (flags & DirectWrite || flags & IpcWrite || flags & NvlsDirectWrite);  // sender picks up a pointer from its send connection
+      bool sendProvider = (flags & RoleWaitSend) && (flags & DirectRead || flags & IpcRead); // sender provides direct buffer (to be fetched)
+      bool recvAcceptor = (flags & RoleWaitRecv) && (flags & DirectRead || flags & IpcRead || flags & NvlsDirectRead); // receiver accepts direct buffer
+      if (recvProvider) {
+        int spins = 0;
+        void* volatile* slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
+        // Wait for consumer to consume previous value before trampling it.
+        if (slot) {
+          T* exchgPtr;
+          directBuff = (T*)outputBuf;
+          while (*slot != nullptr && !checkAbort(spins));
+          if (P2p) {  // P2p: publish the local output pointer itself
+            exchgPtr = (T*)outputBuf;
+          } else {  // otherwise publish recvbuffOffset plus the address stored for the peer's local rank
+            int localPeer = ncclShmem.comm.rankToLocalRank[peer];
+            exchgPtr = (T*)(work->coll.recvbuffOffset + work->coll.recvbuffRmtAddrs[localPeer]);  // NOTE(review): work is dereferenced unchecked; presumably non-null whenever ipcReg is set and P2p is 0 - confirm
           }
-          ncclShmem.redOpArgs[1 + index] = ((arg1 & 0xffffffff) << 32) | (arg0 & 0xffffffff);
+          *slot = reinterpret_cast<void*>(exchgPtr);  // the pointer is stored unencoded
+        }
+      }
+      if (sendAcceptor) {
+        int spins = 0;
+        void* volatile* slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
+        void* ptr;
+        while (slot) {  // spin until a pointer appears or checkAbort() fires
+          ptr = *slot;
+          if (ptr != nullptr || checkAbort(spins)) break;
+        }
+
+        if (slot) {
+          directBuff = reinterpret_cast<T*>(ptr);  // read back unencoded, matching the provider side
+          *slot = nullptr;  // mark the slot as consumed
+        } else {
+          directBuff = (T*)work->dnOutputs[index];  // no exchange slot: use work->dnOutputs; NOTE(review): assumes work is non-null here - confirm
+        }
+      }
+      if (sendProvider) {
+        int spins = 0;
+        void* volatile* slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
+        volatile uint64_t* argSlot0 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange;
+        volatile uint64_t* argSlot1 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange + 1;
+        // Wait for consumer to consume previous value before trampling it.
+        if (slot && argSlot0 && argSlot1) {
+          T* exchgPtr;
+          while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 != 0) && !checkAbort(spins));
+          // If there is no recv, then we are directly pulling from input buffer (e.g. directScatter)
+          // Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend)
+          directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf;
+          if (P2p) {  // P2p: publish the same local pointer as directBuff
+            exchgPtr = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf;
+          } else {  // otherwise publish the send/recv buffer address stored for the peer's local rank
+            int localPeer = ncclShmem.comm.rankToLocalRank[peer];
+            if (MaxRecv == 0)
+              exchgPtr = (T*)(work->coll.sendbuffOffset + work->coll.sendbuffRmtAddrs[localPeer]);
+            else
+              exchgPtr = (T*)(work->coll.recvbuffOffset + work->coll.recvbuffRmtAddrs[localPeer]);
+          }
+
+          // Exchange pre-scalers for use in direct pull
+          *argSlot0 = (uint64_t(1) << 32) | (uint32_t)redOpArg;  // bit 32 is set so the slot is non-zero even when the low half of redOpArg is 0
+          *argSlot1 = (uint64_t(1) << 32) | (uint32_t)(redOpArg >> 32);
+          *slot = reinterpret_cast<T*>(exchgPtr);  // exchgPtr is already a T*, so the cast does not change the value
+        }
+      }
+      if (recvAcceptor) {
+        int spins = 0;
+        void* volatile* slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
+        volatile uint64_t* argSlot0 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange;
+        volatile uint64_t* argSlot1 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange + 1;
+        void* ptr;
+        while (slot) {  // spin until a pointer appears or checkAbort() fires
+          ptr = *slot;
+          if (ptr != nullptr || checkAbort(spins)) break;
+        }
+
+        if (slot && argSlot0 && argSlot1) {
+          directBuff = reinterpret_cast<T*>(ptr);  // read back unencoded, matching the provider side
+          if (MaxSend != 0) { // reduce group rather than gather group
+            // Store scalers for remote inputs
+            uint64_t arg0, arg1;
+            while (true) {
+              arg0 = *argSlot0;
+              arg1 = *argSlot1;
+              if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break;
+            }
+            ncclShmem.redOpArgs[1 + index] = ((arg1 & 0xffffffff) << 32) | (arg0 & 0xffffffff);  // rebuild the 64-bit argument from the two low halves
+          }
+          *argSlot0 = 0; *argSlot1 = 0;  // zero the slots; the provider waits for zero before posting again
+          *slot = nullptr;
+        } else {
+          // Coverity complains about work being possibly NULL below.  However, slot
+          // being NULL means that the NVLS buffer is registered (regUsed == 1)
+          // so work can't be NULL in this code path.
+          // coverity[var_deref_op]
+          directBuff = (T*)work->dnInputs[index];
         }
-        *argSlot0 = 0; *argSlot1 = 0;
-        *slot = nullptr;
-      } else {
-        directBuff = (T*)e->dnInputs[index];
       }
     }
   }
@@ -717,8 +817,8 @@ private:
   __device__ __forceinline__ void recv(intptr_t outIx, int eltN, bool postOp=false) {
     genericOp<0, 0, 1, 0, -1, Output>(-1, outIx, eltN, postOp);
   }
-  __device__ __forceinline__ void directRecv(intptr_t outIx, int eltN) {
-    genericOp<1, 0, 1, 0, -1, Output>(-1, outIx, eltN, /*postOp=*/false);
+  __device__ __forceinline__ void directRecv(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {  // inpIx is passed through as genericOp's first index argument
+    genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, postOp);  // same instantiation as directRecvCopy below, which always passes postOp=false
   }
   __device__ __forceinline__ void directRecvCopy(intptr_t inpIx, intptr_t outIx, int eltN) {
     genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, /*postOp=*/false);
@@ -737,8 +837,8 @@ private:
   __device__ __forceinline__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) {
     genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
   }
-  __device__ __forceinline__ void directRecvCopySend(intptr_t outIx, int eltN) {
-    genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, false);
+  __device__ __forceinline__ void directRecvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) {  // recvCopySend with the first two genericOp template flags set to 1
+    genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp);  // postOp is forwarded to genericOp
   }
   __device__ __forceinline__ void directRecvDirectSend(intptr_t inpIx, intptr_t outIx, int eltN) {
     genericOp<1, 1, 1, 1, -1, -1>(inpIx, outIx, eltN, false);
@@ -750,6 +850,9 @@ private:
   __device__ __forceinline__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
     genericOp<0, 0, 1, 0, Input, Output>(inpIx, outIx, eltN, postOp);
   }
+  __device__ __forceinline__ void directRecvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
+    genericOp<1, 0, 1, 0, Input, Output>(inpIx, outIx, eltN, postOp); // Same as recvReduceCopy but with the first (direct-receive) template flag set
+  }
 
   __device__ __forceinline__ void recvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) {
     genericOp<0, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp);
@@ -757,14 +860,20 @@ private:
   __device__ __forceinline__ void directRecvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) {
     genericOp<1, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp);
   }
+  __device__ __forceinline__ void directRecvReduceDirectSend(intptr_t inpIx, intptr_t outIx, ssize_t eltN, bool postOp=false) { // NOTE(review): eltN is ssize_t here but int in directRecvReduceSend — confirm intended
+    genericOp<1, 1, 1, 1, Input, -1>(inpIx, outIx, eltN, postOp); // Both leading direct flags set; outIx is forwarded even though the last (destination buffer) argument is -1
+  }
 
   __device__ __forceinline__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
     genericOp<0, 0, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
   }
-  __device__ __forceinline__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
+  __device__ __forceinline__ void recvReduceCopyDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
     // Direct is only for the send part
     genericOp<0, 1, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
   }
+  __device__ __forceinline__ void directRecvReduceCopyDirectSend(intptr_t inpIx, intptr_t outIx, ssize_t eltN, bool postOp=false) { // NOTE(review): eltN is ssize_t here but int in recvReduceCopyDirectSend — confirm intended
+    genericOp<1, 1, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp); // Same as recvReduceCopySend but with both leading direct flags set
+  }
 
   __device__ __forceinline__ void
   scatter(intptr_t inpIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift) {
@@ -783,4 +892,126 @@ private:
   directGather(intptr_t outIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift) {
     ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
   }
+  // One PAT reduce step: combine data received from peer recvPow2 (if >= 0) with local or previously accumulated data, writing to peer sendPow2's FIFO or, when sendPow2 < 0, to userOutput.
+  __device__ __forceinline__ void patReduce(int recvPow2, int sendPow2, intptr_t inpIx, intptr_t outIx, int recvOffset, int sendOffset, int sendStepOffset, int nelem, int postRecv, int postSend) {
+    nelem = nelem < 0 ? 0 : nelem; // A negative count is treated as an empty operation
+    T* userInput = (T*)ncclShmem.groups[group].userInput;
+    T* userOutput = (T*)ncclShmem.groups[group].userOutput;
+    // Receive side: the waiting thread for peer recvPow2 stores the FIFO slot address in srcs[0], then waits for the data.
+    if (recvPow2 >= 0 && recvPow2 == index && (flags & RoleWaitRecv)) {
+      ncclShmem.groups[group].srcs[0] = (T*)(connEltsFifo + (step%NCCL_STEPS)*connStepSize) + recvOffset;
+      int spins = 0;
+      while (connStepCache < step + StepPerSlice) { // Spin until the loaded step value covers this slice
+        connStepCache = loadStepValue(connStepPtr);
+        if (checkAbort(spins)) break; // Stop waiting as soon as checkAbort reports true
+      }
+      if (postRecv) step += StepPerSlice; // Advance only when this op also posts the receive (see postPeer at the end)
+    }
+    if (sendPow2 >= 0 && sendPow2 == index && (flags & RoleWaitSend)) { // Send side: destination is the FIFO slot of peer sendPow2
+      int spins = 0;
+      while (connStepCache + NCCL_STEPS < step + sendStepOffset + StepPerSlice) { // Spin until the slot sendStepOffset steps ahead is within NCCL_STEPS of the loaded step value
+        connStepCache = loadStepValue(connStepPtr);
+        if (checkAbort(spins)) break;
+      }
+      ncclShmem.groups[group].dsts[0] = (T*)(connEltsFifo + ((step+sendStepOffset)%NCCL_STEPS)*connStepSize) + sendOffset;
+      if (accSize < sendOffset + nelem + (step+sendStepOffset)*connStepSize) { // accSize acts as a high-water mark of what this thread has already written
+        // First write to this region: the second source is our own input data.
+        ncclShmem.groups[group].srcs[1] = userInput + inpIx;
+        accSize = sendOffset + nelem + (step+sendStepOffset)*connStepSize;
+        if (flags & ConnFifoEnabled)
+          connFifo[(step+sendStepOffset)%NCCL_STEPS].size = (sendOffset + nelem)*sizeof(T); // Byte count covering everything up to the end of this write
+      } else {
+        // The region already holds a partial result: read it back so the reduction accumulates in place.
+        ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0];
+      }
+      if (postSend) step += StepPerSlice; // Advance only when this op also posts the send
+    }
+    if (sendPow2 < 0 && (flags & RoleOutput)) { // Destination is our own local buffer
+      ncclShmem.groups[group].dsts[0] = userOutput + outIx;
+      if (accSize < outIx + nelem) { // Same high-water-mark test, this time against the output index
+        // First write to this region: the second source is our own input data.
+        ncclShmem.groups[group].srcs[1] = userInput + inpIx;
+        accSize = outIx + nelem;
+      } else {
+        // The region already holds a partial result: read it back so the reduction accumulates in place.
+        ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0];
+      }
+    }
+    barrier(); // presumably makes the srcs/dsts pointers stored above visible to every thread of the group — confirm
+    int nSrcs = 2;
+    void** srcs = ncclShmem.groups[group].srcs;
+    if (recvPow2 < 0) { srcs++; nSrcs--; } // No peer to receive from, remove one source
+
+    int workSize = ncclShmem.aborted ? 0 : nelem; // Process nothing once an abort has been flagged
+
+    reduceCopy<Unroll, RedOp, T, 0, 1, 2, 0, 1, 1, /*PreOpSrcs*/0> // i.e. MultimemSrcs=0, MinSrcs=1, MaxSrcs=2, MultimemDsts=0, MinDsts=1, MaxDsts=1
+      (tid, nthreads, ncclShmem.redOpArgs[0],  nullptr, /*postOp=*/false,
+       nSrcs, srcs, 1, ncclShmem.groups[group].dsts, workSize);
+
+    barrier(); // presumably ensures all threads finished the reduction before the peers are signalled — confirm
+    if (postRecv && recvPow2 >= 0 && recvPow2 == index && (flags & RolePostRecv)) postPeer<1, 0>(0 < nelem);
+    if (postSend && sendPow2 >= 0 && sendPow2 == index && (flags & RolePostSend)) postPeer<0, 1>(0 < nelem);
+  }
+  // One PAT copy step: read from peer recvPow2's FIFO (or userInput when recvPow2 < 0) and write to peer sendPow2's FIFO (if >= 0) and, for data not yet delivered, to userOutput.
+  __device__ __forceinline__ void patCopy(int recvPow2, int sendPow2, intptr_t inpIx, intptr_t outIx, int recvOffset, int sendOffset, int recvStepOffset, int nelem, int postRecv, int postSend) {
+    nelem = nelem < 0 ? 0 : nelem; // A negative count is treated as an empty operation
+    T* userInput = (T*)ncclShmem.groups[group].userInput;
+    T* userOutput = (T*)ncclShmem.groups[group].userOutput;
+    // Receive side: the waiting thread for peer recvPow2 stores the FIFO slot address in srcs[0], then waits for the data.
+    if (recvPow2 >= 0 && recvPow2 == index && (flags & RoleWaitRecv)) {
+      ncclShmem.groups[group].srcs[0] = (T*)(connEltsFifo + ((step+recvStepOffset)%NCCL_STEPS)*connStepSize) + recvOffset;
+      int spins = 0;
+      while (connStepCache < step + recvStepOffset + StepPerSlice) { // Spin until the loaded step value covers the slot recvStepOffset steps ahead
+        connStepCache = loadStepValue(connStepPtr);
+        if (checkAbort(spins)) break; // Stop waiting as soon as checkAbort reports true
+      }
+      if (accSize < recvOffset + nelem + (step+recvStepOffset)*connStepSize) { // accSize acts as a high-water mark of what was already copied out
+        // New data, copy to our output buffer.
+        ncclShmem.groups[group].dsts[1] = userOutput + outIx;
+        accSize = recvOffset + nelem + (step+recvStepOffset)*connStepSize;
+      } else {
+        ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; // Already done; dsts[1] == srcs[0] makes the code below drop this destination
+      }
+      if (postRecv) step += StepPerSlice; // Advance only when this op also posts the receive
+    }
+    if (sendPow2 >= 0 && sendPow2 == index && (flags & RoleWaitSend)) { // Send side: dsts[0] is the FIFO slot of peer sendPow2
+      int spins = 0;
+      while (connStepCache + NCCL_STEPS < step + StepPerSlice) { // Spin until the current slot is within NCCL_STEPS of the loaded step value
+        connStepCache = loadStepValue(connStepPtr);
+        if (checkAbort(spins)) break;
+      }
+      ncclShmem.groups[group].dsts[0] = (T*)(connEltsFifo + (step%NCCL_STEPS)*connStepSize) + sendOffset;
+      if (postSend) { // The slot size is recorded and the step advanced only on the op that posts the send
+        if (flags & ConnFifoEnabled)
+          connFifo[step%NCCL_STEPS].size = (sendOffset + nelem)*sizeof(T); // Byte count covering everything up to the end of this write
+        step += StepPerSlice;
+      }
+    }
+    if (recvPow2 < 0 && (flags & RoleInput)) { // Source is our own local buffer
+      ncclShmem.groups[group].srcs[0] = userInput + inpIx;
+      if (accSize < inpIx + nelem) { // Same high-water-mark test, this time against the input index
+        // New data, copy to our output buffer.
+        ncclShmem.groups[group].dsts[1] = userOutput + outIx;
+        accSize = inpIx + nelem;
+      } else {
+        ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; // Already done; dsts[1] == srcs[0] makes the code below drop this destination
+      }
+    }
+    barrier(); // presumably makes the srcs/dsts pointers stored above visible to every thread of the group — confirm
+    int nDsts = 2;
+    void** dsts = ncclShmem.groups[group].dsts;
+    if (sendPow2 < 0) { dsts++; nDsts--; } // No peer to send to, remove one dest
+    if (ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[1]) nDsts--; // In-place or already done.
+    // NOTE(review): nDsts reaches 0 if sendPow2 < 0 and the data is already done, while MinDsts below is 1 — confirm that combination cannot occur.
+    int workSize = ncclShmem.aborted ? 0 : nelem; // Process nothing once an abort has been flagged
+
+    reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, 2, /*PreOpSrcs*/0> // i.e. MultimemSrcs=0, MinSrcs=1, MaxSrcs=1, MultimemDsts=0, MinDsts=1, MaxDsts=2
+      (tid, nthreads, ncclShmem.redOpArgs[0],  nullptr, /*postOp=*/false,
+       1, ncclShmem.groups[group].srcs, nDsts, dsts, workSize);
+
+    barrier(); // presumably ensures all threads finished the copy before the peers are signalled — confirm
+    if (postRecv && recvPow2 >= 0 && recvPow2 == index && (flags & RolePostRecv)) postPeer<1, 0>(0 < nelem);
+    if (postSend && sendPow2 >= 0 && sendPow2 == index && (flags & RolePostSend)) postPeer<0, 1>(0 < nelem);
+  }
+
 };
diff --git a/src/device/reduce.h b/src/device/reduce.h
index 91cdaeb251..f8597a6504 100644
--- a/src/device/reduce.h
+++ b/src/device/reduce.h
@@ -23,6 +23,9 @@ namespace {
     size_t offset;
     int nelem;
 
+    // Coverity reports that the callee treats &ring->next as an array.  However, due to the use of
+    // FanSymmetric<1>, only the first element is ever accessed, so it's fine.
+    // coverity[callee_ptr_arith:FALSE]
     Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
       prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg);
 
diff --git a/src/device/reduce_kernel.h b/src/device/reduce_kernel.h
index 9e78da98a4..b069c07ec9 100644
--- a/src/device/reduce_kernel.h
+++ b/src/device/reduce_kernel.h
@@ -234,10 +234,10 @@ struct Apply_Reduce<FuncProd<uint8_t>, /*EltPerPack=*/4> {
     uint32_t a = apack.native;
     uint32_t b = bpack.native;
     uint32_t ab0 = (a*b) & 0xffu;
-    asm("mad.lo.u32 %0, %1, %2, %0;" : "+r"(ab0) : "r"(a&0xff00u), "r"(b&0xff00u));
+    asm volatile("mad.lo.u32 %0, %1, %2, %0;" : "+r"(ab0) : "r"(a&0xff00u), "r"(b&0xff00u));
     uint32_t ab1;
-    asm("mul.hi.u32 %0, %1, %2;"     : "=r"(ab1) : "r"(a&0xff0000), "r"(b&0xff0000));
-    asm("mad.hi.u32 %0, %1, %2, %0;" : "+r"(ab1) : "r"(a&0xff000000u), "r"(b&0xff000000u));
+    asm volatile("mul.hi.u32 %0, %1, %2;"     : "=r"(ab1) : "r"(a&0xff0000), "r"(b&0xff0000));
+    asm volatile("mad.hi.u32 %0, %1, %2, %0;" : "+r"(ab1) : "r"(a&0xff000000u), "r"(b&0xff000000u));
     apack.native = __byte_perm(ab0, ab1, 0x6420);
     return apack;
   }
@@ -260,8 +260,12 @@ SPECIALIZE_REDUCE(FuncMinMax, double, 1, double, fn.isMinNotMax ? fmin(x, y) : f
 
 #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
   SPECIALIZE_REDUCE(FuncSum, half, 1, half, __hadd(x, y))
+  // Coverity recommends the use of std::move here but, given that half is a scalar,
+  // a plain copy will be just as efficient.
+  // coverity[copy_constructor_call]
   SPECIALIZE_REDUCE(FuncSum, half, 2, half2, __hadd2(x, y))
   SPECIALIZE_REDUCE(FuncProd, half, 1, half, __hmul(x, y))
+  // coverity[copy_constructor_call]
   SPECIALIZE_REDUCE(FuncProd, half, 2, half2, __hmul2(x, y))
 #else
   SPECIALIZE_REDUCE(FuncSum, half, 1, half, __float2half(__half2float(x) + __half2float(y)))
@@ -270,6 +274,7 @@ SPECIALIZE_REDUCE(FuncMinMax, double, 1, double, fn.isMinNotMax ? fmin(x, y) : f
 
 #if __CUDA_ARCH__ >= 800
   SPECIALIZE_REDUCE(FuncMinMax, half, 1, half, fn.isMinNotMax ? __hmin(x, y) : __hmax(x, y))
+  // coverity[copy_constructor_call]
   SPECIALIZE_REDUCE(FuncMinMax, half, 2, half2, fn.isMinNotMax ? __hmin2(x, y) : __hmax2(x, y))
 #else
   SPECIALIZE_REDUCE(FuncMinMax, half, 1, half, __float2half(fn.isMinNotMax ? fminf(__half2float(x), __half2float(y)) : fmaxf(__half2float(x), __half2float(y))))
@@ -278,10 +283,13 @@ SPECIALIZE_REDUCE(FuncMinMax, double, 1, double, fn.isMinNotMax ? fmin(x, y) : f
 #if defined(__CUDA_BF16_TYPES_EXIST__)
 #if __CUDA_ARCH__ >= 800
   SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 1, __nv_bfloat16, __hadd(x, y))
+  // coverity[copy_constructor_call]
   SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 2, __nv_bfloat162, __hadd2(x, y))
   SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 1, __nv_bfloat16, __hmul(x, y))
+  // coverity[copy_constructor_call]
   SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 2, __nv_bfloat162, __hmul2(x, y))
   SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 1, __nv_bfloat16, fn.isMinNotMax ? __hmin(x, y) : __hmax(x, y))
+  // coverity[copy_constructor_call]
   SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 2, __nv_bfloat162, fn.isMinNotMax ? __hmin2(x, y) : __hmax2(x, y))
 #else
   SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(__bfloat162float(x) + __bfloat162float(y)))
@@ -402,6 +410,9 @@ struct FuncPreMulSum {
 };
 
 template<>
+// Coverity recommends the users of this type to use std::move in certain cases but,
+// given that half is a scalar, a plain copy will be just as efficient.
+// coverity[moveable_type]
 struct FuncPreMulSum<half> {
   using EltType = half;
 #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
@@ -424,6 +435,9 @@ struct FuncPreMulSum<half> {
 
 #if defined(__CUDA_BF16_TYPES_EXIST__)
   template<>
+  // Coverity recommends the users of this type to use std::move in certain cases but,
+  // given that __nv_bfloat16 is a scalar, a plain copy will be just as efficient.
+  // coverity[moveable_type]
   struct FuncPreMulSum<__nv_bfloat16> {
     using EltType = __nv_bfloat16;
   #if __CUDA_ARCH__ >= 800
@@ -584,9 +598,9 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
     static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \
     __device__ static BytePack<PackSize> load(FuncSum<T> fn, uintptr_t addr) { \
       BytePack<PackSize> ans; \
-      asm("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
+      asm volatile("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
         : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
-        : "l"(addr)); \
+        : "l"(addr) : "memory"); \
       return ans; \
     } \
   };
@@ -597,13 +611,13 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
     __device__ static BytePack<PackSize> load(FuncMinMax<T> fn, uintptr_t addr) { \
       BytePack<PackSize> ans; \
       if (fn.isMinNotMax) { \
-        asm("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
+        asm volatile("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
           : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
-          : "l"(addr)); \
+          : "l"(addr) : "memory"); \
       } else { \
-        asm("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
+        asm volatile("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
           : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
-          : "l"(addr)); \
+          : "l"(addr) : "memory"); \
       } \
       return ans; \
     } \
@@ -615,12 +629,12 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
     static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \
     __device__ static BytePack<PackSize> load(FuncSum<T> fn, uintptr_t addr) { \
       BytePack<PackSize> ans; \
-      asm("multimem.ld_reduce.relaxed.sys.global.add.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
+      asm volatile("multimem.ld_reduce.relaxed.sys.global.add.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
         : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
           "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
           "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
           "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \
-        : "l"(addr)); \
+        : "l"(addr) : "memory"); \
       return ans; \
     } \
   };
@@ -631,19 +645,19 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
     __device__ static BytePack<PackSize> load(FuncMinMax<T> fn, uintptr_t addr) { \
       BytePack<PackSize> ans; \
       if (fn.isMinNotMax) { \
-        asm("multimem.ld_reduce.relaxed.sys.global.min.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
+        asm volatile("multimem.ld_reduce.relaxed.sys.global.min.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
           : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
             "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
             "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
             "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \
-          : "l"(addr)); \
+          : "l"(addr) : "memory"); \
       } else { \
-        asm("multimem.ld_reduce.relaxed.sys.global.max.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
+        asm volatile("multimem.ld_reduce.relaxed.sys.global.max.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
           : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
             "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
             "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
             "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \
-          : "l"(addr)); \
+          : "l"(addr) : "memory"); \
       } \
       return ans; \
     } \
@@ -655,9 +669,9 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
   struct Apply_LoadMultimem<FuncSum<T>, sizeof(T)> { \
     __device__ static BytePack<sizeof(T)> load(FuncSum<T> fn, uintptr_t addr) { \
       BytePack<2*sizeof(T)> tmp; \
-      asm("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
+      asm volatile("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
         : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
-        : "l"(addr & -uintptr_t(2*sizeof(T)))); \
+        : "l"(addr & -uintptr_t(2*sizeof(T))) : "memory"); \
       return tmp.half[(addr/sizeof(T))%2]; \
     } \
   };
@@ -668,13 +682,13 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
     __device__ static BytePack<sizeof(T)> load(FuncMinMax<T> fn, uintptr_t addr) { \
       BytePack<2*sizeof(T)> tmp; \
       if (fn.isMinNotMax) { \
-        asm("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
+        asm volatile("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
           : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
-          : "l"(addr & -uintptr_t(2*sizeof(T)))); \
+          : "l"(addr & -uintptr_t(2*sizeof(T))) : "memory"); \
       } else { \
-        asm("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
+        asm volatile("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
           : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
-          : "l"(addr & -uintptr_t(2*sizeof(T)))); \
+          : "l"(addr & -uintptr_t(2*sizeof(T))) : "memory"); \
       } \
       return tmp.half[(addr/sizeof(T))%2]; \
     } \
diff --git a/src/device/reduce_scatter.h b/src/device/reduce_scatter.h
index cf068ff553..f7b3c25e58 100644
--- a/src/device/reduce_scatter.h
+++ b/src/device/reduce_scatter.h
@@ -24,6 +24,9 @@ namespace {
     uint32_t nelem;
     int rankDest;
 
+    // Coverity reports that the callee treats &ring->next as an array.  However, due to the use of
+    // FanSymmetric<1>, only the first element is ever accessed, so it's fine.
+    // coverity[callee_ptr_arith:FALSE]
     Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
       prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg);
 
@@ -74,6 +77,32 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_L
   }
 };
 
+template<typename T, typename RedOp>
+struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE> { // ReduceScatter specialization for the PAT algorithm with the Simple protocol
+  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
+    using Proto = ProtoSimple<1, 1>;
+    const int nranks = ncclShmem.comm.nRanks;
+    const int rank = ncclShmem.comm.rank;
+    size_t count, channelOffset, channelCount, chunkCount;
+    ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &channelOffset, &channelCount, &chunkCount); // Fills in the element counts/offsets for this channel
+    // No peer lists are passed to the primitives (NULL, NULL); primsModePatRs presumably selects the PAT reduce-scatter connection setup — confirm.
+    T *inputBuf = (T*)work->sendbuff;
+    T *outputBuf = (T*)work->recvbuff;
+    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
+      (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, false, false, 0, primsModePatRs);
+    // The algorithm object is built over this channel's element range [channelOffset, channelOffset + channelCount).
+    PatRSAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
+    int last = 0;
+    while (!last) { // getNextOp sets `last` to end the loop; the op it produced on that call is still executed
+      int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, postRecv, postSend, nelem;
+      size_t inpIx, outIx;
+      patAlgo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend, last); // Produces the parameters of the next step
+      prims.patReduce(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend); // recvDim/sendDim are passed as patReduce's recvPow2/sendPow2
+    }
+  }
+};
+
+
 template<typename T, typename RedOp>
 struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
   __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
@@ -88,7 +117,7 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_S
     size_t offset;
     int nelem;
 
-    /* if we are direct NVLS, we only need to allocate 1 warp to scatter for sync; 
+    /* if we are direct NVLS, we only need to allocate 1 warp to scatter for sync;
      * if not, based on #ranks, we allocate 7 or 5 warps to reduce to saturate bandwidth
      * and the rest are allocated to scatter. */
     const int nThreadsReduce = work->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : (nranks <= 6 ? 7 * WARP_SIZE : 5 * WARP_SIZE);
@@ -143,6 +172,9 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_S
           size_t outOffset = gridOffset + elemOffset;
           size_t inpOffset = outOffset + rank * count;
           nelem = min(chunkCount, channelCount - elemOffset);
+          // Coverity complains about a possible overrun inside the method invoked below, but that's actually
+          // a false positive.
+          // coverity[overrun-call:FALSE]
           prims.directRecvCopy(inpOffset, outOffset, nelem);
         }
 
@@ -164,7 +196,7 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
     template<int SlicePerChunk, int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts>
     __device__ __forceinline__ void operator()(
         int tid, int tn, int slice, int maxSliceSize,
-        int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes
+        int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes, uint32_t sendDirectFlag, uint32_t recvDirectFlag
       ) {
       static_assert(SlicePerChunk==1, "require: SlicePerChunk==1");
       static_assert(MaxDsts<=1 || MaxSrcs<=1, "require: MaxDsts<=1 || MaxSrcs<=1");
@@ -199,19 +231,23 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
           int delta = min(railAllEnd, railOneEnd) - (railAllBeg+railAllOffset);
           int rank = ncclShmem.comm.collNetDenseToUserRank[node*nRails + rail];
           ssize_t userOneBeg = rank*sizePerRank + railOneOffset;
-          reduceCopy<ncclCollUnroll(), RedOp, T,
+          if (nDsts != 0) {
+            reduceCopy<ncclCollUnroll(), RedOp, T,
                      /*MultimemSrcs=*/0, 1+MinSrcs, 1+MaxSrcs,
                      /*MultimemDsts,MinDsts,MaxDsts=*/0,1,1,
                      /*PreOpSrcs=*/1>
             (tid, tn, work->redOpArg, &work->redOpArg, false,
              /*nSrcs=*/1+nSrcs, [=]__device__(int s) {
                return s==0 ? (T*)inbuf + userOneBeg
+                           : work->regUsed && (recvDirectFlag & NCCL_DIRECT_READ)
+                           ? (T*)srcPtrs[s-1] + userOneBeg
                            : (T*)srcPtrs[s-1] + railAllOffset;
              },
              /*nDsts=*/1, [=]__device__(int d/*==0*/) {
                return (T*)dstPtrs[dst] + railAllOffset;
              },
              delta);
+          }
           railAllOffset += delta;
           node += 1;
         }
@@ -245,15 +281,15 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
     int tn = nWarps1*WARP_SIZE;
     if (tid < tn) {
       // Phase 1: Scatter inputs to peers
-      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
-        prims(tid, tn, nullptr, direct->heads+1, nullptr, nullptr,
-              work->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
+      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
+        prims(tid, tn, nullptr, direct->heads+1, work->sendbuff, nullptr,
+              work->redOpArg, 0*Proto::MaxGroupWidth, 1, 1, work);
       for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
         Scatterer</*ReduceSendNotRecv=*/true> scat;
         scat.work = work;
         scat.chunkSize = chunkSize;
         scat.railGridOffset = railGridOffset;
-        prims.template process</*Recv=*/0, /*Send=*/1>(scat);
+        prims.template process</*Recv=*/0, /*Send=*/1>(scat, NCCL_DIRECT_READ, 0);
       }
       return;
     }
@@ -269,15 +305,15 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
         __syncwarp();
       } else {
         // Phase 2: Reduce from peers + local input -> send to network
-        Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
+        Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/1, Proto, 0>
           prims(tid, tn, direct->heads + 1, &direct->out, nullptr, nullptr,
-            work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
+            work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1, work);
         for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
           Scatterer</*ReduceSendNotRecv=*/false> scat;
           scat.work = work;
           scat.chunkSize = chunkSize;
           scat.railGridOffset = railGridOffset;
-          prims.template process</*Recv=*/1, /*Send=*/1>(scat);
+          prims.template process</*Recv=*/1, /*Send=*/1>(scat, 0, NCCL_DIRECT_READ);
         }
       }
       return;
diff --git a/src/device/sendrecv.h b/src/device/sendrecv.h
index 7774202a13..9b039a41a0 100644
--- a/src/device/sendrecv.h
+++ b/src/device/sendrecv.h
@@ -15,11 +15,11 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
   template<typename Proto>
   __device__ void runSend(int tid, int tn, int group, struct ncclDevWorkP2p* work) {
     size_t bytes = work->sendBytes;
-    int chunkSize = u32fp8Decode(work->sendChunkSize_u32fp8);
+    int chunkSize = work->sendIpcReg && ncclShmem.comm.isNvlink ? (1 << 30) : u32fp8Decode(work->sendChunkSize_u32fp8);
     Primitives<T, RedOp, FanAsymmetric<0, 1>, 1, Proto, 1>
       prims(tid, tn, nullptr, &work->sendRank, work->sendAddr, nullptr,
             /*redOpArg(ignored)=*/0, group, 1, 1, nullptr,
-            /*userBufferMode=*/work->sendRegistered, ncclShmem.comm.p2pChunkSize);
+            /*ipcReg=*/work->sendIpcReg, /*netReg=*/work->sendRegistered, ncclShmem.comm.p2pChunkSize);
     size_t cursor = 0;
     do {
       int n = min(size_t(chunkSize), bytes-cursor);
@@ -31,15 +31,15 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
   template<typename Proto>
   __device__ void runRecv(int tid, int tn, int group, struct ncclDevWorkP2p* work) {
     size_t bytes = work->recvBytes;
-    int chunkSize = u32fp8Decode(work->recvChunkSize_u32fp8);
+    int chunkSize = work->recvIpcReg && ncclShmem.comm.isNvlink ? (1 << 30) : u32fp8Decode(work->recvChunkSize_u32fp8);
     Primitives<T, RedOp, FanAsymmetric<1, 0>, 1, Proto, 1>
       prims(tid, tn, &work->recvRank, nullptr, nullptr, work->recvAddr,
             /*redOpArg(ignored)=*/0, group, 1, 1, nullptr,
-            /*userBufferMode=*/work->recvRegistered, ncclShmem.comm.p2pChunkSize);
+            /*ipcReg=*/work->recvIpcReg, /*netReg=*/work->recvRegistered, ncclShmem.comm.p2pChunkSize);
     size_t cursor = 0;
     do {
       int n = min(size_t(chunkSize), bytes-cursor);
-      prims.directRecv(cursor, n);
+      prims.directRecv(cursor, cursor, n);
       cursor += n;
     } while (cursor < bytes && work->recvRegistered == 0);
   }
@@ -80,6 +80,9 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
           (isSend ? work->sendBytes : work->recvBytes) = partEnd - partBeg;
         }
       }
+      // Coverity reports a possible thread divergence due to not all threads participating in the collective.
+      // However, the code ensures that the participation is on a per-warp basis.
+      // coverity[device_thread_diverged:FALSE]
       uint32_t mask = __ballot_sync(~0u, hasWork);
       if (lane == 0) {
         shared->workSendMask = mask>>16;
diff --git a/src/enqueue.cc b/src/enqueue.cc
index 0e07e3f257..4edb42decc 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -11,6 +11,7 @@
 #include "bootstrap.h"
 #include "channel.h"
 #include "cudawrap.h"
+#include "profiler.h"
 #include "transport.h"
 
 #include <cstring> // std::memcpy
@@ -121,6 +122,10 @@ static void addWorkBatchToPlan(
   if (newBatch || extendBatch) {
     if (!newBatch) batch->nextExtends = extendBatch; // Extending the previous batch.
     struct ncclWorkBatchList* batchNode = ncclMemoryStackAlloc<ncclWorkBatchList>(&comm->memScoped);
+    // Coverity thinks that ncclIntruQueueEnqueue will access chan->workBatchQueue->tail, which might
+    // be NULL.  But that code is guarded by chan->workBatchQueue->head not being NULL, in which
+    // case tail won't be NULL either.
+    // coverity[var_deref_model:FALSE]
     ncclIntruQueueEnqueue(&chan->workBatchQueue, batchNode);
     batch = &batchNode->batch;
     batch->nextExtends = 0;
@@ -239,7 +244,29 @@ static ncclResult_t cleanupIpc(struct ncclComm* comm, struct ncclCommCallback* c
   return ncclSuccess;
 }
 
-static ncclResult_t registerIntraNodeBuffers(
+static ncclResult_t registerCheckP2PConnection(struct ncclComm* comm, struct ncclConnector* conn, struct ncclTopoGraph* graph, int peer, bool* needReg) { // Sets *needReg to whether the connection to `peer` is (or could be) an IPC/direct one
+  if (conn->connected) { // Connection already established: decide from its flags
+    if (conn->conn.flags & (NCCL_IPC_READ | NCCL_IPC_WRITE | NCCL_DIRECT_READ | NCCL_DIRECT_WRITE)) {
+      *needReg = true; // Any IPC or direct read/write flag means registration is needed
+    } else {
+      // network connection
+      *needReg = false;
+    }
+  } else { // Not connected yet: ask the transport whether it could connect us to the peer
+    struct ncclPeerInfo* peerInfo = &comm->peerInfo[peer];
+    struct ncclPeerInfo* myInfo = &comm->peerInfo[comm->rank];
+    int canConnect = 0;
+    NCCLCHECK(ncclTransports[0]->canConnect(&canConnect, comm, graph, myInfo, peerInfo)); // ncclTransports[0] is presumably the P2P transport — confirm; NCCLCHECK presumably returns early on failure
+    if (canConnect) {
+      *needReg = true;
+    } else {
+      *needReg = false;
+    }
+  }
+  return ncclSuccess; // *needReg has been written on every path that reaches this return
+}
+
+static ncclResult_t registerCollBuffers(
     struct ncclComm* comm, struct ncclTaskColl* info,
     void* outRegBufSend[NCCL_MAX_LOCAL_RANKS],
     void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS],
@@ -250,8 +277,10 @@ static ncclResult_t registerIntraNodeBuffers(
 
   info->regBufType = NCCL_REGULAR_BUFFER;
   *regNeedConnect = true;
+  if (!(ncclParamLocalRegister() || (comm->planner.persistent && ncclParamGraphRegister()))) goto exit;
 #if CUDART_VERSION >= 11030
-  if ((info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) && comm->nvlsRegSupport) {
+  if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) {
+    if (!comm->nvlsRegSupport || info->opDev.op == ncclDevPreMulSum) goto exit;
     bool regBufUsed = false;
     const void *sendbuff = info->sendbuff;
     void *recvbuff = info->recvbuff;
@@ -284,60 +313,6 @@ static ncclResult_t registerIntraNodeBuffers(
       }
       info->regBufType = NCCL_NVLS_REG_BUFFER;
     }
-  } else if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT &&   // limited to CollNetDirect for now
-    comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other
-    comm->intraRanks < comm->localRanks &&  // only with inter-process & intra-node peers
-    comm->planner.persistent && 0) {
-    /* Disable CollnetDirect registration since it does not support cuMem* allocated memory. */
-    int localRank = comm->localRank;
-    cudaPointerAttributes sattr, rattr;
-
-    CUDACHECK(cudaPointerGetAttributes(&sattr, info->sendbuff));
-    CUDACHECK(cudaPointerGetAttributes(&rattr, info->recvbuff));
-    if (sattr.type != cudaMemoryTypeDevice || rattr.type != cudaMemoryTypeDevice) return ncclSuccess;
-
-    if (CUPFN(cuMemGetAddressRange) == nullptr) return ncclSuccess;
-
-    struct HandlePair {
-      cudaIpcMemHandle_t ipc[2]; // {send, recv}
-      size_t offset[2]; // {send, recv}
-    };
-    struct HandlePair handles[NCCL_MAX_LOCAL_RANKS];
-
-    CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[0], (void*)info->sendbuff), result, fallback);
-    CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[1], (void*)info->recvbuff), result, fallback);
-
-    void *baseSend, *baseRecv;
-    size_t size;
-    CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &size, (CUdeviceptr)info->sendbuff));
-    handles[localRank].offset[0] = (char*)info->sendbuff - (char*)baseSend;
-    CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &size, (CUdeviceptr)info->recvbuff));
-    handles[localRank].offset[1] = (char*)info->recvbuff - (char*)baseRecv;
-
-    NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, handles, sizeof(struct HandlePair)));
-
-    // Open handles locally
-    for (int i=0; i < comm->localRanks; i++) {
-      if (i == localRank) { // Skip self
-        outRegBufSend[i] = nullptr;
-        outRegBufRecv[i] = nullptr;
-      } else {
-        for (int sr=0; sr < 2; sr++) {
-          // Get base address of mapping
-          void* base;
-          CUDACHECK(cudaIpcOpenMemHandle(&base, handles[i].ipc[sr], cudaIpcMemLazyEnablePeerAccess));
-          // Get real buffer address by adding offset in the mapping
-          (sr == 0 ? outRegBufSend : outRegBufRecv)[i] = (char*)base + handles[i].offset[sr];
-          // Enqueue reminder to close memory handle
-          struct ncclIpcCleanupCallback* cb = (struct ncclIpcCleanupCallback*)malloc(sizeof(struct ncclIpcCleanupCallback));
-          cb->base.fn = cleanupIpc;
-          cb->ptr = base;
-          ncclIntruQueueEnqueue(cleanupQueue, &cb->base);
-          info->nCleanupQueueElts += 1;
-        }
-      }
-    }
-    info->regBufType = NCCL_IPC_REG_BUFFER;
   } else if ((info->algorithm == NCCL_ALGO_COLLNET_DIRECT || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && comm->collNetRegSupport && info->opDev.op != ncclDevPreMulSum && info->opDev.op != ncclDevSumPostDiv) {
     size_t elementSize = ncclTypeSize(info->datatype);
     size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count);
@@ -356,27 +331,200 @@ static ncclResult_t registerIntraNodeBuffers(
     }
 
     if ((sendRegBufFlag == 0 || recvRegBufFlag == 0) && comm->planner.persistent && ncclParamGraphRegister()) {
-      ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle, cleanupQueue, &info->nCleanupQueueElts);
-      info->sendMhandle = sendHandle;
-      if (sendRegBufFlag) {
+      if (!sendRegBufFlag) {
+        ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle, cleanupQueue, &info->nCleanupQueueElts);
+        info->sendMhandle = sendHandle;
+      }
+      if (sendRegBufFlag && !recvRegBufFlag) {
         ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &recvRegBufFlag, &recvHandle, cleanupQueue, &info->nCleanupQueueElts);
         info->recvMhandle = recvHandle;
       }
     }
 
     if (sendRegBufFlag && recvRegBufFlag) {
-      info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 1));
+      info->nMaxChannels = 1;
       info->regBufType = NCCL_COLLNET_REG_BUFFER;
       if (sendRegBufFlag == 1 && recvRegBufFlag == 1) {
         INFO(NCCL_REG, "rank %d successfully registered collNet sendbuff %p (handle %p), sendbuff size %ld, recvbuff %p (handle %p), recvbuff size %ld", comm->rank, info->sendbuff, sendHandle, sendbuffSize, info->recvbuff, recvHandle, recvbuffSize);
       }
     }
+  } else if (comm->intraNodeP2pSupport && info->protocol == NCCL_PROTO_SIMPLE) {
+    // IPC buffer registration
+    if (info->func == ncclFuncReduceScatter) goto exit;
+    if (info->algorithm == NCCL_ALGO_RING && ((info->func == ncclFuncAllReduce && info->sendbuff == info->recvbuff) || info->func == ncclFuncReduce)) goto exit;
+    if ((info->algorithm == NCCL_ALGO_TREE || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && info->sendbuff == info->recvbuff) goto exit;
+    if (info->func == ncclFuncAllGather && info->algorithm == NCCL_ALGO_PAT) goto exit;
+
+    int peerRanks[NCCL_MAX_LOCAL_RANKS];
+    int nPeers = 0;
+    size_t elementSize = ncclTypeSize(info->datatype);
+    size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count);
+    size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count);
+    int regBufFlag = 0;
+    memset(peerRanks, 0xff, sizeof(int) * NCCL_MAX_LOCAL_RANKS);
+
+    if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) {
+      struct ncclChannel* channel = comm->channels;
+      for (int r = 0; r < NCCL_MAX_DIRECT_ARITY; ++r) {
+        for (int updown = 0; updown < 2; ++updown) {
+          int peer;
+          if (updown == 0)
+            peer = channel->collnetDirect.up[r];
+          else
+            peer = channel->collnetDirect.down[r];
+          if (peer != -1) {
+            struct ncclConnector* peerConn = &channel->peers[peer]->recv[0];
+            bool needReg = false;
+
+            NCCLCHECK(registerCheckP2PConnection(comm, peerConn, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], peer, &needReg));
+            if (needReg) {
+              bool found = false;
+              for (int p = 0; p < nPeers; ++p) {
+                if (peerRanks[p] == peer) {
+                  found = true;
+                  break;
+                }
+              }
+              if (!found) peerRanks[nPeers++] = peer;
+            }
+          }
+        }
+      }
+
+      if (nPeers > 0) {
+        if (ncclParamLocalRegister())
+          ncclIpcLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->sendbuffOffset, &info->sendbuffRmtAddrs);
+        if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) {
+          ncclIpcGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->sendbuffOffset, &info->sendbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
+        }
+        if (regBufFlag) {
+          if (ncclParamLocalRegister())
+            ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs);
+          if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) {
+            ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
+          }
+        }
+      }
+      if (regBufFlag) {
+        info->regBufType = NCCL_IPC_REG_BUFFER;
+      }
+    } else if (info->algorithm == NCCL_ALGO_RING) {
+      struct ncclReg* recvRegRecord;
+      NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord));
+      if (recvRegRecord == NULL) goto exit;
+      for (int c = 0; c < comm->nChannels; ++c) {
+        struct ncclChannel* channel = comm->channels + c;
+        for (int r = 0; r < 2; ++r) {
+          bool needReg = false;
+          int peer;
+          struct ncclConnector* peerConn;
+          // P2P transport
+          if (r == 0)
+            peer = channel->ring.prev;
+          else
+            peer = channel->ring.next;
+          peerConn = &channel->peers[peer]->recv[0];
+          NCCLCHECK(registerCheckP2PConnection(comm, peerConn, &comm->graphs[NCCL_ALGO_RING], peer, &needReg));
+
+          if (needReg) {
+            bool found = false;
+            for (int p = 0; p < nPeers; ++p) {
+              if (peerRanks[p] == peer) {
+                found = true;
+                break;
+              }
+            }
+            if (!found) peerRanks[nPeers++] = peer;
+          }
+        }
+      }
+      if (nPeers > 0) {
+        if (ncclParamLocalRegister()) {
+          ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs);
+        }
+        if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) {
+          ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
+        }
+      }
+      if (regBufFlag) {
+        info->regBufType = NCCL_IPC_REG_BUFFER;
+      }
+    } else if (info->algorithm == NCCL_ALGO_TREE || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) {
+      struct ncclReg* recvRegRecord;
+      NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord));
+      if (recvRegRecord == NULL) goto exit;
+      for (int c = 0; c < comm->nChannels; ++c) {
+        struct ncclChannel* channel = comm->channels + c;
+        struct ncclTree* tree = NULL;
+        int peers[NCCL_MAX_TREE_ARITY + 1];
+
+        if (info->algorithm == NCCL_ALGO_TREE)
+          tree = &channel->tree;
+        else
+          tree = &channel->collnetChain;
+        for (int p = 0; p < NCCL_MAX_TREE_ARITY; ++p) peers[p] = tree->down[p];
+        peers[NCCL_MAX_TREE_ARITY] = tree->up;
+        for (int p = 0; p < NCCL_MAX_TREE_ARITY + 1; ++p) {
+          int peer = peers[p];
+          bool peerNeedReg = false;
+          struct ncclConnector* recvConn = NULL;
+          // P2P transport
+          if (peer == -1 || peer == comm->nRanks) continue;
+          recvConn = &channel->peers[peer]->recv[0];
+          NCCLCHECK(registerCheckP2PConnection(comm, recvConn, &comm->graphs[info->algorithm], peer, &peerNeedReg));
+
+          if (peerNeedReg) {
+            bool found = false;
+            for (int pindex = 0; pindex < nPeers; ++pindex) {
+              if (peerRanks[pindex] == peer) {
+                found = true;
+                break;
+              }
+            }
+            if (!found) peerRanks[nPeers++] = peer;
+          }
+        }
+      }
+      if (nPeers > 0) {
+        if (ncclParamLocalRegister()) {
+          ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs);
+        }
+        if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) {
+          ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
+        }
+      }
+      if (regBufFlag) {
+        info->regBufType = NCCL_IPC_REG_BUFFER;
+      }
+    }
+
+    if (info->regBufType == NCCL_IPC_REG_BUFFER && comm->nNodes == 1 && 16 < info->nMaxChannels && info->nMaxChannels <= 24) {
+      info->nMaxChannels = 16;
+    }
   }
-fallback:
+exit:
 #endif
   return result;
 }
 
+// Tries to IPC-register `userbuff` for send/recv with `peerRank`. *regFlag is reset to 0 and then
+// filled in by the registration calls; *regAddr is written only when *regFlag ends up non-zero.
+static ncclResult_t registerP2pBuffer(struct ncclComm* comm, void* userbuff, int peerRank, size_t size, int* regFlag, void** regAddr, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue) {
+  uintptr_t bufOffset = 0;
+  uintptr_t* rmtAddrs = NULL;
+
+  *regFlag = 0;
+  // Local registration goes first; graph registration is tried only if the flag is still clear.
+  if (ncclParamLocalRegister()) {
+    ncclIpcLocalRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &bufOffset, &rmtAddrs);
+  }
+  if (*regFlag == 0 && comm->planner.persistent && ncclParamGraphRegister()) {
+    ncclIpcGraphRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &bufOffset, &rmtAddrs, reinterpret_cast<void*>(cleanupQueue), NULL);
+  }
+  if (*regFlag != 0) *regAddr = (void*)((uintptr_t)rmtAddrs + bufOffset);
+  return ncclSuccess;
+}
+
 static ncclResult_t getCollNetSupport(struct ncclComm* comm, struct ncclTaskColl* task, int* collNetSupport);
 static ncclResult_t getAlgoInfo(
   struct ncclComm* comm, struct ncclTaskColl* task,
@@ -500,7 +648,7 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
     void* regBufSend[NCCL_MAX_LOCAL_RANKS];
     void* regBufRecv[NCCL_MAX_LOCAL_RANKS];
     bool regNeedConnect = true;
-    registerIntraNodeBuffers(comm, task, regBufSend, regBufRecv, &planner->collCleanupQueue, &regNeedConnect);
+    registerCollBuffers(comm, task, regBufSend, regBufRecv, &planner->collCleanupQueue, &regNeedConnect);
 
     if (comm->runtimeConn && comm->initAlgoChannels[task->algorithm] == false) {
       if (task->algorithm == NCCL_ALGO_NVLS_TREE && comm->initAlgoChannels[NCCL_ALGO_NVLS] == false && regNeedConnect == true) {
@@ -517,6 +665,10 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
     struct ncclDevWorkColl devWork = {};
     devWork.sendbuff = (void*)task->sendbuff;
     devWork.recvbuff = (void*)task->recvbuff;
+    devWork.sendbuffOffset = task->sendbuffOffset;
+    devWork.recvbuffOffset = task->recvbuffOffset;
+    devWork.sendbuffRmtAddrs = task->sendbuffRmtAddrs;
+    devWork.recvbuffRmtAddrs = task->recvbuffRmtAddrs;
     devWork.root = task->root;
     devWork.nWarps = task->nWarps;
     devWork.redOpArg = task->opDev.scalarArg;
@@ -527,35 +679,13 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
     struct ncclWorkList* workNode;
     switch (task->regBufType) {
     case NCCL_REGULAR_BUFFER:
+    case NCCL_IPC_REG_BUFFER:
     case NCCL_COLLNET_REG_BUFFER:
       { workNode = ncclMemoryStackAllocInlineArray<ncclWorkList, ncclDevWorkColl>(&comm->memScoped, 1);
         workNode->workType = ncclDevWorkTypeColl;
         workNode->size = sizeof(struct ncclDevWorkColl);
         memcpy((void*)(workNode+1), (void*)&devWork, workNode->size);
       } break;
-    case NCCL_IPC_REG_BUFFER:
-      { struct ncclDevWorkCollReg workReg = {};
-        workReg.coll = devWork;
-        struct ncclChannel *channel0 = &comm->channels[0];
-        for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) {
-          int peer = channel0->collnetDirect.down[i];
-          if (peer == -1) break;
-          int j = comm->rankToLocalRank[peer]; // Get intra-node slot
-          workReg.dnInputs[i] = regBufSend[j]; // Input buffer of leaf peer
-          workReg.dnOutputs[i] = regBufRecv[j]; // Output buffer of leaf peer
-        }
-        for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) {
-          int peer = channel0->collnetDirect.up[i];
-          if (peer == -1) break;
-          int j = comm->rankToLocalRank[peer];
-          // Output buffer of root peer
-          workReg.upOutputs[i] = regBufRecv[j];
-        }
-        workNode = ncclMemoryStackAllocInlineArray<ncclWorkList, ncclDevWorkCollReg>(&comm->memScoped, 1);
-        workNode->workType = ncclDevWorkTypeCollReg;
-        workNode->size = sizeof(struct ncclDevWorkCollReg);
-        memcpy((void*)(workNode+1), (void*)&workReg, workNode->size);
-      } break;
     case NCCL_NVLS_REG_BUFFER:
       { struct ncclDevWorkCollReg workReg = {};
         workReg.coll = devWork; // C++ struct assignment
@@ -590,6 +720,7 @@ static ncclResult_t scheduleCollTasksToPlan(
   int nChannels[2*2] = {0, 0, 0, 0}; // [collnet][nvls]
   int const nMaxChannels[2*2] = {comm->nChannels, comm->nvlsChannels, // [collnet][nvls]
                                  comm->nChannels, comm->nvlsChannels};
+  constexpr size_t MinTrafficPerChannel = 16 << 10; // 16K traffic as minimal
   do {
     size_t workBytes = 0;
     struct ncclTaskColl* task = ncclIntruQueueHead(&planner->collTaskQueue);
@@ -601,7 +732,7 @@ static ncclResult_t scheduleCollTasksToPlan(
       nPlanColls += 1;
       workBytes += workNode->size;
       int kind = 2*task->isCollnet + task->isNvls;
-      trafficBytes[kind] += task->trafficBytes;
+      trafficBytes[kind] += std::max(MinTrafficPerChannel, task->trafficBytes);
       nChannels[kind] += task->nMaxChannels;
       nChannels[kind] = std::min(nChannels[kind], nMaxChannels[kind]);
       task = task->next;
@@ -611,7 +742,6 @@ static ncclResult_t scheduleCollTasksToPlan(
   } while (0);
 
   int kindPrev = -1;
-  constexpr size_t MinTrafficPerChannel = 512;
   size_t trafficPerChannel = 0;
   int channelId = 0;
   size_t currentTraffic = 0;
@@ -650,14 +780,16 @@ static ncclResult_t scheduleCollTasksToPlan(
       for (int c=devWork->channelLo; c <= (int)devWork->channelHi; c++) {
         proxyOp.channelId = c;
         proxyOp.opCount = proxyOpId;
+        proxyOp.task.coll = task;
+        proxyOp.rank = comm->rank;
         addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes);
         NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOp));
       }
     } else { // not task->isCollnet
-      constexpr size_t cellSize = 16;
+      int trafficPerByte = ncclFuncTrafficPerByte(task->func, comm->nRanks);
+      size_t cellSize = divUp(divUp(MinTrafficPerChannel, (size_t)trafficPerByte), 16) * 16;
       int elementsPerCell = cellSize/elementSize;
       size_t cells = divUp(task->count*elementSize, cellSize);
-      int trafficPerByte = ncclFuncTrafficPerByte(task->func, comm->nRanks);
       size_t trafficPerElement = elementSize*trafficPerByte;
       size_t trafficPerCell = cellSize*trafficPerByte;
       size_t cellsPerChannel = std::min(cells, divUp(trafficPerChannel, trafficPerCell));
@@ -665,7 +797,7 @@ static ncclResult_t scheduleCollTasksToPlan(
       if (channelId+1 == nMaxChannels[kind]) { // On last channel everything goes to "lo"
         cellsLo = cells;
       } else {
-        cellsLo = std::min(cells, (trafficPerChannel-currentTraffic)/trafficPerCell);
+        cellsLo = std::min(cells, divUp((trafficPerChannel-currentTraffic),trafficPerCell));
       }
       int nMidChannels = (cells-cellsLo)/cellsPerChannel;
       size_t cellsHi = (cells-cellsLo)%cellsPerChannel;
@@ -725,12 +857,12 @@ static ncclResult_t scheduleCollTasksToPlan(
       // Update the current channel and vacant traffic budget.
       if (countHi != 0) {
         channelId += nChannels-1;
-        currentTraffic = countHi*trafficPerElement;
+        currentTraffic = cellsHi*elementsPerCell*trafficPerElement;
       } else if (nMidChannels != 0) {
         channelId += nChannels;
         currentTraffic = 0;
       } else {
-        currentTraffic += countLo*trafficPerElement;
+        currentTraffic += cellsLo*elementsPerCell*trafficPerElement;
       }
 
       if (currentTraffic >= trafficPerChannel && channelId+1 != nMaxChannels[kind]) {
@@ -750,7 +882,12 @@ static ncclResult_t scheduleCollTasksToPlan(
         }
         proxyOp->channelId = c;
         proxyOp->opCount = proxyOpId;
+        proxyOp->task.coll = task;
+        proxyOp->rank = comm->rank;
         addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes);
+        // Coverity reports "proxyOp->connection" as being possibly uninitialized.  It's hard to
+        // determine if that's actually true but it's also not clear if that would be an issue.
+        // coverity[uninit_use_in_call:FALSE]
         NCCLCHECK(addProxyOpIfNeeded(comm, plan, proxyOp));
       }
     }
@@ -790,6 +927,7 @@ static ncclResult_t scheduleCollTasksToPlan(
     ncclIntruQueueDequeue(&planner->collWorkQueue);
     nPlanColls -= 1;
     planner->nTasksColl -= 1;
+    ncclIntruQueueEnqueue(&plan->collTaskQueue, task);
     ncclIntruQueueEnqueue(&plan->workQueue, workNode);
     plan->workBytes += workNode->size;
   }
@@ -807,7 +945,8 @@ static ncclResult_t addP2pToPlan(
     struct ncclComm* comm, struct ncclKernelPlan* plan,
     int nChannelsMin, int nChannelsMax, int p2pRound,
     int sendRank, void* sendAddr, ssize_t sendBytes,
-    int recvRank, void* recvAddr, ssize_t recvBytes
+    int recvRank, void* recvAddr, ssize_t recvBytes,
+    struct ncclTaskP2p** p2pTasks
   ) {
   constexpr int connIndex = 1;
   bool selfSend = (sendRank == comm->rank);
@@ -842,7 +981,8 @@ static ncclResult_t addP2pToPlan(
   int chunkSize[2];
   int chunkDataSize[2];
   int chunkDataSize_u32fp8[2];
-  bool registered[2];
+  bool registered[2] = {false, false};
+  bool ipcRegistered[2] = {false, false};
 
   for (int dir=0; dir < 2; dir++) { // 0=recv, 1=send
     if (bytes[dir] != -1) protoLL[dir] &= bytes[dir] <= thresholdLL;
@@ -866,11 +1006,29 @@ static ncclResult_t addP2pToPlan(
     chunkSize[dir] = chunkDataSize[dir];
     if (protocol[dir] == NCCL_PROTO_LL) chunkSize[dir] *= 2;
 
-    registered[dir] = false;
-    if (bytes[dir] > 0 && network[dir] && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE) {
-      struct ncclReg* regRecord;
-      NCCLCHECK(ncclRegFind(comm, addrs[dir], bytes[dir], &regRecord));
-      registered[dir] = (regRecord && regRecord->nDevs);
+    if (network[dir]) {
+      if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE) {
+        struct ncclReg* regRecord;
+        NCCLCHECK(ncclRegFind(comm, addrs[dir], bytes[dir], &regRecord));
+        registered[dir] = regRecord && regRecord->nDevs;
+      }
+    } else if (bytes[dir] > 0 && addrs[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && !selfSend) {
+      int peerRank = dir ? sendRank : recvRank;
+      int regFlag = 0;
+      int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, 0);
+      struct ncclChannelPeer** channelPeers = comm->channels[channelId].peers;
+      struct ncclConnector* conn = dir ? &channelPeers[peerRank]->send[connIndex]
+        : &channelPeers[peerRank]->recv[connIndex];
+      void* regAddr = NULL;
+      if (conn->conn.flags & (NCCL_IPC_WRITE | NCCL_IPC_READ | NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) {
+        // We require users registering buffers on both sides
+        NCCLCHECK(registerP2pBuffer(comm, addrs[dir], peerRank, bytes[dir], &regFlag, &regAddr, &plan->cleanupQueue));
+        if (regFlag) {
+          if (dir == 0 && conn->conn.flags & (NCCL_IPC_WRITE | NCCL_DIRECT_WRITE)) recvAddr = regAddr;
+          else if (dir == 1 && conn->conn.flags & (NCCL_IPC_READ | NCCL_DIRECT_READ)) sendAddr = regAddr;
+        }
+      }
+      ipcRegistered[dir] = regFlag ? true : false;
     }
 
     if (bytes[dir] == -1) nChannels[dir] = 0;
@@ -900,6 +1058,7 @@ static ncclResult_t addP2pToPlan(
   work->nSendChannels = nChannels[1];
   work->sendProtoLL = protoLL[1];
   work->sendRegistered = registered[1];
+  work->sendIpcReg = ipcRegistered[1];
   work->sendChunkSize_u32fp8 = chunkDataSize_u32fp8[1];
   work->sendRank = sendRank;
   work->sendAddr = sendAddr;
@@ -907,6 +1066,7 @@ static ncclResult_t addP2pToPlan(
   work->nRecvChannels = nChannels[0];
   work->recvProtoLL = protoLL[0];
   work->recvRegistered = registered[0];
+  work->recvIpcReg = ipcRegistered[0];
   work->recvChunkSize_u32fp8 = chunkDataSize_u32fp8[0];
   work->recvRank = recvRank;
   work->recvAddr = recvAddr;
@@ -925,6 +1085,9 @@ static ncclResult_t addP2pToPlan(
     op->pattern = dir ? ncclPatternSend : ncclPatternRecv;
     op->chunkSize = chunkSize[dir];
     op->reg = registered[dir];
+    op->coll = p2pTasks[dir] ? p2pTasks[dir]->func : 0;
+    op->task.p2p = p2pTasks[dir];
+    op->rank = comm->rank;
     // The following are modified per channel part in addWorkToChannels():
     // op->buffer, op->nbytes, op->nsteps = ...;
   }
@@ -1041,13 +1204,16 @@ static ncclResult_t scheduleP2pTasksToPlan(
         if (!testBudget(budget, plan->nWorkBatches+nChannelsMax, plan->workBytes + sizeof(struct ncclDevWorkP2p))) {
           return ncclSuccess;
         }
-        NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes));
+        struct ncclTaskP2p* p2pTasks[2] = { recv, send };
+        NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes, p2pTasks));
         if (send != nullptr) {
           ncclIntruQueueDequeue(&peers[sendRank].sendQueue);
+          ncclIntruQueueEnqueue(&plan->p2pTaskQueue, send);
           comm->planner.nTasksP2p -= 1;
         }
         if (recv != nullptr) {
           ncclIntruQueueDequeue(&peers[recvRank].recvQueue);
+          ncclIntruQueueEnqueue(&plan->p2pTaskQueue, recv);
           comm->planner.nTasksP2p -= 1;
         }
       }
@@ -1100,29 +1266,44 @@ static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredProduce
   }
 }
 
+namespace {
+  // Event callback record queued by uploadWork(); `base` is first so the record can be cast from it.
+  struct uploadWork_cleanup_t {
+    struct ncclCommEventCallback base;
+    void *hostBuf;
+  };
+  // Frees the host staging buffer and destroys the event stored in the record.
+  ncclResult_t uploadWork_cleanup_fn(struct ncclComm* comm, struct ncclCommEventCallback* cb) {
+    struct uploadWork_cleanup_t* self = (struct uploadWork_cleanup_t*)cb;
+    free(self->hostBuf);
+    CUDACHECK(cudaEventDestroy(self->base.event));
+    return ncclSuccess;
+  }
+}
+
 static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* plan) {
   size_t workBytes = plan->workBytes;
   size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch);
-  void* fifoBuf;
+  void* fifoBufHost;
   uint32_t fifoCursor, fifoMask;
 
   switch (plan->workStorageType) {
   case ncclDevWorkStorageTypeArgs:
     plan->kernelArgs->workBuf = nullptr;
-    fifoBuf = (void*)plan->kernelArgs;
+    fifoBufHost = (void*)plan->kernelArgs;
     fifoCursor = sizeof(ncclDevKernelArgs) + batchBytes;
     fifoMask = ~0u;
     break;
   case ncclDevWorkStorageTypeFifo:
-    fifoBuf = comm->workFifoBuf;
+    fifoBufHost = comm->workFifoBuf;
     fifoCursor = comm->workFifoProduced;
     fifoMask = comm->workFifoBytes-1;
     waitWorkFifoAvailable(comm, fifoCursor + workBytes);
     plan->kernelArgs->workBuf = comm->workFifoBufDev;
     break;
   case ncclDevWorkStorageTypePersistent:
-    ncclMemoryStackPush(&comm->memScoped);
-    fifoBuf = ncclMemoryStackAlloc(&comm->memScoped, workBytes, /*align=*/16);
+    static_assert(16 <= alignof(max_align_t), "We rely on 16-byte alignment.");
+    fifoBufHost = malloc(workBytes);
     fifoCursor = 0;
     fifoMask = ~0u;
     break;
@@ -1144,7 +1325,7 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla
   // Write the channel-shared work structs.
   struct ncclWorkList* workNode = ncclIntruQueueHead(&plan->workQueue);
   while (workNode != nullptr) {
-    char* dst = (char*)fifoBuf;
+    char* dst = (char*)fifoBufHost;
     char* src = (char*)(workNode+1);
     for (int n = workNode->size; n != 0; n -= 16) {
       memcpy(
@@ -1164,11 +1345,39 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla
     if (comm->workFifoBufGdrHandle != nullptr) wc_store_fence();
     break;
   case ncclDevWorkStorageTypePersistent:
-    NCCLCHECK(ncclCudaMalloc(&plan->workBufPersistent, workBytes));
-    plan->kernelArgs->workBuf = plan->workBufPersistent;
-    NCCLCHECK(ncclCudaMemcpy(plan->workBufPersistent, fifoBuf, workBytes));
-    ncclMemoryStackPop(&comm->memScoped);
-    break;
+    { ncclResult_t result = ncclSuccess;
+      cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
+      void* fifoBufDev = nullptr;
+      CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
+      // From here on, leave only through finish_scope so the capture mode is exchanged back.
+      // Acquire deviceStream to gain access to deviceStream.cudaStream. Since the
+      // user's graph will be launched later, and it also acquires the deviceStream,
+      // it will observe this upload.
+      NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), result, finish_scope);
+
+      CUDACHECKGOTO(cudaMallocAsync(&fifoBufDev, workBytes, comm->memPool, comm->sharedRes->deviceStream.cudaStream), result, finish_scope);
+      plan->workBufPersistent = fifoBufDev;
+      plan->kernelArgs->workBuf = fifoBufDev;
+
+      CUDACHECKGOTO(cudaMemcpyAsync(fifoBufDev, fifoBufHost, workBytes, cudaMemcpyDefault, comm->sharedRes->deviceStream.cudaStream), result, finish_scope);
+      cudaEvent_t memcpyDone;
+      CUDACHECKGOTO(cudaEventCreateWithFlags(&memcpyDone, cudaEventDisableTiming), result, finish_scope);
+      CUDACHECKGOTO(cudaEventRecord(memcpyDone, comm->sharedRes->deviceStream.cudaStream), result, finish_scope);
+      // An allocation failure must unwind through finish_scope too, not return directly.
+      struct uploadWork_cleanup_t* cleanup;
+      NCCLCHECKGOTO(ncclCalloc(&cleanup, 1), result, finish_scope);
+      cleanup->base.fn = uploadWork_cleanup_fn;
+      cleanup->base.event = memcpyDone;
+      cleanup->hostBuf = fifoBufHost;
+      ncclIntruQueueEnqueue(&comm->eventCallbackQueue, &cleanup->base);
+
+      NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream), result, finish_scope);
+      NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), result, finish_scope);
+
+    finish_scope:
+      CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
+      if (result != ncclSuccess) return result;
+    } break;
   default: break;
   }
   return ncclSuccess;
@@ -1182,6 +1391,11 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan*
 
   struct ncclProxyOp* op = ncclIntruQueueHead(&plan->proxyOpQueue);
   while (op != nullptr) {
+    op->profilerContext = comm->profilerContext;
+    op->eActivationMask = op->coll <= ncclFuncAllReduce ? op->task.coll->eActivationMask : op->task.p2p->eActivationMask;
+    op->taskEventHandle = op->coll <= ncclFuncAllReduce ? op->task.coll->eventHandle : op->task.p2p->eventHandle;
+    ncclProfilerAddPidToProxyOp(op);
+
     uint64_t oldId = op->opCount;
     // Ignoring the bottom tag bit, opCount's are zero-based within plan so
     // translate them to the tip of the comm's history.
@@ -1216,8 +1430,12 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan*
 }
 
 static ncclResult_t hostStreamPlanTask(struct ncclComm* comm, struct ncclKernelPlan* plan) {
+  NCCLCHECK(ncclProfilerStartGroupEvent(plan));
+  NCCLCHECK(ncclProfilerStartTaskEvents(plan));
   NCCLCHECK(uploadProxyOps(comm, plan));
   NCCLCHECK(ncclProxyStart(comm));
+  NCCLCHECK(ncclProfilerStopTaskEvents(plan));
+  NCCLCHECK(ncclProfilerStopGroupEvent(plan));
   if (!plan->persistent) {
     // Notify main thread of our reclaiming. This will reclaim plan concurrently.
     ncclIntruQueueMpscEnqueue(&comm->callbackQueue, &plan->reclaimer);
@@ -1238,13 +1456,30 @@ static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback*
   struct ncclKernelPlan* plan = (struct ncclKernelPlan*)me; // cast from first member `reclaim`
   if (plan->persistent) {
     comm->persistentRefs -= 1;
-    NCCLCHECK(ncclCudaFree(plan->workBufPersistent));
+    if (plan->workStorageType == ncclDevWorkStorageTypePersistent) {
+      cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
+      CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
+      CUDACHECK(cudaFree(plan->workBufPersistent));
+      CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
+    }
     struct ncclProxyOp* q = ncclIntruQueueHead(&plan->proxyOpQueue);
     while (q != nullptr) {
       struct ncclProxyOp* q1 = q->enqNext;
       ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, q);
       q = q1;
     }
+    struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue);
+    while (ct != nullptr) {
+      struct ncclTaskColl* ct1 = ct->next;
+      ncclMemoryPoolFree(&comm->memPool_ncclTaskColl, ct);
+      ct = ct1;
+    }
+    struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue);
+    while (pt != nullptr) {
+      struct ncclTaskP2p* pt1 = pt->next;
+      ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, pt);
+      pt = pt1;
+    }
     ncclResult_t result = ncclSuccess;
     while (!ncclIntruQueueEmpty(&plan->cleanupQueue)) {
       struct ncclCommCallback* cb = ncclIntruQueueDequeue(&plan->cleanupQueue);
@@ -1286,7 +1521,7 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
       plan->comm = comm;
       plan->reclaimer.fn = reclaimPlan;
       plan->persistent = persistent;
-      // uploadWork() promotes ncclDevWorkStorageType[Fifo|Buf]->Args if the work can fit.
+      // finishPlan() promotes ncclDevWorkStorageType[Fifo|Persistent]->Args if the work can fit.
       plan->workStorageType = persistent ? ncclDevWorkStorageTypePersistent
                                          : ncclDevWorkStorageTypeFifo;
 
@@ -1554,10 +1789,15 @@ static ncclResult_t updateCollCostTable(
 
   for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
     if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetSupport != 1) continue;
+    // CollNetDirect is only supported for up to 8 local GPUs
+    if (a == NCCL_ALGO_COLLNET_DIRECT && comm->maxLocalRanks > NCCL_MAX_DIRECT_ARITY+1) continue;
     if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && nvlsSupport != 1 && info->func != ncclFuncAllGather) continue;
     if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue;
     /* now we only support single-node NVLS allgather and reducescatter */
     if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && comm->nNodes > 1) continue;
+    /* PAT reduceScatter doesn't support scaling (PreMulSum / SumPostDiv) yet */
+    if (a == NCCL_ALGO_PAT && info->func == ncclFuncReduceScatter
+        && (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv)) continue;
     for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
       bool backup;
       float time;
@@ -1601,6 +1841,8 @@ static ncclResult_t topoGetAlgoInfo(
   info->protocol = protocol;
   float time = minTime;
 
+  // Yes, we are first assigning and then testing if protocol is sane, but that's OK in this case.
+  // coverity[check_after_sink]
   if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) {
     if (backupAlgo == NCCL_ALGO_UNDEF || backupProto == NCCL_PROTO_UNDEF) {
       WARN("Error : no algorithm/protocol available");
@@ -1610,7 +1852,7 @@ static ncclResult_t topoGetAlgoInfo(
     info->protocol = backupProto;
     time = backupTime;
   }
-  if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", nBytes, info->algorithm, info->protocol, time);
+  if (comm->rank == 0) INFO(NCCL_TUNING, "%s: %ld Bytes -> Algo %d proto %d time %f", ncclFuncToString(info->func), nBytes, info->algorithm, info->protocol, time);
   if (simInfo) simInfo->estimatedTime = time;
   TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", nBytes, info->algorithm, info->protocol, time);
 
@@ -1653,6 +1895,7 @@ static ncclResult_t topoGetAlgoInfo(
   }
   nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt;
   if (info->algorithm == NCCL_ALGO_TREE) nt = NCCL_MAX_NTHREADS; // Tree now uses all threads always.
+  if (info->algorithm == NCCL_ALGO_PAT) nt = NCCL_MAX_NTHREADS;
   info->nMaxChannels = nc;
   info->nWarps = nt/WARP_SIZE;
   return ncclSuccess;
@@ -1704,8 +1947,15 @@ static ncclResult_t calcCollChunking(
     pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo;
     break;
   case ncclFuncReduceScatter:
+    pattern =
+      info->algorithm == NCCL_ALGO_PAT ? ncclPatternPatUp :
+      info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls :
+      info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect :
+      ncclPatternRing;
+    break;
   case ncclFuncAllGather:
     pattern =
+      info->algorithm == NCCL_ALGO_PAT ? ncclPatternPatDown :
       info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls :
       info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect :
       ncclPatternRing;
@@ -1729,6 +1979,8 @@ static ncclResult_t calcCollChunking(
   case ncclPatternTreeUp:
   case ncclPatternTreeDown:
   case ncclPatternTreeUpDown:
+  case ncclPatternPatUp:
+  case ncclPatternPatDown:
   case ncclPatternPipelineFrom:
   case ncclPatternPipelineTo:
   case ncclPatternCollnetChain:
@@ -1776,13 +2028,17 @@ static ncclResult_t calcCollChunking(
     int maxChunkSize = comm->nvlsChunkSize;
     if (comm->nNodes > 1 && comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768;
     if (chunkSize > maxChunkSize) chunkSize = maxChunkSize;
-    // Use uint64_t so that concurrentOps*chunkSize*X does not overflow
+    // Use uint64_t so that concurrentOps*chunkSize*X does not overflow.
+    // However, nChannels * comm->channels[0].nvls.nHeads should easily fit in 32 bits.
+    // coverity[overflow_before_widen]
     uint64_t concurrentOps = nChannels * comm->channels[0].nvls.nHeads;
     if ((nBytes < (64 * (concurrentOps * chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
     if ((nBytes < (8 * (concurrentOps * chunkSize))) && (chunkSize > 32768)) chunkSize = 32768;
     if ((nBytes < (2 * (concurrentOps * chunkSize))) && (chunkSize > 16384)) chunkSize = 16384;
   } else if (info->algorithm == NCCL_ALGO_NVLS_TREE) {
-    // Use uint64_t so that concurrentOps*chunkSize*X does not overflow
+    // Use uint64_t so that concurrentOps*chunkSize*X does not overflow.
+    // However, nChannels * comm->channels[0].nvls.nHeads should easily fit in 32 bits.
+    // coverity[overflow_before_widen]
     uint64_t concurrentOps = nChannels * comm->channels[0].nvls.nHeads;
     chunkSize = comm->nvlsChunkSize;
     int maxChunkSize = (int)ncclParamNvlsTreeMaxChunkSize();
@@ -1796,14 +2052,21 @@ static ncclResult_t calcCollChunking(
     int nNodes = comm->nNodes;
     float ppn = comm->nRanks / (float)nNodes;
     float nstepsLL128 = 1+log2i(nNodes) + 0.1*ppn;
+    // Yes, we are OK with the division on the left side of the < operator being integer.
+    // coverity[integer_division]
     while (nBytes / (nChannels*chunkSize) < nstepsLL128*64/ppn && chunkSize > 131072) chunkSize /= 2;
+    // coverity[integer_division]
     while (nBytes / (nChannels*chunkSize) < nstepsLL128*16/ppn && chunkSize > 32768) chunkSize /= 2;
+  } else if (info->func == ncclFuncAllGather && info->algorithm == NCCL_ALGO_PAT) {
+    while (chunkSize*nChannels*32 > nBytes && chunkSize > 65536) chunkSize /= 2;
+  } else if (info->func == ncclFuncReduceScatter && info->algorithm == NCCL_ALGO_PAT) {
+    while (chunkSize*nChannels*16 > nBytes && chunkSize > 65536) chunkSize /= 2;
   }
 
   // Compute directFlags of work struct.
   if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) {
     // Set direct direction for broadcast-gather (read or write)
-    *outDirectFlags = (nBytes/nChannels <= 1024*1024) ? NCCL_DIRECT_WRITE : NCCL_DIRECT_READ;
+    *outDirectFlags = (nBytes/nChannels <= 1024 * 4) ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
   } else {
     *outDirectFlags = 0;
   }
@@ -1852,6 +2115,10 @@ static ncclResult_t calcCollChunking(
     }
   }
 
+  if (pattern == ncclPatternPatUp || pattern == ncclPatternPatDown) {
+    proxyOp->nbytes = DIVUP(nBytes, nChannels);
+  }
+
   *outChunkSize = chunkSize;
   return ncclSuccess;
 }
@@ -1874,6 +2141,7 @@ static ncclResult_t hostToDevRedOp(
   opFull->proxyOp = op;
 
   int nbits = 8*ncclTypeSize(datatype);
+  if (nbits <= 0) return ncclInvalidArgument;
   uint64_t allBits = uint64_t(-1)>>(64-nbits);
   uint64_t signBit = allBits^(allBits>>1);
 
@@ -1947,8 +2215,12 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
 
     // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
     ncclGroupCommJoin(info->comm);
-    struct ncclTaskP2p* p2p = ncclMemoryStackAlloc<struct ncclTaskP2p>(&comm->memScoped);
+    struct ncclTaskP2p* p2p = ncclMemoryPoolAlloc<struct ncclTaskP2p>(&comm->memPool_ncclTaskP2p, &comm->memPermanent);
+    p2p->func = info->coll;
     p2p->buff = (void*)info->recvbuff;
+    p2p->count = info->count;
+    p2p->datatype = info->datatype;
+    p2p->root = info->root;
     p2p->bytes = nBytes;
     ncclIntruQueueEnqueue(
       isSendNotRecv ? &planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue,
@@ -1996,7 +2268,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
     } else {
       // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
       ncclGroupCommJoin(info->comm);
-      struct ncclTaskColl* t = ncclMemoryStackAlloc<struct ncclTaskColl>(&comm->memScoped);
+      struct ncclTaskColl* t = ncclMemoryPoolAlloc<struct ncclTaskColl>(&comm->memPool_ncclTaskColl, &comm->memPermanent);
       t->func = info->coll;
       t->sendbuff = info->sendbuff;
       t->recvbuff = info->recvbuff;
@@ -2026,7 +2298,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
     while (true) {
       if (l == nullptr) { // Got to the end, this must be a new stream.
         struct ncclCudaGraph graph;
-        NCCLCHECK(ncclCudaGetCapturingGraph(&graph, info->stream))
+        NCCLCHECK(ncclCudaGetCapturingGraph(&graph, info->stream));
         if (planner->streams != nullptr && !ncclCudaGraphSame(planner->capturingGraph, graph)) {
           WARN("Streams given to a communicator within a NCCL group must either be all uncaptured or all captured by the same graph.");
           return ncclInvalidUsage;
@@ -2075,7 +2347,7 @@ exit:
   NCCLCHECK(ncclGroupEndInternal());
   /* if depth is 1, ncclGroupEndInternal() will trigger group ops. The state can change
    * so we have to check state here. */
-  if (info->comm && !info->comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(info->comm, &ret)) };
+  if (info->comm && !info->comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(info->comm, &ret)); }
   return ret;
 fail:
   if (info->comm && !info->comm->config.blocking) (void) ncclCommSetAsyncError(info->comm, ret);
@@ -2093,7 +2365,8 @@ ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataTyp
     int cap = 2*comm->userRedOpCapacity;
     if (cap < 4) cap = 4;
     ncclUserRedOp *ops = new ncclUserRedOp[cap];
-    std::memcpy(ops, comm->userRedOps, comm->userRedOpCapacity*sizeof(ncclUserRedOp));
+    if (comm->userRedOpCapacity > 0)
+      std::memcpy(ops, comm->userRedOps, comm->userRedOpCapacity*sizeof(ncclUserRedOp));
     for(int ix=comm->userRedOpCapacity; ix < cap; ix++)
       ops[ix].freeNext = ix + 1;
     delete[] comm->userRedOps;
@@ -2109,8 +2382,10 @@ ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataTyp
   user->datatype = datatype;
   user->opFull.op = ncclDevPreMulSum;
   if (residence == ncclScalarHostImmediate) {
+    int size = ncclTypeSize(datatype);
+    if (size < 1) return ncclInternalError;
     user->opFull.scalarArgIsPtr = false;
-    std::memcpy(&user->opFull.scalarArg, scalar, ncclTypeSize(datatype));
+    std::memcpy(&user->opFull.scalarArg, scalar, size);
   } else {
     user->opFull.scalarArgIsPtr = true;
     user->opFull.scalarArg = reinterpret_cast<uint64_t>(scalar);
@@ -2127,6 +2402,10 @@ ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm) {
     WARN("ncclRedOpDestroy : operator is a NCCL builtin.");
     return ncclInvalidArgument;
   }
+  // int(ncclMaxRedOp) < int(op) will always be false due to the sizes of
+  // the datatypes involved, and that's by design.  We keep the check though
+  // just as a reminder.
+  // coverity[result_independent_of_operands]
   if (int(op) < 0 || int(ncclMaxRedOp) < int(op)) {
     WARN("ncclRedOpDestroy :  operator is garbage.");
     return ncclInvalidArgument;
diff --git a/src/graph/connect.cc b/src/graph/connect.cc
index b1b99d4e37..3f639a0226 100644
--- a/src/graph/connect.cc
+++ b/src/graph/connect.cc
@@ -226,6 +226,8 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
       }
     }
     channel->collnetDirect.nHeads = nHeads;
+    // nHeads should always be greater than 0.
+    // coverity[divide_by_zero]
     channel->collnetDirect.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously
     channel->collnetDirect.depth = (nUp == 0 && nDown == 0) ? 1 : 2;
     sprintf(line+strlen(line), "nUp %d nHeads %d ", nUp, nHeads);
@@ -374,20 +376,21 @@ NCCL_PARAM(UnpackDoubleNChannels, "UNPACK_DOUBLE_NCHANNELS", 1);
 
 ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent) {
   // Gather data from all ranks
-  int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads;
+  ncclResult_t ret = ncclSuccess;
+  int *ringRecv = NULL, *ringSend = NULL, *ringPrev = NULL, *ringNext = NULL, *treeToParent = NULL, *treeToChild0 = NULL, *treeToChild1 = NULL, *nvlsHeads = NULL;
   int nranks = comm->nRanks;
   int nNodes = comm->nNodes;
   int nChannels = comm->nChannels;
   int minHeadNum = INT_MAX;
   int shared = parent && parent->nvlsSupport  && parent->config.splitShare;
   NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&ringSend, nNodes*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS));
+  NCCLCHECKGOTO(ncclCalloc(&ringSend, nNodes*MAXCHANNELS), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&ringPrev, nranks*MAXCHANNELS), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&ringNext, nranks*MAXCHANNELS), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS), ret, fail);
 
   // Alternate rings to avoid crossing rails
   if (graphs[NCCL_ALGO_RING]->crossNic && (nChannels % 2) == 0) {
@@ -433,8 +436,8 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
   }
 
   // Connect rings and trees. This should also duplicate the channels.
-  NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext));
-  NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns));
+  NCCLCHECKGOTO(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext), ret, fail);
+  NCCLCHECKGOTO(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns), ret, fail);
 
   // Duplicate ringPrev/ringNext for ncclBuildRing
   memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
@@ -459,7 +462,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
       int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2);
       nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
     }
-    NCCLCHECK(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]));
+    NCCLCHECKGOTO(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]), ret, fail);
   }
 
   // Use 4 compute channels per search channel to reach peak BW on <8 PPN
@@ -493,7 +496,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
   if (comm->nChannels < comm->nvlsChannels) {
     nChannels = comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext);
   }
-  NCCLCHECK(connectNvls(comm, nvlsHeads, minHeadNum));
+  NCCLCHECKGOTO(connectNvls(comm, nvlsHeads, minHeadNum), ret, fail);
 #endif
   if (shared && comm->nChannels > parent->sharedRes->tpNChannels) {
     nChannels = comm->nChannels = parent->sharedRes->tpNChannels;
@@ -501,16 +504,18 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
   }
 
   // Create rings array and check all is fine
-  NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));
+  NCCLCHECKGOTO(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext), ret, fail);
 
-  free(ringRecv);
-  free(ringSend);
-  free(ringPrev);
-  free(ringNext);
-  free(treeToParent);
-  free(treeToChild0);
-  free(treeToChild1);
-  free(nvlsHeads);
-
-  return ncclSuccess;
+exit:
+  if (ringRecv) free(ringRecv);
+  if (ringSend) free(ringSend);
+  if (ringPrev) free(ringPrev);
+  if (ringNext) free(ringNext);
+  if (treeToParent) free(treeToParent);
+  if (treeToChild0) free(treeToChild0);
+  if (treeToChild1) free(treeToChild1);
+  if (nvlsHeads) free(nvlsHeads);
+  return ret;
+fail:
+  goto exit;
 }
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index 1380d24499..999312a0df 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -36,13 +36,13 @@ NCCL_PARAM(NvbDisable, "NVB_DISABLE", 0);
 static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) {
   if (baseNode->paths[baseNode->type] == NULL) {
     NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
+    for (int i=0; i<system->nodes[baseNode->type].count; i++) baseNode->paths[baseNode->type][i].type = PATH_DIS;
   }
 
   // breadth-first search to set all paths to that node in the system
   struct ncclTopoNodeList nodeList;
-  struct ncclTopoNodeList nextNodeList;
+  struct ncclTopoNodeList nextNodeList = { { 0 }, 0 };
   nodeList.count = 1; nodeList.list[0] = baseNode;
-  nextNodeList.count = 0;
   struct ncclTopoLinkList* basePath;
   NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
   basePath->count = 0;
@@ -116,9 +116,9 @@ static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* n
   const int linesize = 1024;
   char line[linesize];
 #ifdef ENABLE_TRACE
-  INFO(NCCL_GRAPH, "Paths from %s/%lX :", topoNodeTypeStr[node->type], node->id);
+  INFO(NCCL_GRAPH, "Paths from %s/%lx-%lx :", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id));
 #else
-  snprintf(line, linesize, "%s/%lX :", topoNodeTypeStr[node->type], node->id);
+  snprintf(line, linesize, "%s/%lx-%lx :", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id));
   int offset = strlen(line);
 #endif
   for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
@@ -155,14 +155,14 @@ ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system) {
   return ncclSuccess;
 }
 
-static ncclResult_t getLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu) {
+ncclResult_t ncclGetLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu) {
   // Find the closest CPU to a GPU
   int minHops = 0;
   int localCpu = -1;
   struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[gpu].paths[CPU];
   for (int c=0; c<system->nodes[CPU].count; c++) {
     int hops = paths[c].count;
-    if (minHops == 0 || hops < minHops) {
+    if (hops > 0 && (minHops == 0 || hops < minHops)) {
       localCpu = c;
       minHops = hops;
     }
@@ -193,20 +193,15 @@ static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix,
   return ncclSuccess;
 }
 
-// Remove/free paths for a given type
-static void ncclTopoRemovePathType(struct ncclTopoSystem* system, int nodeType) {
-  for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
-    // Remove links _to_ the given type
-    for (int n=0; n<system->nodes[t].count; n++) {
-      struct ncclTopoNode* node = system->nodes[t].nodes+n;
-      free(node->paths[nodeType]);
-      node->paths[nodeType] = NULL;
-    }
-    // Remove links _from_ the given type
-    for (int n=0; n<system->nodes[nodeType].count; n++) {
-      struct ncclTopoNode* node = system->nodes[nodeType].nodes+n;
-      free(node->paths[t]);
-      node->paths[t] = NULL;
+// Remove/free all paths: for every node of every type, free each of its per-type path arrays and reset the pointer to NULL.
+static void ncclTopoRemovePaths(struct ncclTopoSystem* system) {
+  for (int t1=0; t1<NCCL_TOPO_NODE_TYPES; t1++) {  // t1: type of the node owning the paths
+    for (int n=0; n<system->nodes[t1].count; n++) {
+      struct ncclTopoNode* node = system->nodes[t1].nodes+n;
+      for (int t2=0; t2<NCCL_TOPO_NODE_TYPES; t2++) {  // t2: index into node->paths[], one entry per node type
+        if (node->paths[t2]) free(node->paths[t2]);
+        node->paths[t2] = NULL;  // NULL marks the array as unallocated; ncclTopoSetPaths allocates only when it sees NULL
+      }
     }
   }
 }
@@ -220,6 +215,7 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
       if (str) {
         int disable = strtol(str, NULL, 0);
         if (disable == 1) l = 0;
+        if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %d", disableEnv, disable);
       }
     }
     if (l == -1) {
@@ -241,9 +237,9 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
           if (oldLevel > maxOldLevel) oldLevel = maxOldLevel;
           l = levelsOldToNew[oldLevel];
         }
+        if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %s", levelEnv, topoPathTypeStr[l]);
       }
     }
-    if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %s", levelEnv, topoPathTypeStr[l]);
     *level = l >= 0 ? l : -2;
   }
   return ncclSuccess;
@@ -252,16 +248,16 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
 NCCL_PARAM(IgnoreDisabledP2p, "IGNORE_DISABLED_P2P", 0);
 
 int ncclTopoUserP2pLevel = -1;
-ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank) {
+ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank) {
   *p2p = 0;
   if (read) *read = 0;
   if (intermediateRank) *intermediateRank = -1;
 
   // Get GPUs from topology
   int g1, g2;
-  NCCLCHECK(ncclTopoIdToIndex(system, GPU, id1, &g1));
+  NCCLCHECK(ncclTopoRankToIndex(system, rank1, &g1));
   struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1;
-  if (ncclTopoIdToIndex(system, GPU, id2, &g2) == ncclInternalError) {
+  if (ncclTopoRankToIndex(system, rank2, &g2) == ncclInternalError) {
     // GPU not found, we can't use p2p.
     return ncclSuccess;
   }
@@ -277,8 +273,13 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
     }
   }
 
-  // In general, use P2P whenever we can.
-  int p2pLevel = PATH_SYS;
+  // By default don't use P2P across CPU Host Bridges and further apart
+  int p2pLevel = PATH_PXB;
+
+  int arch, vendor, model;
+  NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
+  // On AMD x86 systems with at most 2 GPUs, allow P2P at any distance up to PATH_SYS
+  if ((arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD) && system->nodes[GPU].count <= 2) p2pLevel = PATH_SYS;
 
   // User override
   if (ncclTopoUserP2pLevel == -1)
@@ -288,16 +289,6 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
     goto compare;
   }
 
-  // Don't use P2P through ARM CPUs
-  int arch, vendor, model;
-  NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
-  if (arch == NCCL_TOPO_CPU_ARCH_ARM) p2pLevel = PATH_PXB;
-  if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
-    p2pLevel = PATH_PXB;
-  }
-  if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
-    p2pLevel = PATH_PXB;
-  }
 
 compare:
   // Compute the PCI distance and compare with the p2pLevel.
@@ -438,7 +429,7 @@ ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int
 NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 0);
 
 // Check whether going through the network would be faster than going through P2P/SHM.
-ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net) {
+ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net) {
   if (ncclParamNetDisableIntra() == 1) {
     *net = 0;
     return ncclSuccess;
@@ -446,8 +437,8 @@ ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_
   *net = 1;
   // First check the current GPU-to-GPU speed.
   int g1, g2;
-  if (ncclTopoIdToIndex(system, GPU, id1, &g1) != ncclSuccess ||
-      ncclTopoIdToIndex(system, GPU, id2, &g2) != ncclSuccess) {
+  if (ncclTopoRankToIndex(system, rank1, &g1) != ncclSuccess ||
+      ncclTopoRankToIndex(system, rank2, &g2) != ncclSuccess) {
     return ncclSuccess;
   }
 
@@ -545,7 +536,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
   // Precompute paths between GPUs/NICs.
 
   // Remove everything in case we're re-computing
-  for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);
+  ncclTopoRemovePaths(system);
 
   // Set direct paths to CPUs. We need them in many cases.
   for (int c=0; c<system->nodes[CPU].count; c++) {
@@ -571,11 +562,11 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
   for (int g=0; g<system->nodes[GPU].count; g++) {
     for (int p=0; p<system->nodes[GPU].count; p++) {
       int p2p;
-      NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL));
+      NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].gpu.rank, system->nodes[GPU].nodes[g].gpu.rank, &p2p, NULL, NULL));
       if (p2p == 0) {
         // Divert all traffic through the CPU
         int cpu;
-        NCCLCHECK(getLocalCpu(system, g, &cpu));
+        NCCLCHECK(ncclGetLocalCpu(system, g, &cpu));
         NCCLCHECK(addInterStep(system, CPU, cpu, GPU, p, GPU, g));
       }
     }
@@ -587,10 +578,10 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
       if (p == g) continue;
       struct ncclPeerInfo* srcInfo = comm->peerInfo+system->nodes[GPU].nodes[p].gpu.rank;
       int p2p;
-      NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo));
+      NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, comm, NULL, srcInfo, dstInfo));
       if (p2p == 0) {
         int shm;
-        NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo));
+        NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, comm, NULL, srcInfo, dstInfo));
         if (shm == 0) {
           // Mark this peer as inaccessible. We'll trim it later.
           system->nodes[GPU].nodes[p].paths[GPU][g].type = PATH_NET;
@@ -631,7 +622,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
         if (gdr == 0) {
           // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
           int localCpu;
-          NCCLCHECK(getLocalCpu(system, g, &localCpu));
+          NCCLCHECK(ncclGetLocalCpu(system, g, &localCpu));
           NCCLCHECK(addInterStep(system, CPU, localCpu, NET, n, GPU, g));
           NCCLCHECK(addInterStep(system, CPU, localCpu, GPU, g, NET, n));
         }
@@ -642,11 +633,13 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
 }
 
 ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm) {
+  ncclResult_t ret = ncclSuccess;
   int *domains;
-  int64_t *ids;
-  NCCLCHECK(ncclCalloc(&domains, system->nodes[GPU].count));
-  NCCLCHECK(ncclCalloc(&ids, system->nodes[GPU].count));
+  int64_t *ids = NULL;
   int myDomain = 0;
+  int ngpus = system->nodes[GPU].count;
+  NCCLCHECK(ncclCalloc(&domains, system->nodes[GPU].count));
+  NCCLCHECKGOTO(ncclCalloc(&ids, system->nodes[GPU].count), ret, fail);
   for (int g=0; g<system->nodes[GPU].count; g++) {
     struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
     domains[g] = g;
@@ -659,7 +652,6 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
     if (gpu->gpu.rank == comm->rank) myDomain = domains[g];
   }
 
-  int ngpus = system->nodes[GPU].count;
   for (int i=0; i<ngpus; i++) {
     if (domains[i] == myDomain) continue;
     struct ncclTopoNode* gpu = NULL;
@@ -670,24 +662,26 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
     }
     if (gpu == NULL) {
       WARN("Could not find id %lx", ids[i]);
-      free(domains);
-      free(ids);
-      return ncclInternalError;
+      ret = ncclInternalError;
+      goto fail;
     }
-    NCCLCHECK(ncclTopoRemoveNode(system, GPU, g));
+    NCCLCHECKGOTO(ncclTopoRemoveNode(system, GPU, g), ret, fail);
   }
 
   if (system->nodes[GPU].count == comm->nRanks) {
     for (int n=system->nodes[NET].count-1; n>=0; n--)
-      NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
+      NCCLCHECKGOTO(ncclTopoRemoveNode(system, NET, n), ret, fail);
   }
+exit:
   free(domains);
-  free(ids);
-  return ncclSuccess;
+  if (ids) free(ids);
+  return ret;
+fail:
+  goto exit;
 }
 
 void ncclTopoFree(struct ncclTopoSystem* system) {
-  for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);
+  ncclTopoRemovePaths(system);  // release every node's path arrays before the system struct itself is freed
   free(system);
 }
 
diff --git a/src/graph/rings.cc b/src/graph/rings.cc
index 53130d1290..5d967abb97 100644
--- a/src/graph/rings.cc
+++ b/src/graph/rings.cc
@@ -6,17 +6,23 @@
 
 #include "core.h"
 
-#define MAXWIDTH 20
-#define PREFIXLEN 15
-#define STRLENGTH (PREFIXLEN+5*MAXWIDTH)
 void dumpLine(int* values, int nranks, const char* prefix) {
-  int prefixlen = strlen(prefix);
-  char line[STRLENGTH+1];
-  line[STRLENGTH] = '\0';
-  memset(line, ' ', STRLENGTH);
-  strncpy(line, prefix, PREFIXLEN);
-  for (int i=0; i<nranks && i<MAXWIDTH; i++) sprintf(line+prefixlen+4*i, " %3d", values[i]);
-  INFO(NCCL_INIT,"%s", line);
+  constexpr int line_length = 128;  // capacity of line[], including the terminating NUL
+  char line[line_length];
+  int num_width = snprintf(nullptr, 0, "%d", nranks-1);  // size-0 call writes nothing and only returns the printed width of nranks-1; safe as per "man snprintf"
+  int n = snprintf(line, line_length, "%s", prefix);  // n is the length snprintf wanted to write, which may exceed what fit
+  for (int i = 0; i < nranks && n < line_length-1; i++) {  // stop once the buffer is full or has overflowed
+    n += snprintf(line + n, line_length - n, " %*d", num_width, values[i]);  // one space, then the value padded to num_width
+    // At this point n may be more than line_length-1 (snprintf returns the untruncated
+    // length), so don't use it for indexing into "line"; the loop condition ends the loop.
+  }
+  if (n >= line_length) {
+    // snprintf wanted to write more than would fit in the buffer. Assume
+    // line_length is at least 4 and overwrite the last 3 characters with "..."
+    // (followed by the terminating NUL) to indicate that the line was truncated.
+    snprintf(line+line_length-4, 4, "...");
+  }
+  INFO(NCCL_INIT, "%s", line);
 }
 
 ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
@@ -32,7 +38,7 @@ ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* p
       rings[r*nranks+i] = current;
       current = next[r*nranks+current];
     }
-    sprintf(prefix, "Channel %02d/%02d : ", r, nrings);
+    snprintf(prefix, sizeof(prefix), "Channel %02d/%02d :", r, nrings);
     if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix);
     if (current != rank) {
       WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
diff --git a/src/graph/search.cc b/src/graph/search.cc
index 7f16cb7697..ad6f580540 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -104,6 +104,9 @@ static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNod
       if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, link->type, &revLink));
       revBw += fwBw;
     }
+    // Coverity thinks that revLink could be NULL below.  However, we access it only if revBw is non-0, and the
+    // logic of the code is that revBw can become non-0 only if revLink is non-NULL (see the "if" statement right above).
+    // coverity[var_deref_op]
     if (link->bw < fwBw || (revBw && revLink->bw < revBw)) { *steps = step; return ncclSuccess; }
     SUB_ROUND(link->bw, fwBw);
     if (revBw) SUB_ROUND(revLink->bw, revBw);
@@ -444,6 +447,7 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
 // 2. add other NETs satisfying typeInter but not already in the list.
 
 ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) {
+  ncclResult_t ret = ncclSuccess;
   int netCount = 0;
   int localNetCount;
   int* localNets;
@@ -456,8 +460,8 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
     struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
     for (int c = 0; c<MAXCHANNELS; c++) {
       int64_t netId;
-      NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL));
-      NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount));
+      NCCLCHECKGOTO(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL), ret, fail);
+      NCCLCHECKGOTO(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount), ret, fail);
       if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break;
       localNetCount++;
     }
@@ -491,12 +495,15 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
   }
 
   *netCountRet = netCount;
+exit:
   free(localNets);
-
-  return ncclSuccess;
+  return ret;
+fail:
+  goto exit;
 }
 
 ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time) {
+  ncclResult_t ret = ncclSuccess;
   if ((*time) <= 0) return ncclSuccess;
   (*time)--;
 
@@ -518,6 +525,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
   }
   graph->intra[graph->nChannels*ngpus+step] = gpu->gpu.rank;
   int g = gpu - system->nodes[GPU].nodes;
+  int* nets = NULL;
   if (step == backToNet) {
     // first get back to NIC
     if (system->nodes[NET].count) {
@@ -525,15 +533,17 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
       NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex));
       struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex;
       int netCount;
-      int* nets;
       NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
-      NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount));
+      NCCLCHECKGOTO(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount), ret, fail);
       for (int i=0; i<netCount; i++) {
         int n = nets[i];
         struct ncclTopoNode* net = system->nodes[NET].nodes+n;
         if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric
-        if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
-        if (graph->crossNic && (graph->nChannels & 1) && net->id != graph->inter[(graph->nChannels-1)*2]) continue;
+        if (graph->pattern == NCCL_TOPO_PATTERN_RING && graph->crossNic == 2) {
+          if (graph->nChannels & 1 && net->id != graph->inter[(graph->nChannels-1)*2]) continue;
+        } else {
+          if (graph->crossNic == 0 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
+        }
 
         // Balanced Tree : count half of the bandwidth on first two GPUs
         int nextBackToNet = -1;
@@ -545,18 +555,17 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
           graph->bwInter /= 2;
         }
 
-        NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net));
+        NCCLCHECKGOTO(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net), ret, fail);
         graph->bwInter = bwInterSave;
         if (net) {
           graph->inter[graph->nChannels*2+1] = net->id;
-          NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time));
+          NCCLCHECKGOTO(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time), ret, fail);
 
           if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->bwInter /= 2;
-          NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net));
+          NCCLCHECKGOTO(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net), ret, fail);
           graph->bwInter = bwInterSave;
         }
       }
-      free(nets);
     }
   } else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
     NCCLCHECK(ncclTopoSearchTryNvls(system, graph, saveGraph, g, ngpus, time));
@@ -592,23 +601,29 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
     // Next path
     NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time));
   }
-  return ncclSuccess;
+exit:
+  if (nets) free(nets);
+  return ret;
+fail:
+  goto exit;
 }
 
 ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) {
+  ncclResult_t ret = ncclSuccess;
   const int bw = graph->bwInter;
   int* nets;
   NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
   int netCount;
   int graphFound = 0;
-  NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount));
+  NCCLCHECKGOTO(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount), ret, fail);
   for (int i=0; i<netCount; i++) {
-    if ((graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) && graphFound) continue;
+    if ((graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) && graphFound) break;
     int n = nets[(graph->nChannels+i)%netCount];
     struct ncclTopoNode* net = system->nodes[NET].nodes+n;
     if (graph->collNet && net->net.collSupport == 0) continue;
     if (net->net.bw < bw) continue;
-    if (graph->crossNic && (graph->nChannels & 1) && net->id != graph->inter[(graph->nChannels-1)*2+1]) continue;
+    if (graph->pattern == NCCL_TOPO_PATTERN_RING && graph->crossNic == 2
+        && (graph->nChannels & 1) && net->id != graph->inter[(graph->nChannels-1)*2+1]) continue;
 
     graph->inter[graph->nChannels*2] = net->id;
     graph->latencyInter = net->net.latency;
@@ -624,31 +639,34 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
       // NVLS search only tries to find NIC:GPU combinations to compute the heads.
       if (graph->nChannels < netCount) {
         int gpu;
-        int duplicate = 0;
-        NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu));
-        // check whether there is duplicate head when one GPU connects with multiple NICs
-        for (int gc = 0; gc < graph->nChannels; gc++) {
-          if (graph->intra[gc * system->nodes[GPU].count] == system->nodes[GPU].nodes[gpu].gpu.rank) {
-            duplicate = 1;
-            break;
+        NCCLCHECKGOTO(ncclTopoGetLocalGpu(system, net->id, &gpu), ret, fail);
+        if (gpu != -1) {
+          int duplicate = 0;
+          // check whether there is duplicate head when one GPU connects with multiple NICs
+          for (int gc = 0; gc < graph->nChannels; gc++) {
+            if (graph->intra[gc * system->nodes[GPU].count] == system->nodes[GPU].nodes[gpu].gpu.rank) {
+              duplicate = 1;
+              break;
+            }
+          }
+          if (!duplicate) {
+            NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu), ret, fail);
+            graphFound = 1;
           }
         }
-        if (duplicate) continue;
-        if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu));
-        graphFound = 1;
       }
     } else {
       if (graph->nChannels > 0) {
         // Try to replay the last channel
         int g;
-        NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
-        NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
+        NCCLCHECKGOTO(ncclTopoReplayGetGpu(system, graph, -1, &g), ret, fail);
+        NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g), ret, fail);
       }
       if (graph->nChannels == 0 || graph->sameChannels == 0) {
         if (graph->nChannels == 0 && system->nodes[NVS].count == 0) {
           // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long
           int t = 1 << 10;
-          NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0));
+          NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0), ret, fail);
           if (t == -1) *time = -1;
         }
 
@@ -660,7 +678,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
           if (paths[g].bw > maxBw) {
             maxBw = paths[g].bw;
             minHops = paths[g].count;
-          } else if (paths[g].bw == maxBw && paths[g].count < minHops) {
+          } else if (paths[g].bw == maxBw && paths[g].count > 0 && paths[g].count < minHops) {
             minHops = paths[g].count;
           }
         }
@@ -668,7 +686,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
           for (int i=0; i<system->nodes[GPU].count; i++) {
             int g = (graph->nChannels+i)%system->nodes[GPU].count;
             if (paths[g].bw == maxBw && paths[g].count == minHops) {
-              NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
+              NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g), ret, fail);
             }
           }
         }
@@ -682,8 +700,11 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
       }
     }
   }
+exit:
   free(nets);
-  return ncclSuccess;
+  return ret;
+fail:
+  goto exit;
 }
 
 /* Search Patterns
@@ -1040,9 +1061,10 @@ search:
     }
     tmpGraph.typeInter = PATH_PIX;
 
-    if (crossNic == 2 && tmpGraph.crossNic == 0) {
+    if (crossNic == 2 && tmpGraph.crossNic == 0
+        && (graph->pattern == NCCL_TOPO_PATTERN_RING || graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE)) {
       // Try again with crossNic if permitted
-      tmpGraph.crossNic = 1;
+      tmpGraph.crossNic = 2;
       goto search;
     }
     tmpGraph.crossNic = crossNic == 1 ? 1 : 0;
@@ -1112,7 +1134,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
     sprintf(line, "%2d :", c);
     int offset = strlen(line);
     if (system->nodes[NET].count > 0) {
-      sprintf(line+offset, " %s/%lx", topoNodeTypeStr[NET], graph->inter[2*c]);
+      sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c]));
       offset = strlen(line);
     }
     for (int i=0; i<ngpus; i++) {
@@ -1120,7 +1142,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
       offset = strlen(line);
     }
     if (system->nodes[NET].count > 0) {
-      sprintf(line+offset, " %s/%lx", topoNodeTypeStr[NET], graph->inter[2*c+1]);
+      sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c+1]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c+1]));
       offset = strlen(line);
     }
     INFO(NCCL_GRAPH, "%s", line);
@@ -1129,16 +1151,20 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
 }
 
 ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs) {
+  ncclResult_t ret = ncclSuccess;
   const char* str = ncclGetEnv("NCCL_GRAPH_DUMP_FILE");
+  struct ncclXml* xml = NULL;
   if (str) {
     INFO(NCCL_ENV, "NCCL_GRAPH_DUMP_FILE set by environment to %s", str);
-    struct ncclXml* xml;
     NCCLCHECK(xmlAlloc(&xml, NCCL_GRAPH_XML_MAX_NODES));
-    NCCLCHECK(ncclTopoGetXmlFromGraphs(ngraphs, graphs, system, xml));
-    NCCLCHECK(ncclTopoDumpXmlToFile(str, xml));
-    free(xml);
+    NCCLCHECKGOTO(ncclTopoGetXmlFromGraphs(ngraphs, graphs, system, xml), ret, fail);
+    NCCLCHECKGOTO(ncclTopoDumpXmlToFile(str, xml), ret, fail);
   }
-  return ncclSuccess;
+exit:
+  if (xml) free(xml);
+  return ret;
+fail:
+  goto exit;
 }
 
 #include "comm.h"
diff --git a/src/graph/topo.cc b/src/graph/topo.cc
index d6af9282e5..9771ae05cb 100644
--- a/src/graph/topo.cc
+++ b/src/graph/topo.cc
@@ -192,6 +192,7 @@ int getBcmGen(uint64_t id, int level) {
   return 0;
 }
 ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
+  ncclResult_t ret = ncclSuccess;
   for (int s=0; s<system->nodes[PCI].count; s++) {
     struct ncclTopoNode* pciSwitch = system->nodes[PCI].nodes+s;
     int gen = getBcmGen(pciSwitch->pci.device, 0);
@@ -217,7 +218,7 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
       for (int s=0; s<subs; s++) {
         // Find sub switch (system->nodes[PCI].nodes is changing every time we remove a node)
         int index;
-        NCCLCHECK(ncclTopoIdToIndex(system, PCI, subSwIds[s], &index));
+        NCCLCHECKGOTO(ncclTopoIdToIndex(system, PCI, subSwIds[s], &index), ret, fail);
         struct ncclTopoNode* sub = system->nodes[PCI].nodes+index;
         // Connect all sub PCI devices to the parent switch
         for (int l=0; l<sub->nlinks; l++) {
@@ -226,7 +227,8 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
           // Add link from parent PCI switch -> PCI device
           if (pciSwitch->nlinks == NCCL_TOPO_MAX_LINKS) {
             WARN("Error : too many Topo links (max %d)", NCCL_TOPO_MAX_LINKS);
-            return ncclInternalError;
+            ret = ncclInternalError;
+            goto fail;
           }
           memcpy(pciSwitch->links+pciSwitch->nlinks, sub->links+l, sizeof(struct ncclTopoLink));
           pciSwitch->nlinks++;
@@ -238,16 +240,20 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
             }
           }
         }
-        NCCLCHECK(ncclTopoRemoveNode(system, PCI, index));
+        NCCLCHECKGOTO(ncclTopoRemoveNode(system, PCI, index), ret, fail);
       }
       // Set subdevice to 0xffff to make sure we don't merge this switch again.
       pciSwitch->pci.device |= 0xffff;
       free(subSwIds);
       // Restart, as system->nodes[PCI].nodes has changed.
       s = 0;
+      continue;
+fail:
+      free(subSwIds);
+      return ret;
     }
   }
-  return ncclSuccess;
+  return ret;
 }
 
 ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
@@ -281,7 +287,7 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
   for (int l=0; l<node->nlinks; l++) {
     struct ncclTopoLink* link = node->links+l;
     if (link->type == LINK_LOC) {
-      sprintf(line+offset, "+ %s[%2.1f] - %s/%lX", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[link->remNode->type], link->remNode->id);
+      sprintf(line+offset, "+ %s[%2.1f] - %s/%lx-%lx", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id));
       INFO(NCCL_GRAPH, "%s", line);
     } else if (link->type != LINK_PCI || link->remNode != prevNode) {
       sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->bw);
@@ -290,9 +296,9 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
         NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset));
       } else {
         if (link->remNode->type == NET) {
-          sprintf(line+nextOffset, "%s/%lX (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.bw);
+          sprintf(line+nextOffset, "%s/%lx-%lx (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id), link->remNode->net.asic, link->remNode->net.port, link->remNode->net.bw);
         } else {
-          sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id);
+          sprintf(line+nextOffset, "%s/%lx-%lx", topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id));
         }
         INFO(NCCL_GRAPH, "%s", line);
       }
@@ -720,84 +726,87 @@ ncclResult_t ncclTopoRefreshBcmP2pLinks(void) {
 }
 
 ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
+  ncclResult_t ret = ncclSuccess;
   struct ncclXml* xml;
+  char* mem = NULL;
+  int* localRanks = NULL;
+  int netDevCount = 0;
+  struct ncclXml* rankXml;
+  int localRank = -1, nLocalRanks = 0;
   NCCLCHECK(xmlAlloc(&xml, NCCL_TOPO_XML_MAX_NODES));
   const char* xmlTopoFile = ncclGetEnv("NCCL_TOPO_FILE");
   if (xmlTopoFile) {
     INFO(NCCL_ENV, "NCCL_TOPO_FILE set by environment to %s", xmlTopoFile);
-    NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml, 1));
+    NCCLCHECKGOTO(ncclTopoGetXmlFromFile(xmlTopoFile, xml, 1), ret, fail);
   } else {
     // Try default XML topology location
-    NCCLCHECK(ncclTopoGetXmlFromFile("/var/run/nvidia-topologyd/virtualTopology.xml", xml, 0));
+    NCCLCHECKGOTO(ncclTopoGetXmlFromFile("/var/run/nvidia-topologyd/virtualTopology.xml", xml, 0), ret, fail);
   }
   if (xml->maxIndex == 0) {
     // Create top tag
     struct ncclXmlNode* top;
-    NCCLCHECK(xmlAddNode(xml, NULL, "system", &top));
-    NCCLCHECK(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION));
+    NCCLCHECKGOTO(xmlAddNode(xml, NULL, "system", &top), ret, fail);
+    NCCLCHECKGOTO(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION), ret, fail);
   }
 
-  NCCLCHECK(ncclTopoRefreshBcmP2pLinks());
+  NCCLCHECKGOTO(ncclTopoRefreshBcmP2pLinks(), ret, fail);
 
   // Detect only the GPU managed by this process.  We'll get any others through XML fusion.
   char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-  NCCLCHECK(int64ToBusId(comm->peerInfo[comm->rank].busId, busId));
+  NCCLCHECKGOTO(int64ToBusId(comm->peerInfo[comm->rank].busId, busId), ret, fail);
   struct ncclXmlNode* node;
-  NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
+  NCCLCHECKGOTO(ncclTopoFillGpu(xml, busId, &node), ret, fail);
   if (node) {
-    NCCLCHECK(xmlSetAttrInt(node, "keep", 1));
-    NCCLCHECK(xmlSetAttrInt(node, "rank", comm->rank));
-    NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[comm->rank].gdrSupport));
+    NCCLCHECKGOTO(xmlSetAttrInt(node, "keep", 1), ret, fail);
+    NCCLCHECKGOTO(xmlSetAttrInt(node, "rank", comm->rank), ret, fail);
+    NCCLCHECKGOTO(xmlInitAttrInt(node, "gdr", comm->peerInfo[comm->rank].gdrSupport), ret, fail);
   }
   // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
   // so we start with collnet so that it has precedence.
-  int netDevCount = 0;
   if (collNetSupport(comm)) {
-    NCCLCHECK(collNetDevices(comm, &netDevCount));
+    NCCLCHECKGOTO(collNetDevices(comm, &netDevCount), ret, fail);
     for (int n=0; n<netDevCount; n++) {
       ncclNetProperties_t props;
-      NCCLCHECK(collNetGetProperties(comm, n, &props));
+      NCCLCHECKGOTO(collNetGetProperties(comm, n, &props), ret, fail);
       struct ncclXmlNode* netNode;
-      NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
-      NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
-      NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
-      NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
-      NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
-      NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
-      NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
+      NCCLCHECKGOTO(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode), ret, fail);
+      NCCLCHECKGOTO(xmlSetAttrInt(netNode, "keep", 1), ret, fail);
+      NCCLCHECKGOTO(xmlSetAttrInt(netNode, "dev", n), ret, fail);
+      NCCLCHECKGOTO(xmlInitAttrInt(netNode, "speed", props.speed), ret, fail);
+      NCCLCHECKGOTO(xmlInitAttrInt(netNode, "port", props.port), ret, fail);
+      NCCLCHECKGOTO(xmlInitAttrUint64(netNode, "guid", props.guid), ret, fail);
+      NCCLCHECKGOTO(xmlInitAttrInt(netNode, "maxconn", props.maxComms), ret, fail);
       bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
       INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
-      NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
-      NCCLCHECK(xmlInitAttrInt(netNode, "coll", 1));
+      NCCLCHECKGOTO(xmlInitAttrInt(netNode, "gdr", gdrSupport), ret, fail);
+      NCCLCHECKGOTO(xmlInitAttrInt(netNode, "coll", 1), ret, fail);
     }
   }
   if (netDevCount == 0) {
-    NCCLCHECK(comm->ncclNet->devices(&netDevCount));
+    NCCLCHECKGOTO(comm->ncclNet->devices(&netDevCount), ret, fail);
   }
   for (int n=0; n<netDevCount; n++) {
     ncclNetProperties_t props;
-    NCCLCHECK(comm->ncclNet->getProperties(n, &props));
+    NCCLCHECKGOTO(comm->ncclNet->getProperties(n, &props), ret, fail);
     comm->netDeviceType = props.netDeviceType;
     struct ncclXmlNode* netNode;
-    NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
-    NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
-    NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
-    NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
-    NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
-    NCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency));
-    NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
-    NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
+    NCCLCHECKGOTO(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode), ret, fail);
+    NCCLCHECKGOTO(xmlSetAttrInt(netNode, "keep", 1), ret, fail);
+    NCCLCHECKGOTO(xmlSetAttrInt(netNode, "dev", n), ret, fail);
+    NCCLCHECKGOTO(xmlInitAttrInt(netNode, "speed", props.speed), ret, fail);
+    NCCLCHECKGOTO(xmlInitAttrInt(netNode, "port", props.port), ret, fail);
+    NCCLCHECKGOTO(xmlInitAttrFloat(netNode, "latency", props.latency), ret, fail);
+    NCCLCHECKGOTO(xmlInitAttrUint64(netNode, "guid", props.guid), ret, fail);
+    NCCLCHECKGOTO(xmlInitAttrInt(netNode, "maxconn", props.maxComms), ret, fail);
     bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
     INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
-    NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
+    NCCLCHECKGOTO(xmlInitAttrInt(netNode, "gdr", gdrSupport), ret, fail);
   }
 
   // Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
-  NCCLCHECK(ncclTopoTrimXml(xml));
+  NCCLCHECKGOTO(ncclTopoTrimXml(xml), ret, fail);
 
   // XML topo fusion.
-  int* localRanks;
-  int localRank = -1, nLocalRanks = 0;
   if (comm->MNNVL) {
     // MNNVL clique support
     nLocalRanks = comm->clique.size;
@@ -805,7 +814,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
     localRanks = comm->clique.ranks;
   } else {
     // Intra-node fusion.  Much of the comm is not initialized yet at this point so we need to do our own calculations.
-    NCCLCHECK(ncclCalloc(&localRanks, comm->nRanks));
+    NCCLCHECKGOTO(ncclCalloc(&localRanks, comm->nRanks), ret, fail);
     for (int i = 0; i < comm->nRanks; i++) {
       if (comm->peerInfo[i].hostHash == comm->peerInfo[comm->rank].hostHash) {
         if (i == comm->rank)
@@ -814,37 +823,42 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
       }
     }
   }
-  char* mem;
-  NCCLCHECK(ncclCalloc(&mem, nLocalRanks * xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
-  struct ncclXml* rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*localRank);
+  NCCLCHECKGOTO(ncclCalloc(&mem, nLocalRanks * xmlMemSize(NCCL_TOPO_XML_MAX_NODES)), ret, fail);
+  rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*localRank);
   memcpy(rankXml, xml, xmlMemSize(NCCL_TOPO_XML_MAX_NODES));
-  NCCLCHECK(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1));
-  NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, localRanks, localRank, nLocalRanks, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
+  NCCLCHECKGOTO(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1), ret, fail);
+  // nLocalRanks can't actually be 0, or we wouldn't be running at all...
+  // coverity[divide_by_zero]
+  NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, localRanks, localRank, nLocalRanks, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)), ret, fail);
   if (comm->MNNVL) {
     // Ensure that we have enough room when fusing topos from multiple nodes.
     free(xml);
-    NCCLCHECK(xmlAlloc(&xml, nLocalRanks*NCCL_TOPO_XML_MAX_NODES));
+    xml = NULL;
+    NCCLCHECKGOTO(xmlAlloc(&xml, nLocalRanks*NCCL_TOPO_XML_MAX_NODES), ret, fail);
   } else {
     // In the intra-node case there's no need to enlarge the topo xml.
     xml->maxIndex = 0;
-    free(localRanks);
   }
   for (int i = 0; i < nLocalRanks; i++) {
     struct ncclXml* peerXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*i);
-    NCCLCHECK(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0));
-    NCCLCHECK(ncclTopoFuseXml(xml, peerXml));
+    NCCLCHECKGOTO(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0), ret, fail);
+    NCCLCHECKGOTO(ncclTopoFuseXml(xml, peerXml), ret, fail);
   }
-  free(mem);
 
   xmlTopoFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
   if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
     INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
-    NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml));
+    NCCLCHECKGOTO(ncclTopoDumpXmlToFile(xmlTopoFile, xml), ret, fail);
   }
 
-  NCCLCHECK(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash));
+  NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash), ret, fail);
+exit:
+  if (!comm->MNNVL && localRanks) free(localRanks);
+  if (mem) free(mem);
   free(xml);
-  return ncclSuccess;
+  return ret;
+fail:
+  goto exit;
 }
 
 ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int** locals, int* localCount, int* pathType) {
@@ -853,6 +867,7 @@ ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index
   int count = 0;
   NCCLCHECK(ncclCalloc(locals, system->nodes[resultType].count));
   struct ncclTopoLinkList* paths = system->nodes[type].nodes[index].paths[resultType];
+  if (paths == NULL) { *localCount = 0; return ncclSuccess; }
   for (int i=0; i<system->nodes[resultType].count; i++) {
     if (paths[i].bw > maxBw || (paths[i].bw == maxBw && paths[i].type < minType)) {
       maxBw = paths[i].bw;
@@ -891,6 +906,7 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c
 }
 
 ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) {
+  ncclResult_t ret = ncclSuccess;
   int gpu;
   NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu));
   int* localNets;
@@ -898,39 +914,46 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch
   NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL));
   int* localGpus = NULL;
   int localGpuCount;
-  NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL));
-  int net = system->nodes[GPU].nodes[gpu].gpu.dev;
+  int net;
+  NCCLCHECKGOTO(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL), ret, fail);
+  net = system->nodes[GPU].nodes[gpu].gpu.dev;
   if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount);
   net += channelId%(DIVUP(localNetCount,localGpuCount));
   if (id) *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id;
   if (dev) *dev = system->nodes[NET].nodes[localNets[net%localNetCount]].net.dev;
+exit:
   free(localNets);
-  free(localGpus);
-  return ncclSuccess;
+  if (localGpus) free(localGpus);
+  return ret;
+fail:
+  goto exit;
 }
 
 ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex) {
+  ncclResult_t ret = ncclSuccess;
   int netIndex;
   NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &netIndex));
   int* localGpus = NULL;
   int localGpuCount;
+  int foundGpu = -1;
   NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, &localGpus, &localGpuCount, NULL));
   for (int c=0; c<MAXCHANNELS; c++) {
     for (int lg=0; lg<localGpuCount; lg++) {
       int g = localGpus[lg];
       struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
       int64_t id;
-      NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL));
+      NCCLCHECKGOTO(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL), ret, fail);
       if (netId == id) {
-        *gpuIndex = g;
-        free(localGpus);
-        return ncclSuccess;
+        foundGpu = g;
+        goto exit;
       }
     }
   }
+exit:
+  *gpuIndex = foundGpu;
+fail:
   free(localGpus);
-  *gpuIndex = -1;
-  return ncclSuccess;
+  return ret;
 }
 
 /****************************/
@@ -948,25 +971,11 @@ NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
 
 ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity) {
   struct ncclTopoNode* cpu = NULL, *gpu = NULL;
-  for (int g=0; g<system->nodes[GPU].count; g++) {
-    if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
-      gpu = system->nodes[GPU].nodes+g;
-      // Find closer CPU
-      int cpuIndex = -1, minHops = 0;
-      for (int c=0; c<system->nodes[CPU].count; c++) {
-        int nHops = system->nodes[GPU].nodes[g].paths[CPU][c].count;
-        if (cpuIndex == -1 || nHops < minHops) {
-          cpuIndex = c;
-          minHops = nHops;
-        }
-      }
-      cpu = system->nodes[CPU].nodes+cpuIndex;
-    }
-  }
-  if (cpu == NULL) {
-    WARN("Set CPU affinity : unable to find GPU/CPU for rank %d", rank);
-    return ncclInternalError;
-  }
+  int gpuIndex, cpuIndex;
+  NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpuIndex));
+  NCCLCHECK(ncclGetLocalCpu(system, gpuIndex, &cpuIndex));
+  gpu = system->nodes[GPU].nodes+gpuIndex;
+  cpu = system->nodes[CPU].nodes+cpuIndex;
 
   // Query the CPU affinity set we were provided
   cpu_set_t mask;
diff --git a/src/graph/topo.h b/src/graph/topo.h
index 6613f3271d..0837fb4b38 100644
--- a/src/graph/topo.h
+++ b/src/graph/topo.h
@@ -30,7 +30,7 @@
 // to GPU traffic consumes more PCI bandwidth.
 #define INTEL_P2P_OVERHEAD(bw) (bw*6/5)
 
-#define NCCL_TOPO_NODE_TYPES 7
+#define NCCL_TOPO_NODE_TYPES 6
 #define GPU 0
 #define PCI 1
 #define NVS 2
@@ -103,9 +103,10 @@ struct ncclTopoLinkList {
 
 #define NCCL_TOPO_UNDEF (-1)
 
+#define NCCL_TOPO_ID_LOCAL_ID_MASK 0x00ffffffffffffff
 #define NCCL_TOPO_ID_SYSTEM_ID(id) (id >> 56)
-#define NCCL_TOPO_ID_LOCAL_ID(id) (id & 0x00ffffffffffffff)
-#define NCCL_TOPO_ID(systemid, localid) (((int64_t)systemid << 56) + localid)
+#define NCCL_TOPO_ID_LOCAL_ID(id) ((id) & NCCL_TOPO_ID_LOCAL_ID_MASK) // keep only the bits selected by the local-id mask
+#define NCCL_TOPO_ID(systemid, localid) (((int64_t)(systemid) << 56) + ((localid) & NCCL_TOPO_ID_LOCAL_ID_MASK)) // system id in bits 56 and up, masked local id below; arguments parenthesized so expression arguments expand correctly
 
 struct ncclTopoNode {
   int type;
diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc
index f9d814a25b..f0a6224528 100644
--- a/src/graph/tuning.cc
+++ b/src/graph/tuning.cc
@@ -54,7 +54,7 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li
 // Latencies in us, Bandwidths in GB/s
 // Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
 static const float baseLat  [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {
-       {  6.8, 14.0,    0 }, {  6.6, 14.0,  8.4 },  // Tree, Ring
+       {  6.8, 14.0,  8.4 }, {  6.6, 14.0,  8.4 },  // Tree, Ring
        {    0,    0,    0 }, {    0,    0,    0 },  // Collnet Direct, Chain
        {    0,    0,    0 }, {    0,    0,    0 }}; // NVLS, NVLS Tree
 
@@ -64,15 +64,15 @@ static const float baseLat  [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {
 #define NCCL_HW_NET 2
 static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
 { /* NVLINK */
-  { /* Tree (LL/LL128/Simple)*/ { .6, 1.25, 28 }, /* Ring (LL/LL128/Simple)*/ { .6, 1.9, 3.4 },
+  { /* Tree (LL/LL128/Simple)*/ { .6, 1.25, 4.0 }, /* Ring (LL/LL128/Simple)*/ { .6, 1.9, 3.4 },
     /* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 },
     /* NVLS */ { 0, 0, 25 }, /* NVLSTree */ { 0, 0, 25 } },
   /* PCI */
-  { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 },
+  { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 4.0 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 },
     /* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 },
     /* NVLS */ { 0, 0, 0 }, /* NVLSTree */ { 0, 0, 0 } },
   /* NET */
-  { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 28 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 14.0 },
+  { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 14 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 14.0 },
     /* CollNetDirect (Simple)*/ { 0, 0, 31 }, /* CollNetChain (Simple)*/ { 0, 0, 30 },
     /* NVLS */ { 0, 0, 18 }, /* NVLSTree */ { 0, 0, 14 } }
 };
@@ -105,6 +105,15 @@ static const double perChMaxTreeBws[3][3] = {
   /* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0},
 };
 
+NCCL_PARAM(PatEnable, "PAT_ENABLE", 2); // 2 is handled as "decide automatically" by ncclPatEnable(); presumably also the default value — confirm against NCCL_PARAM
+static int ncclPatEnable(struct ncclComm* comm) { // Returns 0 when the PAT algorithm must be skipped for this communicator
+  int patEnable = ncclParamPatEnable();
+  if (patEnable != 2) return patEnable; // Any explicit setting other than 2 is returned unchanged, bypassing the checks below
+  if (comm->nNodes != comm->nRanks) return 0; // PAT only supports 1 GPU per node
+  if (comm->netDeviceType != NCCL_NET_DEVICE_HOST) return 0;   // PAT doesn't support net device offload
+  return 1;
+}
+
 // Network post overhead in ns (1000 = 1 us)
 NCCL_PARAM(NetOverhead, "NET_OVERHEAD", -2);
 
@@ -146,7 +155,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
   double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2];
   // De-penalize Tree/Simple latency on Power systems to favor Tree than Ring
   if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
-  float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
+  float ppn = (float)nRanks / nNodes;
 
   int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
   for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
@@ -156,18 +165,18 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
     int nsteps = coll == ncclFuncAllReduce ? 2*(nRanks-1) :
       coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nRanks-1 :
       nRanks;
-    int nInterSteps = coll == ncclFuncAllReduce ? (nNodes > 1 ? 2*nNodes :0) :
-      coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nNodes-1 :
-      nNodes;
 
     for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
-      if (coll == ncclFuncBroadcast && a != NCCL_ALGO_RING) continue;
-      if (coll == ncclFuncReduce && a != NCCL_ALGO_RING) continue;
-      if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
-      if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
+      if ((coll == ncclFuncBroadcast || coll == ncclFuncReduce) && a != NCCL_ALGO_RING) continue;
+      if ((coll == ncclFuncReduceScatter || coll == ncclFuncAllGather)
+          && a != NCCL_ALGO_PAT && a != NCCL_ALGO_RING
+          && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
+      if (coll == ncclFuncAllReduce && a == NCCL_ALGO_PAT) continue;
 
       for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
         if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && p != NCCL_PROTO_SIMPLE) continue;
+        if ((coll == ncclFuncReduceScatter || coll == ncclFuncAllGather)
+            && a == NCCL_ALGO_PAT && (p != NCCL_PROTO_SIMPLE || ncclPatEnable(comm) == 0)) continue;
         int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
         float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
         if (a == NCCL_ALGO_NVLS) bw = std::min(graphs[a]->bwIntra, graphs[a]->bwInter);
@@ -176,11 +185,12 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
 
         // Various model refinements
         if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * .5); }
-        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw);
-        if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
+        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw);
+        if (a == NCCL_ALGO_TREE && coll == ncclFuncAllReduce) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
         if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
         if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), graphs[a]->nChannels*perChMaxTreeLL128Bw);
         if (a == NCCL_ALGO_TREE && graphs[a]->pattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85;
+        if (a == NCCL_ALGO_PAT) busBw *= .85;
         if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0;  // Not used
         if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0;  // Not used
         if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) {
@@ -208,7 +218,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
         }
 
         // Convert bus BW to algorithm BW
-        if (!(a == NCCL_ALGO_COLLNET_DIRECT && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) {
+        if (!(a != NCCL_ALGO_RING && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) {
           float ratio = 1.0f;
           if (a == NCCL_ALGO_RING) ratio *= (1.0 * nRanks) / nsteps;
           else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio *= 5.0/6.0;
@@ -222,7 +232,9 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
 
         comm->latencies[coll][a][p] = baseLat[a][p];
         float intraLat = hwLat[intraHw[a]][a][p];
-        float interLat = hwLat[NCCL_HW_NET][a][p] + graphs[a]->latencyInter;
+        // With ppn=1 latencies are fully exposed, use the Tree network latency
+        float interLat = ppn == 1 ? hwLat[NCCL_HW_NET][NCCL_ALGO_TREE][p] : hwLat[NCCL_HW_NET][a][p];
+        interLat += graphs[a]->latencyInter;
         // Also add the flush extra latency
         if (p == NCCL_PROTO_SIMPLE) interLat += graphs[a]->latencyInter;
 
@@ -243,11 +255,14 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
               if (p == NCCL_PROTO_SIMPLE) netOverhead *= 3;
             }
             intraLat = std::max(intraLat, netOverhead);
+            int nInterSteps = nNodes == 1 ? 0 : coll == ncclFuncAllReduce ? 2*(nNodes-1) : nNodes-1;
             comm->latencies[coll][a][p] += (nsteps-nInterSteps)*intraLat + nInterSteps*interLat;
           }
         } else if (a == NCCL_ALGO_TREE) {
-          comm->latencies[coll][a][p] +=
-            2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat);
+          if (coll == ncclFuncAllReduce) {
+            comm->latencies[coll][a][p] +=
+              2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat);
+          }
         } else if (a == NCCL_ALGO_COLLNET_DIRECT) {
           comm->latencies[coll][a][p] +=
             2 * (std::min(1, (nRanks/nNodes-1)) * intraLat + (nRanks/nNodes-1) * 0.4) + interLat;  // Add 0.4 us arity serialization latency
@@ -258,6 +273,12 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
           if (nNodes > 1) comm->latencies[coll][a][p] += interLat;
         } else if (a == NCCL_ALGO_NVLS_TREE) {
           comm->latencies[coll][a][p] += intraLat + 2 * log2i(nNodes) * interLat;
+        } else if (a == NCCL_ALGO_PAT) {
+          if (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) {
+            comm->latencies[coll][a][p] = 8 // Base time
+              + log2i(nNodes) * (interLat/3.5) // Log latency
+              + nRanks * 2.8; // Still a linear part; hopefully we'll manage to remove it at some point.
+          }
         }
       }
     }
@@ -266,7 +287,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
   // Protocols/Algorithms enable/disable, and user overrides.
   // All are enabled except ll128 which is enabled by default only in certain cases.
   int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
-  int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1 };
+  int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1, 1 };
 
   const char *protoStr = ncclGetEnv("NCCL_PROTO");
   if (protoStr) {
@@ -336,23 +357,25 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
 
   if (comm->rank == 0) {
     char line[1024];
-    for (int block=0; block<2; block++) {
+    for (int block=0; block<DIVUP(NCCL_NUM_ALGORITHMS, 3); block++) {
       sprintf(line, "  Algorithm   |");
-      for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
-	int a = block*NCCL_NUM_ALGORITHMS/2+ba;
+      for (int ba=0; ba<3; ba++) {
+	int a = block*3+ba;
+        if (a >= NCCL_NUM_ALGORITHMS) continue;
         sprintf(line+strlen(line), " %14s   %14s   %14s |", "", ncclAlgoStr[a], "");
       }
       INFO(NCCL_TUNING, "%s", line);
       sprintf(line, "  Protocol    |");
-      for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
+      for (int ba=0; ba<3; ba++) {
         for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
           sprintf(line+strlen(line), " %14s |", ncclProtoStr[p]);
         }
       }
       INFO(NCCL_TUNING, "%s", line);
       sprintf(line, " Max NThreads |");
-      for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
-	int a = block*NCCL_NUM_ALGORITHMS/2+ba;
+      for (int ba=0; ba<3; ba++) {
+	int a = block*3+ba;
+        if (a >= NCCL_NUM_ALGORITHMS) continue;
         for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
           sprintf(line+strlen(line), " %14d |", comm->maxThreads[a][p]);
         }
@@ -360,8 +383,9 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
       INFO(NCCL_TUNING, "%s", line);
       for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) {
         sprintf(line, "%13s |", ncclFuncStr[c]);
-        for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
-	  int a = block*NCCL_NUM_ALGORITHMS/2+ba;
+        for (int ba=0; ba<3; ba++) {
+	  int a = block*3+ba;
+          if (a >= NCCL_NUM_ALGORITHMS) continue;
           for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
             sprintf(line+strlen(line), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
           }
@@ -431,7 +455,7 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm,
     *time = -1.0; return ncclSuccess;
   }
   int logSize = log2i(nBytes>>6);
-  if (algorithm == NCCL_ALGO_TREE && logSize >= 0 && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
+  if (algorithm == NCCL_ALGO_TREE && coll == ncclFuncAllReduce && logSize >= 0 && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
   if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && comm->nNodes > 1
       && coll == ncclFuncAllReduce && nBytes/(comm->nChannels*comm->nRanks) >= 64) {
     lat *= comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring
diff --git a/src/graph/xml.cc b/src/graph/xml.cc
index c2c6a1c811..bb123b7980 100644
--- a/src/graph/xml.cc
+++ b/src/graph/xml.cc
@@ -468,8 +468,8 @@ ncclResult_t ncclTopoGetXmlFromCpu(struct ncclXmlNode* cpuNode, struct ncclXml*
       return ncclInternalError;
     }
     // Set affinity
-    char cpumaskPath[] = "/sys/devices/system/node/node0000";
-    sprintf(cpumaskPath, "/sys/devices/system/node/node%s", numaId);
+    char cpumaskPath[] = "/sys/devices/system/node/node000000";
+    snprintf(cpumaskPath, sizeof(cpumaskPath), "/sys/devices/system/node/node%s", numaId);
     NCCLCHECK(ncclTopoSetAttrFromSys(cpuNode, cpumaskPath, "cpumap", "affinity"));
   }
 
@@ -690,6 +690,9 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
     }
     pciNode->parent = parent;
     // Keep PCI sub devices ordered by PCI Bus ID (Issue #820)
+    // Coverity complains about dereferenced parent being NULL
+    // but this can never happen.
+    // coverity[var_deref_op]
     int subIndex = parent->nSubs;
     const char* newBusId;
     NCCLCHECK(xmlGetAttrStr(pciNode, "busid", &newBusId));
diff --git a/src/group.cc b/src/group.cc
index 7158b45c2a..3d3ecb88c0 100644
--- a/src/group.cc
+++ b/src/group.cc
@@ -57,7 +57,12 @@ ncclResult_t ncclAsyncLaunch(
       WARN("Blocking and nonblocking communicators are not allowed in the same group.");
       ret = ncclInvalidArgument;
     }
-    ncclIntruQueueEnqueue(&ncclAsyncJobs, job);
+    if (ret == ncclSuccess) {
+      ncclIntruQueueEnqueue(&ncclAsyncJobs, job);
+    } else {
+      // no need to undo, the job hasn't run
+      if (destructor) destructor(job);
+    }
   }
 
   return ret;
@@ -75,7 +80,7 @@ void* ncclAsyncJobMain(void* arg) {
 
 ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job) {
   ncclResult_t ret;
-  SYSCHECK(pthread_join(job->thread, NULL), "pthread_join");
+  PTHREADCHECK(pthread_join(job->thread, NULL), "pthread_join");
   if (job->result != ncclSuccess) {
     WARN("ncclAsyncJobComplete: job %p failed, job error %d", job, job->result);
   }
@@ -165,6 +170,12 @@ ncclResult_t ncclCollPreconnectFunc(struct ncclAsyncJob* job_) {
           NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail);
           break;
         }
+        case NCCL_ALGO_PAT: {
+          NCCLCHECKGOTO(ncclTransportPatConnect(comm), ret, fail);
+          break;
+        }
+        // Yes, this is dead code.  That's fine...
+        // coverity[dead_error_begin]
         default: {
           ret = ncclInternalError;
           goto fail;
@@ -301,7 +312,7 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
       ncclKernelPlanner::Peer* tmp = comm->planner.peers;
       memset(&comm->planner, 0, sizeof(comm->planner));
       comm->planner.peers = tmp;
-      memset(comm->planner.peers, 0, comm->nRanks*sizeof(comm->planner.peers[0]));
+      if (comm->planner.peers != NULL) memset(comm->planner.peers, 0, comm->nRanks*sizeof(comm->planner.peers[0]));
     }
 
     if (!comm->config.blocking)
@@ -329,7 +340,7 @@ static ncclResult_t asyncJobLaunch(struct ncclIntruQueue<struct ncclAsyncJob, &n
   if (!ncclIntruQueueEmpty(asyncJobsMain)) {
     struct ncclAsyncJob* job = ncclIntruQueueHead(asyncJobsMain);
     do {
-      SYSCHECKGOTO(pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job), ret, fail);
+      PTHREADCHECKGOTO(pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job), "pthread_create", ret, fail);
       job = job->next;
     } while (job != nullptr);
 
@@ -341,8 +352,9 @@ static ncclResult_t asyncJobLaunch(struct ncclIntruQueue<struct ncclAsyncJob, &n
         if (state == ncclGroupJobRunning) {
           jobsDone = false;
         } else if (state == ncclGroupJobDone) {
-          if (pthread_join(job->thread, nullptr) != 0) {
-            WARN("Error waiting for pthread_join : %s", strerror(errno));
+          int err;
+          if ((err = pthread_join(job->thread, nullptr)) != 0) {
+            WARN("Error waiting for pthread_join: %s", strerror(err));
             ret = ncclSystemError;
           }
           job->state = ncclGroupJobJoined;
@@ -373,13 +385,6 @@ static ncclResult_t asyncJobLaunch(struct ncclIntruQueue<struct ncclAsyncJob, &n
     if (ret != ncclSuccess) goto fail;
   }
 
-  while (!ncclIntruQueueEmpty(asyncJobsMain)) {
-    struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain);
-    if (!job->destroyFlag && job->comm && !job->comm->config.blocking)
-      (void) ncclCommSetAsyncError(job->comm, ret);
-    if (job->destructor) job->destructor((void*)job);
-  }
-
 exit:
   return ret;
 fail:
@@ -393,6 +398,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
   struct ncclComm *groupCommHeadMain = *gjob->groupCommHeadPtr;
   struct ncclComm *groupCommPreconnectHeadMain = *gjob->groupCommPreconnectHeadPtr;
   struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsMain = gjob->asyncJobsPtr;
+
   bool *groupAbortFlag = gjob->abortFlagPtr;
 
   CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail);
@@ -409,7 +415,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
       job->base.abortFlag = comm->abortFlag;
       job->base.abortFlagDev = comm->abortFlagDev;
       job->comm = comm;
-      ncclIntruQueueEnqueue(asyncJobsMain, &job->base);
+      ncclIntruQueueEnqueue(asyncJobsMain,  (struct ncclAsyncJob*)job);
 
       struct ncclComm* next = comm->preconnectNext;
       comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
@@ -422,12 +428,14 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
   /* Connect channels at runtime if cumem is supported */
   if (groupCommHeadMain != nullptr) {
     struct ncclComm* comm = groupCommHeadMain;
-
+    struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> asyncCollJobs;
+    ncclIntruQueueConstruct(&asyncCollJobs);
     do {
       bool needConnect = false;
       bool algoNeedConnect[NCCL_NUM_ALGORITHMS];
       memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS);
 
+      CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
       NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail);
 
       if (comm->cuMemSupport && needConnect) {
@@ -438,21 +446,33 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
         job->base.destructor = free;
         job->base.state = ncclGroupJobRunning;
         job->base.abortFlag = comm->abortFlag;
+        job->base.abortFlagDev = comm->abortFlagDev;
         job->comm = comm;
         NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail);
         memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS);
-        ncclIntruQueueEnqueue(asyncJobsMain, &job->base);
+        ncclIntruQueueEnqueue(&asyncCollJobs, &job->base);
       }
       comm = comm->groupNext;
     } while (comm);
 
-    NCCLCHECKGOTO(asyncJobLaunch(asyncJobsMain, groupAbortFlag), ret, fail);
+    NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail);
+    while (!ncclIntruQueueEmpty(&asyncCollJobs)) {
+      struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncCollJobs);
+      if (job->destructor) job->destructor((void*)job);
+    }
   }
 
   if ((!simInfo) && (groupCommHeadMain != nullptr)) {
     NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail);
   }
 
+  while (!ncclIntruQueueEmpty(asyncJobsMain)) {
+    struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain);
+    if (!job->destroyFlag && job->comm && !job->comm->config.blocking)
+      (void) ncclCommSetAsyncError(job->comm, ret);
+    if (job->destructor) job->destructor((void*)job);
+  }
+
   while (groupCommHeadMain != nullptr) {
     struct ncclComm* comm = groupCommHeadMain;
     struct ncclComm* next = comm->groupNext;
@@ -517,7 +537,7 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) {
     ncclGroupJobMainPtr = &ncclGroupJobMain;
     /* make sure ncclGroupBlocking has been set. */
     assert(ncclGroupBlocking == 0 || ncclGroupBlocking == 1);
-    if (ncclGroupBlocking == 0 && (ncclGroupCommPreconnectHead != nullptr || !ncclIntruQueueEmpty(&ncclAsyncJobs))) {
+    if (ncclGroupBlocking == 0) {
       /* nonblocking group */
       if (!ncclIntruQueueEmpty(&ncclAsyncJobs)) {
         ncclAsyncJob* job = ncclIntruQueueHead(&ncclAsyncJobs);
@@ -539,7 +559,7 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) {
       }
 
       ncclGroupJobMainPtr->base.func = groupLaunchNonBlocking;
-      SYSCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), ret, fail);
+      PTHREADCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), "pthread_create", ret, fail);
       ret = ncclInProgress;
     } else {
       /* blocking group */
diff --git a/src/include/alloc.h b/src/include/alloc.h
index 71d0777cce..7744119c3c 100644
--- a/src/include/alloc.h
+++ b/src/include/alloc.h
@@ -17,6 +17,11 @@
 #include <stdlib.h>
 #include <string.h>
 
+#if CUDART_VERSION >= 11030
+#include <cuda.h>
+#include "cudawrap.h"
+#endif
+
 uint64_t clockNano(); // from utils.h with which we have a circular dependency
 
 template<typename T>
@@ -24,6 +29,81 @@ constexpr size_t ncclSizeOfT() { return sizeof(T); }
 template<>
 constexpr size_t ncclSizeOfT<void>() { return 1; }
 
+#if CUDART_VERSION >= 12020
+
+static inline ncclResult_t ncclCuMemHostAlloc(void** ptr, CUmemGenericAllocationHandle *handlep, size_t size) { // Allocates pinned host memory on the NUMA node reported for the current CUDA device, maps it, stores the address in *ptr and, if handlep is non-NULL, the handle in *handlep
+  ncclResult_t result = ncclSuccess;
+  size_t granularity = 0;
+  CUdevice currentDev;
+  CUmemAllocationProp prop = {};
+  CUmemAccessDesc accessDesc = {};
+  CUmemGenericAllocationHandle handle;
+  int cudaDev;
+  int cpuNumaNodeId = -1;
+  CUmemAllocationHandleType type = ncclCuMemHandleType;
+
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  CUCHECK(cuDeviceGet(&currentDev, cudaDev));
+  CUCHECK(cuDeviceGetAttribute(&cpuNumaNodeId, CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID, currentDev));
+  if (cpuNumaNodeId < 0) cpuNumaNodeId = 0; // fall back to NUMA node 0 when the attribute query yields a negative id
+  prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
+  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+  prop.requestedHandleTypes = type; // So it can be exported
+  prop.location.id = cpuNumaNodeId;
+  CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+  ALIGN_SIZE(size, granularity); // presumably rounds size up to a multiple of granularity — confirm against ALIGN_SIZE
+  /* Allocate the physical memory on the host NUMA node selected above */
+  CUCHECK(cuMemCreate(&handle, size, &prop, 0)); // NOTE(review): if CUCHECK returns early on a later failure, handle and the reserved range are not released — confirm
+  /* Reserve a virtual address range */
+  CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, size, granularity, 0, 0));
+  /* Map the virtual address range to the physical allocation */
+  CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
+  /* Now allow RW access to the newly mapped memory for local GPU */
+  accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  accessDesc.location.id = cudaDev;
+  accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+  CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
+
+  /* Now allow RW access to the newly mapped memory from the CPU */
+  accessDesc.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
+  accessDesc.location.id = cpuNumaNodeId;
+  accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+  CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
+
+  if (handlep) *handlep = handle;
+  INFO(NCCL_ALLOC, "CUMEM Host Alloc Size %zu pointer %p handle %llx numa %d dev %d granularity %zu", size, *ptr, handle, cpuNumaNodeId, cudaDev, granularity);
+  return result;
+}
+
+static inline ncclResult_t ncclCuMemHostFree(void* ptr) { // Unmaps ptr, releases its allocation handle and frees the virtual address range; a NULL ptr is a successful no-op
+  if (ptr == NULL) return ncclSuccess;
+  ncclResult_t result = ncclSuccess;
+  CUmemGenericAllocationHandle handle;
+  size_t size = 0;
+  CUCHECK(cuMemRetainAllocationHandle(&handle, ptr)); // look up the handle backing ptr; this takes an extra reference
+  CUCHECK(cuMemRelease(handle)); // drop the reference taken by the retain call just above
+  CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
+  TRACE(NCCL_ALLOC, "CUMEM Host Free Size %zu pointer %p handle 0x%llx", size, ptr, handle);
+  CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size));
+  CUCHECK(cuMemRelease(handle)); // second release: presumably drops the reference created when the memory was allocated — confirm
+  CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
+  return result;
+}
+
+#else /* CUDART_VERSION >= 12020 */
+
+static inline ncclResult_t ncclCuMemHostAlloc(void** ptr, void* handlep, size_t size) { // Fallback when CUDART_VERSION < 12020: ignores its arguments, warns and always fails
+  WARN("CUMEM Host is not supported prior to CUDA 12.2");
+  return ncclInternalError;
+}
+
+static inline ncclResult_t ncclCuMemHostFree(void* ptr) { // Fallback when CUDART_VERSION < 12020: always fails. NOTE(review): unlike the CUDA >= 12.2 version, a NULL ptr is also reported as an error — confirm callers never free NULL here
+  WARN("CUMEM Host is not supported prior to CUDA 12.2");
+  return ncclInternalError;
+}
+
+#endif  /* CUDART_VERSION >= 12020 */
+
 template <typename T>
 ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
   ncclResult_t result = ncclSuccess;
@@ -40,24 +120,25 @@ finish:
   INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), *ptr);
   return result;
 }
-#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
 
-inline ncclResult_t ncclCudaHostFree(void* ptr) {
+static inline ncclResult_t ncclCudaHostFree(void* ptr) {
   CUDACHECK(cudaFreeHost(ptr));
   return ncclSuccess;
 }
 
+#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
+
 template <typename T>
 ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
   if (nelem > 0) {
-    void* p = malloc(nelem*ncclSizeOfT<T>());
+    T* p = (T*)malloc(nelem*ncclSizeOfT<T>());
     if (p == NULL) {
       WARN("Failed to malloc %ld bytes", nelem*ncclSizeOfT<T>());
       return ncclSystemError;
     }
     //INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), p);
     memset(p, 0, nelem*ncclSizeOfT<T>());
-    *ptr = (T*)p;
+    *ptr = p;
   } else {
     *ptr = NULL;
   }
@@ -67,17 +148,17 @@ ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int li
 
 template <typename T>
 ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
-  if (nelem < oldNelem) return ncclInternalError;
+  T* oldp = *ptr;
+  if (nelem < oldNelem || (oldp == NULL && oldNelem > 0)) return ncclInternalError;
   if (nelem == oldNelem) return ncclSuccess;
 
-  T* oldp = *ptr;
   T* p = (T*)malloc(nelem*ncclSizeOfT<T>());
   if (p == NULL) {
     WARN("Failed to malloc %ld bytes", nelem*ncclSizeOfT<T>());
     return ncclSystemError;
   }
-  memcpy(p, oldp, oldNelem*ncclSizeOfT<T>());
-  free(oldp);
+  if (oldp && oldNelem) memcpy(p, oldp, oldNelem * ncclSizeOfT<T>());
+  if (oldp) free(oldp);
   memset(p+oldNelem, 0, (nelem-oldNelem)*ncclSizeOfT<T>());
   *ptr = (T*)p;
   INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*ncclSizeOfT<T>(), nelem*ncclSizeOfT<T>(), *ptr);
@@ -89,6 +170,40 @@ ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
 #include <cuda.h>
 #include "cudawrap.h"
 
+// ncclCuMemAllocAddr maps the existing allocation *handleIn into a newly reserved virtual address range, grants the current CUDA device read/write access and returns the address in *ptr
+static inline ncclResult_t ncclCuMemAllocAddr(void **ptr, CUmemGenericAllocationHandle *handleIn, size_t size) {
+  ncclResult_t result = ncclSuccess;
+  size_t granularity = 0;
+  CUmemAllocationProp prop = {};
+  CUmemAccessDesc accessDesc = {};
+  int cudaDev;
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  CUCHECK(cuMemGetAllocationPropertiesFromHandle(&prop, *handleIn)); // recover the allocation's properties so its granularity can be queried
+  CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+  ALIGN_SIZE(size, granularity); // presumably rounds size up to a multiple of granularity — confirm against ALIGN_SIZE
+  /* Reserve a virtual address range */
+  CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0));
+  /* Map the virtual address range to the physical allocation */
+  CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, *handleIn, 0));
+  /* Now allow RW access to the newly mapped memory from the current CUDA device */
+  accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  accessDesc.location.id = cudaDev;
+  accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+  CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
+  TRACE(NCCL_ALLOC, "CuMem Map Size %zu pointer %p handle %llx", size, *ptr, *handleIn);
+  return result;
+}
+
+static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) { // Unmaps ptr and frees its virtual address range; no cuMemRelease is issued here, so the allocation handle is left untouched
+  if (ptr == NULL) return ncclSuccess; // nothing mapped, nothing to undo
+  ncclResult_t result = ncclSuccess;
+  size_t size = 0;
+  CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); // only the size of the range containing ptr is requested (base output is NULL)
+  CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size));
+  CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
+  return result;
+}
+
 static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, size_t size) {
   ncclResult_t result = ncclSuccess;
   size_t granularity = 0;
@@ -106,7 +221,7 @@ static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHand
   prop.requestedHandleTypes = type;
   prop.location.id = currentDev;
   // Query device to see if RDMA support is available
-  CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev));
+  CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev));
   if (flag) prop.allocFlags.gpuDirectRDMACapable = 1;
   CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
   ALIGN_SIZE(size, granularity);
@@ -154,6 +269,15 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) {
   return ncclInternalError;
 }
 
+static inline ncclResult_t ncclCuMemAllocAddr(void **ptr, CUmemGenericAllocationHandle *handleIn, size_t size) { // Fallback that always fails. NOTE(review): the pre-12.2 ncclCuMemHostAlloc fallback takes void* for its handle; confirm CUmemGenericAllocationHandle is declared wherever this branch is compiled
+  WARN("CUMEM not supported prior to CUDA 11.3");
+  return ncclInternalError;
+}
+
+static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) { // Fallback that ignores ptr (even NULL), warns and always returns ncclInternalError
+  WARN("CUMEM not supported prior to CUDA 11.3");
+  return ncclInternalError;
+}
 #endif
 
 template <typename T>
@@ -274,7 +398,8 @@ finish:
 // and if they are shared, that could cause a crash in a child process
 inline ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) {
   if (size > 0) {
-    size_t page_size = sysconf(_SC_PAGESIZE);
+    long page_size = sysconf(_SC_PAGESIZE);
+    if (page_size < 0) return ncclSystemError;
     void* p;
     int size_aligned = ROUNDUP(size, page_size);
     int ret = posix_memalign(&p, page_size, size_aligned);
diff --git a/src/include/bitops.h b/src/include/bitops.h
index 95620cbe30..a650aa7f46 100644
--- a/src/include/bitops.h
+++ b/src/include/bitops.h
@@ -185,6 +185,8 @@ inline __host__ __device__ Int pow2Up(Int x) {
 
 template<typename Int>
 inline __host__ __device__ Int pow2Down(Int x) {
+  // True, log2Down can return -1, but we don't normally pass 0 as an argument...
+  // coverity[negative_shift]
   return Int(1)<<log2Down(x);
 }
 
@@ -274,4 +276,13 @@ inline __host__ __device__ uint32_t u32fp8Decode(uint8_t x) {
   return u32fpDecode(x, 3);
 }
 
+inline __host__ __device__ uint64_t getHash(const char* string, int n) {
+  // DJB2a-style hash of the first n chars of string: hash = (hash * 33) ^ char. NOTE(review): string[] is plain char, so values >= 0x80 sign-extend before the XOR where char is signed -- confirm that is acceptable.
+  uint64_t hash = 5381;
+  for (int idx = 0; idx < n; idx++) {
+    hash = (hash * 33) ^ string[idx];
+  }
+  return hash;
+}
+
 #endif
diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h
index 8c5f081c48..85e33f69e1 100644
--- a/src/include/bootstrap.h
+++ b/src/include/bootstrap.h
@@ -19,8 +19,8 @@ static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Boots
 ncclResult_t bootstrapNetInit();
 ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv);
 ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle);
-ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm);
-ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks);
+ncclResult_t bootstrapInit(int nHandles, void* handle, struct ncclComm* comm);
+ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks);
 ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
 ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
 ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
diff --git a/src/include/checks.h b/src/include/checks.h
index 89355c3da4..cbb5a2de41 100644
--- a/src/include/checks.h
+++ b/src/include/checks.h
@@ -38,21 +38,17 @@
 
 #include <errno.h>
 // Check system calls
-#define SYSCHECK(call, name) do { \
+#define SYSCHECK(statement, name) do { \
   int retval; \
-  SYSCHECKVAL(call, name, retval); \
-} while (false)
-
-#define SYSCHECKVAL(call, name, retval) do { \
-  SYSCHECKSYNC(call, name, retval); \
+  SYSCHECKSYNC((statement), name, retval); \
   if (retval == -1) { \
-    WARN("Call to " name " failed : %s", strerror(errno)); \
+    WARN("Call to " name " failed: %s", strerror(errno)); \
     return ncclSystemError; \
   } \
 } while (false)
 
-#define SYSCHECKSYNC(call, name, retval) do { \
-  retval = call; \
+#define SYSCHECKSYNC(statement, name, retval) do { \
+  retval = (statement); \
   if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
     INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
   } else { \
@@ -60,14 +56,33 @@
   } \
 } while(true)
 
-#define SYSCHECKGOTO(statement, RES, label) do { \
-  if ((statement) == -1) {    \
-    /* Print the back trace*/ \
-    RES = ncclSystemError;    \
-    INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno));    \
+#define SYSCHECKGOTO(statement, name, RES, label) do { \
+  int retval; \
+  SYSCHECKSYNC((statement), name, retval); \
+  if (retval == -1) { \
+    WARN("Call to " name " failed: %s", strerror(errno)); \
+    RES = ncclSystemError; \
     goto label; \
   } \
-} while (0);
+} while (0)
+
+// Pthread calls don't set errno and never return EINTR.
+#define PTHREADCHECK(statement, name) do { \
+  int retval = (statement); \
+  if (retval != 0) { \
+    WARN("Call to " name " failed: %s", strerror(retval)); \
+    return ncclSystemError; \
+  } \
+} while (0)
+
+#define PTHREADCHECKGOTO(statement, name, RES, label) do { \
+  int retval = (statement); \
+  if (retval != 0) { \
+    WARN("Call to " name " failed: %s", strerror(retval)); \
+    RES = ncclSystemError; \
+    goto label; \
+  } \
+} while (0)
 
 #define NEQCHECK(statement, value) do {   \
   if ((statement) != value) {             \
@@ -75,7 +90,7 @@
     INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno));    \
     return ncclSystemError;     \
   }                             \
-} while (0);
+} while (0)
 
 #define NEQCHECKGOTO(statement, value, RES, label) do { \
   if ((statement) != value) { \
@@ -84,7 +99,7 @@
     INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno));    \
     goto label; \
   } \
-} while (0);
+} while (0)
 
 #define EQCHECK(statement, value) do {    \
   if ((statement) == value) {             \
@@ -92,7 +107,7 @@
     INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno));    \
     return ncclSystemError;     \
   }                             \
-} while (0);
+} while (0)
 
 #define EQCHECKGOTO(statement, value, RES, label) do { \
   if ((statement) == value) { \
@@ -101,7 +116,7 @@
     INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno));    \
     goto label; \
   } \
-} while (0);
+} while (0)
 
 // Propagate errors up
 #define NCCLCHECK(call) do { \
@@ -111,7 +126,7 @@
     if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES);    \
     return RES; \
   } \
-} while (0);
+} while (0)
 
 #define NCCLCHECKGOTO(call, RES, label) do { \
   RES = call; \
@@ -120,7 +135,7 @@
     if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES);    \
     goto label; \
   } \
-} while (0);
+} while (0)
 
 #define NCCLWAIT(call, cond, abortFlagPtr) do {         \
   uint32_t* tmpAbortFlag = (abortFlagPtr);     \
@@ -130,7 +145,7 @@
     return ncclInternalError;             \
   }                                       \
   if (__atomic_load(tmpAbortFlag, __ATOMIC_ACQUIRE)) NEQCHECK(*tmpAbortFlag, 0); \
-} while (!(cond));
+} while (!(cond))
 
 #define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \
   uint32_t* tmpAbortFlag = (abortFlagPtr);             \
@@ -140,7 +155,7 @@
     goto label;                           \
   }                                       \
   if (__atomic_load(tmpAbortFlag, __ATOMIC_ACQUIRE)) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \
-} while (!(cond));
+} while (!(cond))
 
 #define NCCLCHECKTHREAD(a, args) do { \
   if (((args)->ret = (a)) != ncclSuccess && (args)->ret != ncclInProgress) { \
diff --git a/src/include/collectives.h b/src/include/collectives.h
index fb7af3bff8..e45d78f26f 100644
--- a/src/include/collectives.h
+++ b/src/include/collectives.h
@@ -64,4 +64,490 @@ struct ncclConnFifo {
   ssize_t size;
   void* ptr;
 };
+
+#include <stdio.h>
+
+template<typename T>
+class PatRSAlgorithm{
+  size_t offset;
+  size_t end;
+  size_t count;
+  int chunkCount;
+  int nelem;
+  int rank;
+  int nranks;
+  int nrPow2;
+  int postFreq;
+  int lastA;
+
+  int aggFactor;
+  int as; // aggregated steps
+  int a; // step inside aggregated step
+  int sendSkipped; // number of skipped steps during aggregation
+  int recvSkipped; // number of skipped steps during aggregation
+  int phase2recv;  // receive offset for phase 2
+  int aggDelta;
+  int scale;
+  int phase;
+
+  __device__ __host__ int min(int a, int b) {
+    return (a<b)?a:b;
+  }
+
+  __device__ __host__ int getNelem() { // Element count of the current chunk: what remains of [offset, end), capped at chunkCount.
+    return ((ssize_t)(end-offset) < (ssize_t)chunkCount) ? (int)(end-offset) : chunkCount; // Compare in 64 bits: min(int,int) narrowed end-offset to int first, which wraps for spans >= 2^31.
+  }
+
+  __device__ __host__ int mirrorInvert(int i, int max) {
+    int ret = 0;
+    for (int mask=1, imask=max/2; mask<max; mask<<=1, imask>>=1) {
+      if ((i&mask) == 0) ret += imask;
+    }
+    return ret;
+  }
+
+  __device__ __host__ int firstBitSet(int i, int max) {
+    int ffs =
+#ifdef __CUDA_ARCH__
+      __ffs(i);
+#else
+      __builtin_ffs(i);
+#endif
+    return ffs ? ffs-1 : max;
+  }
+
+  __device__ __host__ void resetA() {
+    a = 0;
+    sendSkipped = recvSkipped = 0;
+    lastA = aggFactor;
+    if (phase >= 2) lastA /= 2*scale;
+  }
+
+  __device__ __host__ void reset() {
+    nelem = getNelem();
+    phase = 0;
+    scale = 1;
+    phase2recv = 0;
+    as = aggDelta - 1;
+    resetA();
+  }
+
+  __device__ __host__ int nBitsSet(int i) {
+    int nbits =
+#ifdef __CUDA_ARCH__
+      __popc(i);
+#else
+      __builtin_popcount(i);
+#endif
+    return nbits;
+  }
+
+  // Return 1 when i is 0 or has only its upper bits set (assuming pow2 is a power of 2 and 0 <= i < pow2). For example, if pow2==16 we'll return 1 for 0, 8, 12, 14, 15.
+  // A number of the form 1111000 has 0000111 as its complement (i ^ (pow2-1)), i.e. a power of 2 minus 1, so adding 1 leaves exactly one bit set.
+  __device__ __host__ int newPeer(int i, int pow2) {
+    //printf("New peer %d/%d -> %d\n", i, pow2, nBitsSet((i ^ (pow2-1)) + 1) == 1 ? 1 : 0);
+    return nBitsSet((i ^ (pow2-1)) + 1) == 1 ? 1 : 0;
+  }
+
+public:
+   __device__ __host__ PatRSAlgorithm(int stepSize, int stepDepth, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks):
+     offset(offset), end(end), count(count), chunkCount(chunkCount), rank(rank), nranks(nranks) {
+    aggDelta = nrPow2 = (1<<log2Up(nranks));
+
+    aggFactor = 1;
+    size_t channelSize = end-offset;
+    while (stepSize / (channelSize*sizeof(T)*aggFactor) >= 2 && aggFactor < nranks/2) {
+      aggFactor *= 2;
+      aggDelta /= 2;
+    }
+    postFreq = aggFactor;
+    int d = stepDepth;
+    while (d > 1 && aggFactor < nranks/2) {
+      d /= 2;
+      aggFactor *= 2;
+      aggDelta /= 2;
+    }
+
+    reset();
+  }
+
+  __device__ __host__ void getNextOp(int &recvDim, int &sendDim, size_t &inpIx, size_t &outIx, int &recvOffset, int &sendOffset, int &sendStepOffset, int &nelemOut, int &postRecv, int &postSend, int &last) {
+restart:
+    last = 0;
+    nelemOut = nelem;
+    outIx = offset;
+    int skip = 0;
+    //printf("Phase %d as %d/%d a %d/%d scale %d\n", phase, as, aggDelta, a, lastA, scale);
+    if (phase == 0) {
+      int s = mirrorInvert(a, lastA)*aggDelta + as;
+      if (s >= nranks) skip = 1;
+      int sendDataRank = (rank + s) % nranks;
+      inpIx = sendDataRank * count + offset;
+      recvDim = -1;
+      sendDim = 0;
+      outIx = 0;
+      recvOffset = -1;
+      sendOffset = ((a - sendSkipped)%postFreq) * nelem;
+      sendStepOffset = 0;
+      if ((((a - sendSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) {
+        postSend = 1;
+      } else {
+        postSend = 0;
+      }
+      postRecv = 0;
+      if (skip) sendSkipped++;
+      if (++a == lastA) {
+        phase = as == 1 ? (aggFactor > 1 ? 2 : 4) : 1; // If as == 1, switch to phase 2
+        resetA();
+      }
+      if (skip == 0) return;
+    } else if (phase == 1) {
+      int s = mirrorInvert(a, lastA)*aggDelta + as;
+      if (s >= nranks) skip = 1;
+      recvDim = firstBitSet(s, nrPow2);
+      sendOffset = ((a - sendSkipped)%postFreq)*nelem;
+      recvOffset = ((a - recvSkipped)%postFreq)*nelem;
+      postSend = 0;
+      if (recvDim == 0) {
+        if ((((a - sendSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) postSend = 1;
+        sendStepOffset = 0;
+      } else {
+        sendStepOffset = (a - sendSkipped)/postFreq;
+      }
+      if ((((a - recvSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) {
+        postRecv = 1;
+      } else {
+        postRecv = 0;
+      }
+      s -= (1<<recvDim);
+      int recvDataRank = (rank + nranks + s) % nranks;
+      inpIx = recvDataRank * count + offset;
+      sendDim = s ? firstBitSet(s, nrPow2) : -1;
+      if (sendDim == -1) {
+        sendOffset = -1;
+        sendStepOffset = 0;
+      } else if (as - (1<<recvDim) == 0) {
+        if (newPeer(a, aggFactor)) sendSkipped = a;
+        int foffset = a - sendSkipped;
+        sendStepOffset = recvDim == 0 ? 0 : foffset/postFreq;
+        sendOffset = (foffset%postFreq)*nelem;
+      }
+      if (s < nranks && skip) {
+        recvDim = -1;
+        recvOffset = -1;
+        postRecv = 0;
+        skip = 0;
+      }
+      if (skip || recvDim == -1) recvSkipped++;
+      if (skip) sendSkipped++;
+      if (++a == lastA) {
+        as--;
+        phase = as % 2 == 1 ? 0 : 1;
+        resetA();
+      }
+      if (skip == 0) return;
+    } else if (phase == 2) {
+      int s = (2*mirrorInvert(a, lastA)+1)*scale*aggDelta + 1;
+      postRecv = 0;
+      if (s >= nranks) skip = 1;
+      recvDim = 0;
+      postSend = a == lastA-1 ? 1 : 0;
+      s -= 1;
+      if (s < nranks && skip) {
+        recvDim = -1;
+        recvOffset = -1;
+        skip = 0;
+      } else if (!skip) {
+        int foffset = phase2recv;
+        phase2recv++;
+        postRecv |= ((foffset+1)%postFreq) == 0 ? 1 : 0;
+        recvOffset = (foffset%postFreq) * nelem;
+      }
+      int recvDataRank = (rank + nranks + s) % nranks;
+      inpIx = recvDataRank * count + offset;
+      sendDim = s ? firstBitSet(s, nrPow2) : -1;
+      int foffset = a - sendSkipped;
+      postSend |= ((foffset+1)%postFreq) == 0 ? 1 : 0;
+      sendStepOffset = 0;
+      sendOffset = (foffset%postFreq) * nelem;
+      if (skip || sendDim == -1) sendSkipped++;
+      if (++a == lastA) {
+        phase = 3;
+        resetA();
+      }
+      if (skip == 0) return;
+    } else if (phase == 3) {
+      int s = (2*mirrorInvert(a, lastA)+1)*scale*aggDelta;
+      postRecv = a == lastA-1 ? 1 : 0;
+      if (s >= nranks) skip = 1;
+      recvDim = firstBitSet(s, nrPow2);
+      postSend = 0;
+      s -= (1<<recvDim);
+      int foffset = a - recvSkipped;
+      postRecv |= (foffset+1)%postFreq == 0 ? 1 : 0;
+      recvOffset = (foffset%postFreq) * nelem;
+      int recvDataRank = (rank + nranks + s) % nranks;
+      inpIx = recvDataRank * count + offset;
+      sendDim = s ? firstBitSet(s, nrPow2) : -1;
+      if (s < nranks && skip) {
+        recvDim = -1;
+        recvOffset = -1;
+        postRecv = 0;
+        skip = 0;
+      }
+      if (newPeer(a, aggFactor/(2*scale))) sendSkipped = a;
+      foffset = a - sendSkipped;
+      sendStepOffset = foffset / postFreq; // Accumulate on next steps
+      sendOffset = sendDim >= 0 ? (foffset%postFreq) * nelem : -1;
+      if (skip || recvDim == -1) recvSkipped++;
+      if (skip) sendSkipped++;
+      if (++a == lastA) {
+        scale *= 2;
+        phase = scale < aggFactor ? 2 : 4;
+        resetA();
+      }
+      if (skip == 0) return;
+    } else if (phase == 4) {
+      recvDim = 0;
+      sendDim = -1;
+      inpIx = rank * count + offset;
+      recvOffset = (phase2recv%postFreq) * nelem;
+      sendStepOffset = 0;
+      sendOffset = -1;
+      postRecv = 1;
+      postSend = 0;
+      offset += chunkCount;
+      if (offset >= end) {
+        last = 1;
+      } else {
+        reset();
+      }
+      return;
+    }
+    goto restart;
+  }
+};
+
+template<typename T>
+class PatAGAlgorithm{
+  size_t offset;
+  size_t end;
+  size_t count;
+  int chunkCount;
+  int nelem;
+  int rank;
+  int nranks;
+  int nrPow2;
+  int postFreq;
+  int lastA;
+
+  int aggFactor;
+  int as; // aggregated steps
+  int a; // step inside aggregated step
+  int aggDelta;
+
+  int scale;
+
+  int phase;
+
+  // AS computation
+  int asDim;
+  int v;
+  int bitCount[32];
+  int bitZeroStep[32];
+
+  __device__ __host__ int min(int a, int b) {
+    return (a<b)?a:b;
+  }
+
+  __device__ __host__ int getNelem() { // Element count of the current chunk: what remains of [offset, end), capped at chunkCount.
+    return ((ssize_t)(end-offset) < (ssize_t)chunkCount) ? (int)(end-offset) : chunkCount; // Compare in 64 bits: min(int,int) narrowed end-offset to int first, which wraps for spans >= 2^31.
+  }
+
+  __device__ __host__ int mirror(int i, int max) {
+    int ret = 0;
+    for (int mask=1, imask=max/2; mask<max; mask<<=1, imask>>=1) {
+      if ((i&mask)) ret += imask;
+    }
+    return ret;
+  }
+
+  __device__ __host__ int firstBitSet(int i, int max) {
+    int ffs =
+#ifdef __CUDA_ARCH__
+      __ffs(i);
+#else
+      __builtin_ffs(i);
+#endif
+    return ffs ? ffs-1 : max;
+  }
+
+  __device__ __host__ void resetA() {
+    a = 0;
+    lastA = aggFactor;
+    if (phase >= 2) lastA /= 2*scale;
+  }
+
+  __device__ __host__ void reset() {
+    nelem = getNelem();
+    scale = aggFactor/2;
+    phase = scale ? 2 : 1;
+    v = 0;
+    for (int i = 0; i<asDim; i++) {
+      bitCount[i] = asDim-i;
+      bitZeroStep[i] = 1;
+    }
+    as = nextAs();
+    resetA();
+  }
+
+  __device__ __host__ int nextAs() {
+    for (int d=0; d<asDim; d++) {
+      int p = 1<<d;
+      bitCount[d]--;
+      if (bitCount[d] == 0) {
+        v ^= p;
+        bitCount[d] = p;
+        if ((v&p) == 0) {
+          bitCount[d] += firstBitSet(bitZeroStep[d], asDim) - 1;
+          if (bitCount[d] == 0) {
+            v ^= p;
+            bitCount[d] = p;
+          }
+          bitZeroStep[d]++;
+        }
+      }
+    }
+    return v;
+  }
+
+
+public:
+   __device__ __host__ PatAGAlgorithm(int stepSize, int stepDepth, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks):
+     offset(offset), end(end), count(count), chunkCount(chunkCount), rank(rank), nranks(nranks) {
+    aggDelta = nrPow2 = (1<<log2Up(nranks));
+
+    aggFactor = 1;
+    size_t channelSize = end-offset;
+    while (stepSize / (channelSize*sizeof(T)*aggFactor) >= 2 && aggFactor < nranks/2) {
+      aggFactor *= 2;
+      aggDelta /= 2;
+    }
+    postFreq = aggFactor;
+    int d = stepDepth;
+    while (d > 1 && aggFactor < nranks/2) {
+      d /= 2;
+      aggFactor *= 2;
+      aggDelta /= 2;
+    }
+    //printf("AggFactor %d PostFreq %d AggDelta %d\n", aggFactor, postFreq, aggDelta);
+
+    asDim = log2Up(aggDelta);
+    reset();
+  }
+
+  __device__ __host__ void getNextOp(int &recvDim, int &sendDim, size_t &inpIx, size_t &outIx, int &recvOffset, int &sendOffset, int &recvStepOffset, int &nelemOut, int &postRecv, int &postSend, int &last) {
+restart:
+    //printf("Phase %d as %d/%d a %d/%d scale %d\n", phase, as, aggDelta, a, lastA, scale);
+    last = 0;
+    nelemOut = nelem;
+    inpIx = offset;
+    int skip = 0;
+    if (phase == 0) {
+      int s = a*aggDelta + as;
+      if (s >= nranks) skip = 1;
+      int nextSkip = (a+1)*aggDelta + as >= nranks ? 1 : 0;
+      int recvDataRank = (rank + s) % nranks;
+      outIx = recvDataRank * count + offset;
+      sendDim = -1;
+      recvDim = 0;
+      inpIx = 0;
+      sendOffset = -1;
+      recvOffset = (a % postFreq) * nelem;
+      recvStepOffset = 0;
+      postRecv = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0;
+      postSend = 0;
+      a++;
+      if (nextSkip) {
+        as = nextAs();
+        if (as == aggDelta/2) {
+          offset += chunkCount;
+          if (offset >= end) {
+            last = 1;
+          } else {
+            reset();
+          }
+          return;
+        }
+        phase = 1;
+        resetA();
+      }
+      if (skip == 0) return;
+   } else if (phase == 1) {
+      int s = a*aggDelta + as;
+      if (s >= nranks) skip = 1;
+      sendDim = firstBitSet(s, nrPow2);
+      s -= (1<<sendDim);
+      int sendDataRank = (rank + nranks + s) % nranks;
+      outIx = sendDataRank * count + offset;
+      recvDim = s ? firstBitSet(s, nrPow2) : -1;
+      sendOffset = recvOffset = (a % postFreq) * nelem;
+      postSend = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0;
+      postRecv = (sendDim == 0) && ((a % postFreq == postFreq-1) || ((a+1)*aggDelta+as-1 >= nranks)) ? 1 : 0;
+      recvStepOffset = (sendDim == 0) ? 0 : a/postFreq;
+      if (recvDim == -1) {
+        recvOffset = -1;
+        postRecv = 0;
+      } else if (as - (1<<sendDim) == 0) {
+        int foffset = (a*aggDelta) >> (recvDim+1);
+        recvOffset = (foffset%postFreq)*nelem;
+        postRecv = (sendDim == 0) && ((foffset % postFreq == postFreq-1) || ((((foffset+1)*2)+1)<<recvDim) >= nranks) ? 1 : 0;
+        recvStepOffset = (sendDim == 0) ? 0 : foffset/postFreq;
+      }
+      if (s < nranks && sendDim == 0 && skip) {
+        // Don't forget to receive at least once even if we don't send afterwards
+        sendDim = -1;
+        sendOffset = -1;
+        postSend = 0;
+        skip = 0;
+      }
+      if (++a == lastA) {
+        if (as % 2 == 1) {
+          phase = 0;
+        } else {
+          as = nextAs();
+        }
+        resetA();
+      }
+      if (skip == 0) return;
+    } else if (phase == 2) {
+      int s = (2*a+1)*scale*aggDelta;
+      postSend = (a % postFreq == postFreq-1) || ((2*(a+1)+1)*scale*aggDelta >= nranks) ? 1 : 0;
+      postRecv = 0;
+      if (s >= nranks) skip = 1;
+      sendDim = firstBitSet(s, nrPow2);
+      s -= (1<<sendDim);
+      sendOffset = (a%postFreq) * nelem;
+      recvStepOffset = a / postFreq;
+      int sendDataRank = (rank + nranks + s) % nranks;
+      outIx = sendDataRank * count + offset;
+      recvDim = s ? firstBitSet(s, nrPow2) : -1;
+      s -= (1<<recvDim);
+      if (recvDim == -1) {
+        recvOffset = -1;
+      } else {
+        int foffset = (a*2*scale*aggDelta) >> (recvDim+1);
+        recvOffset = (foffset%postFreq)*nelem;
+        recvStepOffset = foffset / postFreq;
+      }
+      if (++a == lastA) {
+        scale /= 2;
+        phase = scale ? 2 : 1;
+        resetA();
+      }
+      if (skip == 0) return;
+    }
+    goto restart;
+  }
+};
 #endif
diff --git a/src/include/comm.h b/src/include/comm.h
index 0cc0a8911c..9d102dfed2 100644
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -16,6 +16,7 @@
 #include "nccl_net.h"
 #include "register.h"
 #include "graph.h"
+#include "profiler.h"
 
 #if CUDART_VERSION < 9000
 struct cudaLaunchParams {
@@ -104,6 +105,11 @@ struct ncclCommCallback {
   struct ncclCommCallback* next;
   ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb);
 };
+struct ncclCommEventCallback {
+  struct ncclCommEventCallback* next;
+  cudaEvent_t event;
+  ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommEventCallback* cb);
+};
 
 struct ncclSharedResources {
   int refCount;
@@ -173,6 +179,54 @@ struct ncclCollnetHandleList {
   struct ncclProxyConnector* proxyconn;
 };
 
+struct ncclTaskColl {
+  struct ncclTaskColl* next;
+  ncclFunc_t func;
+  void const* sendbuff;
+  void* recvbuff;
+  size_t count;
+  int root;
+  ncclDataType_t datatype;
+  ncclRedOp_t opHost;
+  struct ncclDevRedOpFull opDev;
+  int chunkSteps, sliceSteps;
+  // Computed later:
+  size_t trafficBytes;
+  int32_t nMaxChannels:8;
+  int32_t nWarps:8;
+  int32_t algorithm:8, protocol:8;
+  uint32_t isCollnet:1, isNvls:1;
+  uint32_t devFuncId:30;
+  enum ncclRegBufferType regBufType;
+  // number of elements in planner->ipcMemQueue associated with this collective
+  int nCleanupQueueElts;
+
+  void* sendMhandle;
+  void* recvMhandle;
+  // index for IPC record lookup
+  uintptr_t sendbuffOffset;
+  uintptr_t recvbuffOffset;
+  uintptr_t* sendbuffRmtAddrs;
+  uintptr_t* recvbuffRmtAddrs;
+
+  // Profiler plugin
+  int eActivationMask;
+  void* eventHandle;
+};
+struct ncclTaskP2p {
+  struct ncclTaskP2p* next;
+  ncclFunc_t func;
+  void* buff;
+  size_t count;
+  ncclDataType_t datatype;
+  int root;
+  size_t bytes;
+
+  // Profiler plugin
+  int eActivationMask;
+  void* eventHandle;
+};
+
 struct ncclKernelPlan {
   // A kernel plan is also a callback that reclaims itself. Hence this must
   // be the first member.
@@ -198,40 +252,12 @@ struct ncclKernelPlan {
   struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next> cleanupQueue;
   void* workBufPersistent;
 
+  struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> p2pTaskQueue;
+  struct ncclIntruQueue<struct ncclTaskColl, &ncclTaskColl::next> collTaskQueue;
   struct ncclIntruQueue<struct ncclProxyOp, &ncclProxyOp::enqNext> proxyOpQueue;
-};
 
-////////////////////////////////////////////////////////////////////////////////
-
-struct ncclTaskColl {
-  struct ncclTaskColl* next;
-  ncclFunc_t func;
-  void const* sendbuff;
-  void* recvbuff;
-  size_t count;
-  int root;
-  ncclDataType_t datatype;
-  ncclRedOp_t opHost;
-  struct ncclDevRedOpFull opDev;
-  int chunkSteps, sliceSteps;
-  // Computed later:
-  size_t trafficBytes;
-  int32_t nMaxChannels:8;
-  int32_t nWarps:8;
-  int32_t algorithm:8, protocol:8;
-  uint32_t isCollnet:1, isNvls:1;
-  uint32_t devFuncId:30;
-  enum ncclRegBufferType regBufType;
-  // number of elements in planner->ipcMemQueue associated with this collective
-  int nCleanupQueueElts;
-
-  void* sendMhandle;
-  void* recvMhandle;
-};
-struct ncclTaskP2p {
-  struct ncclTaskP2p* next;
-  void* buff;
-  size_t bytes;
+  // Profiler plugin
+  void* groupEventHandle;
 };
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -383,6 +409,8 @@ struct ncclComm {
   struct ncclChannel channels[MAXCHANNELS];
   struct ncclPeerInfo* peerInfo;
   struct ncclTopoSystem* topo;
+  struct ncclProxyConnector* gproxyConn;
+  struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next> legacyRegCleanupQueue;
 
   int netPluginLoaded;
   ncclNet_t* ncclNet;
@@ -395,10 +423,12 @@ struct ncclComm {
   struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS];
   bool initAlgoChannels[NCCL_NUM_ALGORITHMS];
   bool runtimeConn; // if dynamic connection is supported
+  bool directMode;
   int cuMemSupport;
 
   uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
 
+  const char* commName;
   uint64_t commHash;
   int rank;    // my rank in the communicator
   int nRanks;  // number of GPUs in communicator
@@ -504,7 +534,7 @@ struct ncclComm {
   int collNetSupport;
   bool collNetRegSupport;
   uint8_t collNetSupportMatrix[4/*sum,prod,max,min*/][ncclNumTypes];
-  int intraHighestTransportType;
+  bool intraNodeP2pSupport;
   int* collNetHeads;
   int collNetHeadsNum;
   int* collNetDenseToUserRank;
@@ -519,6 +549,8 @@ struct ncclComm {
   struct ncclNvlsSharedRes* nvlsResources;
 
   // pools backed by comm->memPermanent
+  struct ncclMemoryPool memPool_ncclTaskColl;
+  struct ncclMemoryPool memPool_ncclTaskP2p;
   struct ncclMemoryPool memPool_ncclProxyOp;
   struct ncclMemoryPool memPool_ncclKernelPlan;
 
@@ -532,6 +564,13 @@ struct ncclComm {
 
   struct ncclKernelPlanner planner;
 
+  cudaMemPool_t memPool;
+  // Queue of events and associated callbacks for cleaning up asynchronous work.
+  // Using this is preferable to using CUDA host callbacks because host callbacks
+  // won't allow the work following the callback to run until the callback completes,
+  // which comes at expense to perf.
+  struct ncclIntruQueue<struct ncclCommEventCallback, &ncclCommEventCallback::next> eventCallbackQueue;
+
   // user-created reduction ops
   int userRedOpCapacity, userRedOpFreeHead;
   ncclUserRedOp *userRedOps;
@@ -553,6 +592,11 @@ struct ncclComm {
   int tunerPluginLoaded;
   ncclTuner_t* tuner;
   void *tunerContext;
+
+  // Profiler plugin
+  void* profilerContext;
+  uint64_t seqNumber[NCCL_NUM_FUNCTIONS];
+
   // buffer registration cache
   struct ncclRegCache regCache;
   uint64_t endMagic;
@@ -583,6 +627,27 @@ inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm, bool waitSome)
   return ncclSuccess;
 }
 
+inline ncclResult_t ncclCommPollEventCallbacks(struct ncclComm *comm) { // Drain comm->eventCallbackQueue from the head, running each callback after its event is waited on; returns the first failure.
+  ncclResult_t result = ncclSuccess;
+  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
+  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); // Exchange in relaxed capture mode; per the CUDA docs, mode then holds the thread's previous mode.
+  while (true) {
+    struct ncclCommEventCallback* cb = ncclIntruQueueHead(&comm->eventCallbackQueue);
+    if (cb == nullptr) break; // Queue is empty: nothing left to run.
+    cudaError_t ok = cudaEventSynchronize(cb->event);
+    if (ok == cudaErrorNotReady) break; // NOTE(review): cudaEventSynchronize blocks, so this early-out looks reachable only with cudaEventQuery -- confirm.
+    ncclIntruQueueDequeue(&comm->eventCallbackQueue); // Dequeue before invoking so a failing callback is not retried.
+    if (ok == cudaSuccess) {
+      NCCLCHECKGOTO(cb->fn(comm, cb), result, finish);
+    } else {
+      CUDACHECKGOTO(ok, result, finish);
+    }
+  }
+finish:
+  cudaThreadExchangeStreamCaptureMode(&mode); // Exchange back on success and failure alike (return value deliberately not checked here).
+  return result; // Propagate the recorded failure; this used to return ncclSuccess unconditionally.
+}
+
 inline void ncclCommIntraBarrierIn(struct ncclComm* comm, uint32_t x) {
   int phase = comm->intraBarrierPhase;
   if (comm->intraRanks == 1) {
diff --git a/src/include/cudawrap.h b/src/include/cudawrap.h
index fd7b0310e8..bf6132657f 100644
--- a/src/include/cudawrap.h
+++ b/src/include/cudawrap.h
@@ -13,6 +13,7 @@
 
 // Is cuMem API usage enabled
 extern int ncclCuMemEnable();
+extern int ncclCuMemHostEnable();
 
 #if CUDART_VERSION >= 11030
 #include <cudaTypedefs.h>
@@ -96,6 +97,7 @@ DECLARE_CUDA_PFN_EXTERN(cuMemRelease);
 DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle);
 DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess);
 DECLARE_CUDA_PFN_EXTERN(cuMemUnmap);
+DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationPropertiesFromHandle);
 #if CUDA_VERSION >= 11070
 DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange); // DMA-BUF support
 #endif
diff --git a/src/include/device.h b/src/include/device.h
index 76a909f7a8..153b5ae36c 100644
--- a/src/include/device.h
+++ b/src/include/device.h
@@ -128,6 +128,8 @@ struct ncclConnInfo {
 };
 
 struct ncclProxyConnector {
+  bool initialized;
+  int rank;
   int tpRank;
   int tpLocalRank;
   int sameProcess;
@@ -141,6 +143,8 @@ struct ncclConnector {
   struct ncclTransportComm* transportComm;
   void* transportResources;
   struct ncclConnInfo conn;
+  int sendMemSameProcess;
+  int recvMemSameProcess;
 };
 
 struct ncclRing {
@@ -225,6 +229,7 @@ struct alignas(16) ncclDevWorkP2p {
 
   uint8_t sendProtoLL:1, recvProtoLL:1;
   uint8_t sendRegistered:1, recvRegistered:1;
+  uint8_t sendIpcReg:1, recvIpcReg:1;
 };
 
 // Compute the subset of the data transfer corresponding to the given part index.
@@ -266,6 +271,10 @@ struct alignas(16) ncclDevWorkColl {
   uint32_t root;
   void* recvbuff;
   void* sendbuff;
+  uintptr_t sendbuffOffset;
+  uintptr_t recvbuffOffset;
+  uintptr_t* sendbuffRmtAddrs;
+  uintptr_t* recvbuffRmtAddrs;
   union {
     // Continuous-byte-distribution scheduling. The lo and hi channels are of
     // different size than the channels in the middle.
@@ -384,6 +393,7 @@ struct ncclDevComm {
   int nNodes;
   int buffSizes[NCCL_NUM_PROTOCOLS];
   int p2pChunkSize;
+  int isNvlink;
 
   // Work fifo return credits
   uint32_t* workConsumed/*[MAXCHANNELS]*/;
@@ -395,6 +405,7 @@ struct ncclDevComm {
 
   // Channels, device side
   struct ncclDevChannel* channels/*[MAXCHANNELS]*/;
+  int* rankToLocalRank;
 };
 
 struct alignas(16) ncclDevCommAndChannels {
@@ -539,11 +550,12 @@ inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto)
     if (coll == ncclFuncSendRecv) break;
     row += 1;
 
-    int nAlgos = 3;
+    int nAlgos = 4;
     if (coll == ncclFuncAllGather) {
       int algo1 = algo == NCCL_ALGO_RING ? 0 :
                   algo == NCCL_ALGO_COLLNET_DIRECT ? 1 :
-                /*algo == NCCL_ALGO_NVLS*/ 2;
+                  algo == NCCL_ALGO_NVLS ? 2 :
+                /*algo == NCCL_ALGO_PAT*/ 3;
       row += algo1*NCCL_NUM_PROTOCOLS + proto;
       break;
     }
@@ -556,7 +568,7 @@ inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto)
     }
     row += nAlgos*NCCL_NUM_PROTOCOLS;
 
-    nAlgos = NCCL_NUM_ALGORITHMS;
+    nAlgos = 6;
     if (coll == ncclFuncAllReduce) {
       row += ((devRedOp*NumTypes + type)*nAlgos + algo)*NCCL_NUM_PROTOCOLS + proto;
       break;
@@ -570,11 +582,12 @@ inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto)
     }
     row += ncclNumDevRedOps*NumTypes*nAlgos*NCCL_NUM_PROTOCOLS;
 
-    nAlgos = 3;
+    nAlgos = 4;
     if (coll == ncclFuncReduceScatter) {
       int algo1 = algo == NCCL_ALGO_RING ? 0 :
                   algo == NCCL_ALGO_COLLNET_DIRECT ? 1 :
-                /*algo == NCCL_ALGO_NVLS*/ 2;
+                  algo == NCCL_ALGO_NVLS ? 2 :
+                /*algo == NCCL_ALGO_PAT*/ 3;
       row += ((devRedOp*NumTypes + type)*nAlgos + algo1)*NCCL_NUM_PROTOCOLS + proto;
       break;
     }
diff --git a/src/include/graph.h b/src/include/graph.h
index 0271b52d12..b6d86b398e 100644
--- a/src/include/graph.h
+++ b/src/include/graph.h
@@ -33,13 +33,14 @@ ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm);
 
 // Query topology
 ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank);
-ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
+ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank);
 ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret);
 ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int64_t netId, int read, int* useGdr);
 ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush);
-ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net);
+ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net);
 int ncclPxnDisable(struct ncclComm* comm);
 ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
+ncclResult_t ncclGetLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu);
 
 // Find CPU affinity
 ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity);
@@ -76,7 +77,7 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
 #define NCCL_TOPO_PATTERN_COLLNET_DIRECT 6  // Collnet Direct
 struct ncclTopoGraph {
   // Input / output
-  int id; // ring : 0, tree : 1, collnet : 2
+  int id; // ring : 0, tree : 1, collnet : 2, nvls : 3, collnetDirect : 4
   int pattern;
   int crossNic;
   int collNet;
diff --git a/src/include/nccl_common.h b/src/include/nccl_common.h
index a0fb3a55fe..26851b17e3 100644
--- a/src/include/nccl_common.h
+++ b/src/include/nccl_common.h
@@ -50,7 +50,7 @@ typedef enum {
   ncclNumFuncs = 8
 } ncclFunc_t;
 
-#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
+#define NCCL_NUM_ALGORITHMS 7 // Tree, Ring, CollNet*, NVLS*, PAT
 #define NCCL_ALGO_UNDEF -1
 #define NCCL_ALGO_TREE 0
 #define NCCL_ALGO_RING 1
@@ -58,6 +58,7 @@ typedef enum {
 #define NCCL_ALGO_COLLNET_CHAIN 3
 #define NCCL_ALGO_NVLS 4
 #define NCCL_ALGO_NVLS_TREE 5
+#define NCCL_ALGO_PAT 6
 
 #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
 #define NCCL_PROTO_UNDEF -1
diff --git a/src/include/nvtx.h b/src/include/nvtx.h
index 3bdfec59d7..14b317fddd 100644
--- a/src/include/nvtx.h
+++ b/src/include/nvtx.h
@@ -16,20 +16,23 @@
 #endif
 
 // Define all NCCL-provided static schema IDs here (avoid duplicates).
-#define NVTX_SID_CommInitRank  0
-#define NVTX_SID_CommInitAll   1
-#define NVTX_SID_CommDestroy   2 // same schema as NVTX_SID_CommInitRank
-#define NVTX_SID_CommAbort     3 // same schema as NVTX_SID_CommInitRank
-#define NVTX_SID_AllGather     4
-#define NVTX_SID_AllReduce     5
-#define NVTX_SID_Broadcast     6
-#define NVTX_SID_ReduceScatter 7
-#define NVTX_SID_Reduce        8
-#define NVTX_SID_Send          9
-#define NVTX_SID_Recv          10
+#define NVTX_SID_CommInitRank         0
+#define NVTX_SID_CommInitAll          1
+#define NVTX_SID_CommDestroy          2 // same schema as NVTX_SID_CommInitRank
+#define NVTX_SID_CommAbort            3 // same schema as NVTX_SID_CommInitRank
+#define NVTX_SID_AllGather            4
+#define NVTX_SID_AllReduce            5
+#define NVTX_SID_Broadcast            6
+#define NVTX_SID_ReduceScatter        7
+#define NVTX_SID_Reduce               8
+#define NVTX_SID_Send                 9
+#define NVTX_SID_Recv                 10
+#define NVTX_SID_CommInitRankConfig   11 // same schema as NVTX_SID_CommInitRank
+#define NVTX_SID_CommInitRankScalable 12 // same schema as NVTX_SID_CommInitRank
+#define NVTX_SID_CommSplit            13
 
 // Define static schema ID for the reduction operation.
-#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 11 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START
+#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP (14 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START) // first ID after the NVTX_SID_* values above; parenthesized for safe expansion
 
 extern const nvtxDomainHandle_t ncclNvtxDomainHandle;
 
diff --git a/src/include/p2p.h b/src/include/p2p.h
index 5c73a6cf17..e49c45dfe7 100644
--- a/src/include/p2p.h
+++ b/src/include/p2p.h
@@ -34,11 +34,36 @@ typedef union {
   // Legacy CUDA IPC
   cudaIpcMemHandle_t devIpc;
   // cuMem API support
-  ncclCuDesc cuDesc;
+  struct {
+    ncclCuDesc cuDesc;
+    CUmemGenericAllocationHandle memHandle;
+  };
 } ncclIpcDesc;
 
-ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr);
+enum ncclIpcRegType {
+  NCCL_IPC_SENDRECV = 0,
+  NCCL_IPC_COLLECTIVE = 1
+};
+
+struct ncclIpcImpInfo {
+  void* rmtRegAddr;
+  bool legacyIpcCap;
+  uintptr_t offset;
+};
+
+struct ncclIpcRegInfo {
+  int peerRank;
+  void* baseAddr;
+  struct ncclProxyConnector* ipcProxyconn;
+  struct ncclIpcImpInfo impInfo;
+};
+
+ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, int directMap, ncclIpcDesc *ipcDesc, void **ptr);
 ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc);
-ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr);
+ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int peer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr);
+ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut);
+ncclResult_t ncclIpcGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut, void* cleanupQueuePtr, int* nCleanupQueueElts);
+
+ncclResult_t ncclIpcDeregBuffer(struct ncclComm* comm, struct ncclIpcRegInfo* regInfo);
 
 #endif
diff --git a/src/include/profiler.h b/src/include/profiler.h
index 103af99adf..36774dc848 100644
--- a/src/include/profiler.h
+++ b/src/include/profiler.h
@@ -4,34 +4,52 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#ifndef NCCL_PROFILER_H_
-#define NCCL_PROFILER_H_
+#ifndef PROFILER_H_
+#define PROFILER_H_
 
-#include "proxy.h"
+#include <cuda_runtime.h>
+#include "nccl_profiler.h"
 
-enum ncclProxyProfileState {
-  ncclProxyProfileBegin = 0,
+struct ncclProxyArgs;
+struct ncclKernelPlan;
+struct ncclTaskColl;
+struct ncclTaskP2p;
+struct ncclInfo;
+struct ncclComm;
+struct ncclProxyOp;
 
-  ncclProxyProfileSendGPUWait = 1,
-  ncclProxyProfileSendWait = 2,
+// Plugin Init/Finalize Wrappers
+ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm);
+ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm);
 
-  ncclProxyProfileRecvWait = 1,
-  ncclProxyProfileRecvFlushWait = 2,
-  ncclProxyProfileRecvGPUWait = 3,
+// Profiler Start/Stop Group Wrappers
+ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan);
+ncclResult_t ncclProfilerStopGroupEvent(struct ncclKernelPlan* plan);
 
-  ncclProxyProfileEnd = 4,
+// Profiler Start/Stop Task Events Wrappers
+ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan);
+ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan);
 
-  ncclProxyProfileSleep = 8,
-  ncclProxyProfileWakeup = 9,
+// Proxy Op Start/Stop Event Wrappers
+ncclResult_t ncclProfilerStartSendProxyOpEvent(int sub, struct ncclProxyArgs* args);
+ncclResult_t ncclProfilerStartRecvProxyOpEvent(int sub, struct ncclProxyArgs* args);
+ncclResult_t ncclProfilerStopProxyOpEvent(int sub, struct ncclProxyArgs* args);
 
-  ncclProxyProfileIdle = 16,
-  ncclProxyProfileActive = 17,
+// Proxy Step Start/Stop Event Wrappers
+ncclResult_t ncclProfilerStartSendProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi);
+ncclResult_t ncclProfilerStartRecvProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi);
+ncclResult_t ncclProfilerStopProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi);
 
-  ncclProxyProfileAppend = 24,
-  ncclProxyProfileAppendEnd = 25
-};
+// Proxy Control Start/Stop Events Wrappers
+ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle);
+ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle);
 
-ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state);
-void ncclProfilingDump();
+// Record Event Wrappers
+ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState);
+ncclResult_t ncclProfilerRecordProxyStepEventStates(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi, ncclProfilerEventState_t eState);
+ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, ncclProfilerEventState_t eState);
+
+// Profiler utility functions
+ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op);
 
 #endif
diff --git a/src/include/proxy.h b/src/include/proxy.h
index eab6930fe4..a1c44d6b1f 100644
--- a/src/include/proxy.h
+++ b/src/include/proxy.h
@@ -13,7 +13,7 @@
 #include "ipcsocket.h"
 #include "nccl_net.h"
 #include <pthread.h>
-#include "shm.h"
+#include "shmutils.h"
 #include "p2p.h"
 
 typedef enum : uint8_t {
@@ -28,6 +28,8 @@ typedef enum : uint8_t {
   ncclPatternCollnetDirect,
   ncclPatternNvls,
   ncclPatternNvlsTree,
+  ncclPatternPatUp,
+  ncclPatternPatDown,
   ncclPatternSend,
   ncclPatternRecv
 } ncclPattern_t;
@@ -72,6 +74,19 @@ struct ncclProxyOp {
 
   union ncclProxyOpSpecifics specifics;
 
+  // Profiler plugin
+  union {
+    struct ncclTaskColl* coll;
+    struct ncclTaskP2p* p2p;
+  } task;
+
+  int eActivationMask;
+  void* taskEventHandle;
+  int rank;
+  int peer;
+  pid_t pid;
+  void* profilerContext;
+
   struct ncclProxyOp *enqNext;
 };
 
@@ -100,7 +115,15 @@ struct ncclProxySubArgs {
   uint64_t done;
   uint64_t end;
   void* requests[NCCL_STEPS];
-  void* profilingEvents[NCCL_STEPS];
+
+  // Profiler plugin
+  int eActivationMask;
+  int rank;
+  void* taskEventHandle;
+  void* opEventHandle;
+  void* stepEventHandles[NCCL_STEPS];
+  size_t transSize;
+
   void* recvRequestsCache[NCCL_STEPS];
   int recvRequestsSubCount;
 };
@@ -129,6 +152,10 @@ struct ncclProxyArgs {
 
   int idle;
 
+  // Profiler plugin
+  pid_t pid;
+  void* profilerContext;
+
   // Element linking
   struct ncclProxyArgs* next;
   struct ncclProxyArgs* nextPeer;
@@ -261,6 +288,7 @@ struct ncclProxyState {
   ncclNet_t* ncclNet;
   ncclCollNet_t* ncclCollNet;
   uint32_t* abortFlag;
+  bool directMode;
   // Service threads
   pthread_t thread;
   pthread_t threadUDS;
@@ -281,6 +309,9 @@ struct ncclProxyState {
   // Progress thread
   struct ncclProxyProgressState progressState;
 
+  // Profiler plugin
+  void* profilerContext;
+
   // Queue of expected responses from the proxy
   struct ncclExpectedProxyResponse* expectedResponses;
 };
@@ -332,8 +363,9 @@ enum ncclProxyMsgType {
   ncclProxyMsgAbort = 7,
   ncclProxyMsgStop = 8,
   ncclProxyMsgGetFd = 9, // cuMem API support (UDS)
-  ncclProxyMsgRegister = 10,
-  ncclProxyMsgDeregister = 11
+  ncclProxyMsgQueryFd = 10,
+  ncclProxyMsgRegister = 11,
+  ncclProxyMsgDeregister = 12
 };
 
 // This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types
@@ -347,6 +379,7 @@ ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnec
 
 // UDS support
 ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int rank, void *handle, int* convertedFd);
+ncclResult_t ncclProxyClientQueryFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int localFd, int* rmtFd);
 
 ncclResult_t ncclProxyStop(struct ncclComm* comm);
 ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm);
diff --git a/src/include/register.h b/src/include/register.h
index 9f7c83faaf..7c60535d9a 100644
--- a/src/include/register.h
+++ b/src/include/register.h
@@ -11,7 +11,13 @@ enum {
   NVLS_REG_COMPLETE = 0x02,
   NVLS_REG_POSSIBLE = 0x04,
   NVLS_REG_NO_SUPPORT = 0x08,
-  COLLNET_REG_COMPLETE = 0x10
+  COLLNET_REG_COMPLETE = 0x10,
+  IPC_REG_COMPLETE = 0x20
+};
+
+struct ncclPeerRegIpcAddr {
+  uintptr_t* devPeerRmtAddrs;
+  uintptr_t* hostPeerRmtAddrs;
 };
 
 struct ncclReg {
@@ -34,7 +40,10 @@ struct ncclReg {
   uintptr_t caddrs[NCCL_MAX_LOCAL_RANKS]; /* use to check if NVLS buffers match among intra-node ranks */
   // collnet reg
   void* collnetHandle;
-  struct ncclProxyConnector* proxyconn;
+  struct ncclProxyConnector* collnetProxyconn;
+  // general ipc reg
+  struct ncclPeerRegIpcAddr regIpcAddrs;
+  struct ncclIpcRegInfo* ipcInfos[NCCL_MAX_LOCAL_RANKS];
 };
 
 struct ncclRegCache {
diff --git a/src/include/shm.h b/src/include/shm.h
index 1db16662d5..b519e5dc92 100644
--- a/src/include/shm.h
+++ b/src/include/shm.h
@@ -1,26 +1,37 @@
-/*************************************************************************
- * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
 #ifndef NCCL_SHM_H_
 #define NCCL_SHM_H_
 
-#include "nccl.h"
+#include "comm.h"
 
-typedef void* ncclShmHandle_t;
-ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle);
-ncclResult_t ncclShmClose(ncclShmHandle_t handle);
-ncclResult_t ncclShmUnlink(ncclShmHandle_t handle);
-
-struct ncclShmemCollBuff {
-  volatile size_t *cnt[2];
-  volatile void *ptr[2];
-  int round;
-  size_t maxTypeSize;
+struct shmLegacyIpc {
+  char shmSuffix[7];
+  ncclShmHandle_t handle;
+  size_t shmSize;
 };
 
-ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize);
+struct shmCuIpc {
+  union {
+    CUmemFabricHandle handle;
+    CUmemGenericAllocationHandle data;
+  };
+  int tpProxyRank;
+  void *ptr;
+  size_t size;
+};
+
+struct shmIpcDesc {
+  union
+  {
+    struct shmLegacyIpc shmli;
+    struct shmCuIpc shmci;
+  };
+  bool legacy;
+};
+
+typedef struct shmIpcDesc ncclShmIpcDesc_t;
+
+ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *descOut, void **hptr, void **dptr);
+ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut);
+ncclResult_t ncclShmIpcClose(ncclShmIpcDesc_t *desc);
 
 #endif
diff --git a/src/include/timer.h b/src/include/timer.h
index 284fec6e05..e8b0ba38ed 100644
--- a/src/include/timer.h
+++ b/src/include/timer.h
@@ -33,15 +33,15 @@ static double startTimes[8];
 #define TIME_START(index) do { \
   counts[index]++; \
   startTimes[index] = gettime(); \
-} while (0);
+} while (0)
 
 #define TIME_STOP(index) do { \
   times[index] += gettime() - startTimes[index]; \
-} while (0);
+} while (0)
 
 #define TIME_CANCEL(index) do { \
   counts[index]--; \
-} while (0);
+} while (0)
 
 #define TIME_PRINT(name) do { \
   printf("%s stats", name); \
@@ -50,11 +50,11 @@ static double startTimes[8];
     counts[i] = 0; \
   } \
   printf("\n"); \
-} while (0);
+} while (0)
 #else
-#define TIME_START(index) while(0);
-#define TIME_STOP(index) while(0);
-#define TIME_CANCEL(index) while(0);
+#define TIME_START(index) do {} while(0)
+#define TIME_STOP(index) do {} while(0)
+#define TIME_CANCEL(index) do {} while(0)
 #define TIME_PRINT(name)
 #endif
 #endif
diff --git a/src/include/transport.h b/src/include/transport.h
index 07fbb3ec43..cbeb613ca5 100644
--- a/src/include/transport.h
+++ b/src/include/transport.h
@@ -48,9 +48,10 @@ struct ncclPeerInfo {
   // MNNVL support
   nvmlGpuFabricInfoV_t fabricInfo;
   int cuMemSupport;
+  int version;
 };
 
-#define CONNECT_SIZE 128
+#define CONNECT_SIZE 256
 struct ncclConnect {
   char data[CONNECT_SIZE];
 };
@@ -91,7 +92,6 @@ struct ncclCollNetSharedRes {
   void* resources;
   int nChannels;
   size_t buffSize;
-  int intraHighestTransportType;
 };
 
 struct ncclTransportComm {
@@ -109,13 +109,14 @@ struct ncclTransportComm {
 
 struct ncclTransport {
   const char name[8];
-  ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*);
+  ncclResult_t (*canConnect)(int*, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*);
   struct ncclTransportComm send;
   struct ncclTransportComm recv;
 };
 
 ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
 ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL);
+ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2pSupport, bool* directMode);
 
 ncclResult_t ncclNvlsInit(struct ncclComm* comm);
 ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent);
@@ -127,7 +128,7 @@ ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdevi
 ncclResult_t ncclNvlsFree(struct ncclComm* comm);
 
 enum { collNetRecv=0, collNetSend=1 };
-int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect);
+bool ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect);
 ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail);
 ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm);
 ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufUsed, void** outHandle);
@@ -136,6 +137,7 @@ ncclResult_t ncclCollnetDeregBuffer(struct ncclComm* comm, struct ncclProxyConne
 
 ncclResult_t ncclTransportRingConnect(struct ncclComm* comm);
 ncclResult_t ncclTransportTreeConnect(struct ncclComm* comm);
+ncclResult_t ncclTransportPatConnect(struct ncclComm* comm);
 
 ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTopoGraph* graphs[]);
 ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm);
diff --git a/src/include/utils.h b/src/include/utils.h
index abecf2257a..5a1b749a76 100644
--- a/src/include/utils.h
+++ b/src/include/utils.h
@@ -27,7 +27,6 @@ ncclResult_t busIdToInt64(const char* busId, int64_t* id);
 ncclResult_t getBusId(int cudaDev, int64_t *busId);
 
 ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
-uint64_t getHash(const char* string, int n);
 uint64_t getHostHash();
 uint64_t getPidHash();
 ncclResult_t getRandomData(void* buffer, size_t bytes);
diff --git a/src/init.cc b/src/init.cc
index 16e02d49c6..94c2fb10ee 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -37,7 +37,7 @@
 #endif
 
 const char* ncclFuncStr[NCCL_NUM_FUNCTIONS] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
-const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain", "NVLS", "NVLSTree" };
+const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain", "NVLS", "NVLSTree", "PAT" };
 const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
 
 NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
@@ -101,9 +101,15 @@ NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
 ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
   NCCLCHECK(ncclInit());
   NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
-  ncclResult_t res = bootstrapGetUniqueId((struct ncclBootstrapHandle*)out);
+  struct ncclBootstrapHandle handle;
+  NCCLCHECK(bootstrapGetUniqueId(&handle));
+  // ncclUniqueId and bootstrapHandle don't have the same size and alignment
+  // reset to 0 to avoid undefined data
+  memset(out, 0, sizeof(*out));
+  // copy to avoid alignment mismatch
+  memcpy(out, &handle, sizeof(handle));
   TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(*out));
-  return res;
+  return ncclSuccess;
 }
 
 // Prevent compiler from optimizing out these operations
@@ -147,7 +153,7 @@ void ncclCommPushCudaFree(struct ncclComm* comm, void* obj) {
 }
 
 static ncclResult_t ncclDestructorFnCudaHostFree(struct ncclDestructor* dtor) {
-  CUDACHECK(cudaFreeHost(dtor->obj));
+  NCCLCHECK(ncclCudaHostFree(dtor->obj));
   return ncclSuccess;
 }
 void ncclCommPushCudaHostFree(struct ncclComm* comm, void* obj) {
@@ -180,13 +186,15 @@ static ncclResult_t commFree(ncclComm_t comm) {
    * free all intra-process communicators; therefore, we only need to focus on local
    * resource cleanup in commFree(). */
   if (comm->proxyState && comm->proxyRefCountOld == 0 && comm->proxyState->thread) {
-    pthread_join(comm->proxyState->thread, nullptr);
+    PTHREADCHECK(pthread_join(comm->proxyState->thread, nullptr), "pthread_join");
     if (comm->proxyState->threadUDS) {
       // UDS support
-      pthread_join(comm->proxyState->threadUDS, nullptr);;
+      PTHREADCHECK(pthread_join(comm->proxyState->threadUDS, nullptr), "pthread_join");
     }
   }
 
+  CUDACHECK(cudaMemPoolDestroy(comm->memPool));
+
   delete[] comm->userRedOps;
 
   free(comm->connectSend);
@@ -244,12 +252,14 @@ static ncclResult_t commFree(ncclComm_t comm) {
 
   free(comm->topParentRanks);
   free(comm->topParentLocalRanks);
+  free(comm->gproxyConn);
 
   NCCLCHECK(ncclRegCleanup(comm));
 
   INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - %s COMPLETE", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId, abort ? "Abort" : "Destroy");
 
   commPoison(comm); // poison comm before free to avoid comm reuse.
+  NCCLCHECK(ncclProfilerPluginFinalize(comm));
   NCCLCHECK(ncclNetFinalize(comm));
   NCCLCHECK(ncclNetPluginUnload(comm));
   free(comm);
@@ -328,6 +338,7 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
 
   NCCLCHECK(ncclNetPluginLoad(comm));
   NCCLCHECK(ncclNetInit(comm));
+  NCCLCHECK(ncclProfilerPluginInit(comm));
   INFO(NCCL_INIT, "Using network %s", comm->ncclNet->name);
 
   if (parent && parent->config.splitShare) {
@@ -393,8 +404,28 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
   }
 
   ncclIntruQueueMpscConstruct(&comm->callbackQueue);
+  ncclIntruQueueConstruct(&comm->legacyRegCleanupQueue);
 
   comm->regCache.pageSize = sysconf(_SC_PAGESIZE);
+
+  do {
+    cudaMemPoolProps props = {};
+    props.allocType = cudaMemAllocationTypePinned;
+    props.handleTypes = cudaMemHandleTypeNone;
+    props.location.type = cudaMemLocationTypeDevice;
+    props.location.id = comm->cudaDev;
+    CUDACHECK(cudaMemPoolCreate(&comm->memPool, &props));
+    uint64_t releaseThreshold = ~uint64_t(0);
+    CUDACHECK(cudaMemPoolSetAttribute(comm->memPool, cudaMemPoolAttrReleaseThreshold, &releaseThreshold));
+  } while (0);
+
+  ncclIntruQueueConstruct(&comm->eventCallbackQueue);
+
+  // Set intraComm0, intraRank and intraRanks to default values to ensure proper cleanup of the communicator
+  comm->intraComm0 = comm;
+  comm->intraRank = 0;
+  comm->intraRanks = 1;
+
   return ncclSuccess;
 }
 
@@ -408,12 +439,16 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
   NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), ret, fail);
   NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail);
   ncclCommPushCudaFree(comm, devCommAndChans);
+  NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.rankToLocalRank, comm->nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail);
+  ncclCommPushCudaFree(comm, tmpCommAndChans.comm.rankToLocalRank);
+  NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.rankToLocalRank, comm->rankToLocalRank, comm->nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail);
   comm->devComm = &devCommAndChans->comm;
   tmpCommAndChans.comm.rank = comm->rank;
   tmpCommAndChans.comm.nRanks = nRanks;
   tmpCommAndChans.comm.node = comm->node;
   tmpCommAndChans.comm.nNodes = comm->nNodes;
   tmpCommAndChans.comm.abortFlag = comm->abortFlagDev;
+  tmpCommAndChans.comm.isNvlink = ncclTopoPathAllNVLink(comm->topo);
   for (int p=0; p < NCCL_NUM_PROTOCOLS; p++) {
     tmpCommAndChans.comm.buffSizes[p] = comm->buffSizes[p];
   }
@@ -498,10 +533,13 @@ static void showVersion() {
   }
 }
 
+NCCL_PARAM(MNNVLCliqueId, "MNNVL_CLIQUE_ID", -1);
+
 static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) {
   info->rank = comm->rank;
   info->cudaDev = comm->cudaDev;
   info->nvmlDev = comm->nvmlDev;
+  NCCLCHECK(ncclGetVersion(&info->version));
   info->hostHash=getHostHash()+commHash;
   info->pidHash=getPidHash()+commHash;
   info->cuMemSupport = ncclCuMemEnable();
@@ -534,6 +572,7 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
            ((long *)&info->fabricInfo.clusterUuid)[0], ((long *)&info->fabricInfo.clusterUuid)[1],
            info->fabricInfo.cliqueId, info->fabricInfo.state, info->fabricInfo.healthMask);
     }
+    if (ncclParamMNNVLCliqueId() != -1) info->fabricInfo.cliqueId = ncclParamMNNVLCliqueId();
   }
 
   return ncclSuccess;
@@ -677,7 +716,8 @@ static int checkMNNVL(struct ncclComm* comm) {
 #define TIMER_INIT_TOPO 4
 #define TIMER_INIT_GRAPHS 5
 #define TIMER_INIT_CONNECT 6
-#define TIMERS_INIT_COUNT 7
+#define TIMER_INIT_ALLOC 7
+#define TIMERS_INIT_COUNT 8
 
 static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent, uint64_t timers[TIMERS_INIT_COUNT]) {
   // We use 2 AllGathers
@@ -693,7 +733,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
   struct ncclTopoGraph* collNetChainGraph = &comm->graphs[NCCL_ALGO_COLLNET_CHAIN];
   struct ncclTopoGraph* collNetDirectGraph = &comm->graphs[NCCL_ALGO_COLLNET_DIRECT];
   struct ncclTopoGraph* nvlsGraph = &comm->graphs[NCCL_ALGO_NVLS];
-  struct ncclTopoGraph* graphs[] = { treeGraph, ringGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph, nvlsGraph };
+  struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph, nvlsGraph, treeGraph };
 
   struct graphInfo {
     int pattern;
@@ -722,7 +762,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
   struct ncclProxyConnector proxyConn;
   int* pxnPeers = NULL;
   int *topParentLocalRanks = NULL;
-  int tpProxyRank;
 
   timers[TIMER_INIT_ALLGATHER] = clockNano();
   // AllGather1 - begin
@@ -732,6 +771,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
 
   comm->cuMemSupport = 1;
   for (int i = 0; i < nranks; i++) {
+    if (comm->peerInfo[i].version != comm->peerInfo[rank].version) {
+      WARN("Mismatched NCCL version detected : rank %d version %d rank %d version %d",
+           i, comm->peerInfo[i].version, rank, comm->peerInfo[rank].version);
+      ret = ncclInvalidUsage;
+      goto fail;
+    }
     if (comm->peerInfo[i].hostHash != comm->peerInfo[rank].hostHash) nNodes++;
     if (!comm->peerInfo[i].cuMemSupport) comm->cuMemSupport = 0;
     if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
@@ -869,7 +914,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
   collNetChainGraph->maxChannels = ringGraph->nChannels;
 
   memset(collNetDirectGraph, 0, sizeof(struct ncclTopoGraph));
-  collNetDirectGraph->id = 2;
+  collNetDirectGraph->id = 4;
   collNetDirectGraph->pattern = NCCL_TOPO_PATTERN_COLLNET_DIRECT;
   collNetDirectGraph->collNet = 1;
   collNetDirectGraph->minChannels = 1;
@@ -1031,18 +1076,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
       INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
       comm->collNetSupport = 0;
     }
-    comm->collNetRegSupport = true;
-    for (int n=0; n<comm->nNodes; n++) {
-      if (comm->nodeRanks[n].localRanks > NCCL_MAX_DIRECT_ARITY+1) {
-        WARN("CollNet currently only supports up to %d GPUs per node, disabling CollNet", NCCL_MAX_DIRECT_ARITY+1);
-        comm->collNetSupport = 0;
-        break;
-      }
-      if (comm->nodeRanks[n].localRanks > 1) {
-        // As long as there is more than 1 rank on any node, we need to disable collnet reg
-        comm->collNetRegSupport = false;
-      }
-    }
+    // As long as there is more than 1 rank on any node, we need to disable collnet reg
+    comm->collNetRegSupport = (comm->maxLocalRanks == 1);
   }
 
   NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
@@ -1085,6 +1120,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
   }
   comm->topParentLocalRanks = topParentLocalRanks;
 
+  NCCLCHECKGOTO(ncclTransportCheckP2pType(comm, &comm->intraNodeP2pSupport, &comm->directMode), ret, fail);
   // Launch proxy service thread, after this, the proxy calls can be used.
   if (parent && parent->config.splitShare) {
     comm->proxyState = parent->sharedRes->proxyState;
@@ -1092,7 +1128,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
   } else {
     NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);
   }
-  
+  NCCLCHECKGOTO(ncclCalloc(&comm->gproxyConn, comm->nRanks), ret, fail);
+
   timers[TIMER_INIT_CONNECT] = clockNano();
   do { // Build p2p schedule
     int node = comm->node;
@@ -1168,6 +1205,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
     // Connect Trees
     NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail);
 
+    // Connect PAT only for communicators with 1 GPU per node
+    if (comm->maxLocalRanks == 1) NCCLCHECKGOTO(ncclTransportPatConnect(comm), ret, fail);
+
     // Setup NVLS
     NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
     NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail);
@@ -1179,12 +1219,13 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
     if (comm->collNetSupport > 0) {
       ncclCollNetSetup(comm, parent, graphs);
       NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail);
-      NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail);
+      if (comm->maxLocalRanks <= NCCL_MAX_DIRECT_ARITY+1) {
+        NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail);
+      }
     }
 
     // Connect to local net proxy
-    tpProxyRank = comm->topParentRanks[comm->rank];
-    NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail);
+    NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn), ret, fail);
     NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
 
     // Then to remote ones when using PXN
@@ -1192,8 +1233,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
       int nranks;
       NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail);
       for (int r=0; r<nranks; r++) {
-        tpProxyRank = comm->topParentRanks[pxnPeers[r]];
-        NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail);
+        NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn), ret, fail);
         NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
       }
     }
@@ -1286,17 +1326,20 @@ NCCL_PARAM(MaxCTAs, "MAX_CTAS", NCCL_CONFIG_UNDEF_INT);
 NCCL_PARAM(MinCTAs, "MIN_CTAS", NCCL_CONFIG_UNDEF_INT);
 #define NCCL_MAX_CGA_CLUSTER_SIZE 8
 
+#define NCCL_COMMINIT_FUNCNAME_LEN 128
 struct ncclCommInitRankAsyncJob {
   struct ncclAsyncJob base;
   struct ncclComm* comm;
   struct ncclComm** newcomm;
   int cudaDev;
   // For ncclCommInitRank
-  int nranks, myrank;
-  ncclUniqueId commId;
+  int nranks, myrank, nId; // nId: number of unique IDs stored in commId
+  ncclUniqueId* commId; // array of nId IDs; heap copy made by ncclCommInitRankDev, released by ncclCommInitJobFree
   // for ncclCommSplit
   struct ncclComm* parent;
   int color, key;
+  // name of the calling API function; printed in the init INFO log lines
+  char funcName[NCCL_COMMINIT_FUNCNAME_LEN];
 };
 
 struct ncclCommFinalizeAsyncJob {
@@ -1306,30 +1349,31 @@ struct ncclCommFinalizeAsyncJob {
 
 NCCL_PARAM(CommSplitShareResources, "COMM_SPLIT_SHARE_RESOURCES", NCCL_CONFIG_UNDEF_INT);
 
+typedef struct{ // per-rank (key, color) pair; commGetSplitInfo gathers one per parent rank in a single bootstrapAllGather
+  int key; // ranks sharing a color are ordered by ascending key in the new communicator
+  int color; // only parent ranks whose color equals the caller's are placed in the caller's new communicator
+} commSplitInfo;
 static ncclResult_t commGetSplitInfo(struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* nRanksRet, int* myRankRet, int* parentRanksRet) {
-  int* colors = NULL;
-  int* keys = NULL;
   int nRanks = 0, myRank = 0;
   ncclResult_t ret = ncclSuccess;
 
-  NCCLCHECKGOTO(ncclCalloc(&colors, parent->nRanks), ret, fail);
-  NCCLCHECKGOTO(ncclCalloc(&keys, parent->nRanks), ret, fail);
+  commSplitInfo* info = NULL;
+  NCCLCHECKGOTO(ncclCalloc(&info, parent->nRanks), ret, fail);
 
   // Compute nRanks, my rank and the ranks (of the original comm) before and after me
-  colors[parent->rank] = color;
-  keys[parent->rank] = key;
-  NCCLCHECKGOTO(bootstrapAllGather(parent->bootstrap, colors, sizeof(int)), ret, fail);
-  NCCLCHECKGOTO(bootstrapAllGather(parent->bootstrap, keys, sizeof(int)), ret, fail);
+  info[parent->rank].color = color;
+  info[parent->rank].key = key;
+  NCCLCHECKGOTO(bootstrapAllGather(parent->bootstrap, info, sizeof(commSplitInfo)), ret, fail);
 
   // Negative color does not create a new comm. Return now.
   if (color == NCCL_SPLIT_NOCOLOR) goto exit;
 
   memset(parentRanksRet, 0xff, sizeof(int) * parent->nRanks);
   for (int i = 0; i < parent->nRanks; i++) {
-    if (colors[i] != color) continue;
+    if (info[i].color != color) continue;
     // Find where to insert this rank
     int insert = 0;
-    while (insert < nRanks && keys[parentRanksRet[insert]] <= keys[i]) insert++;
+    while (insert < nRanks && info[parentRanksRet[insert]].key <= info[i].key) insert++;
     // Shift ranks by one after insert
     for (int r = nRanks; r > insert; r--) parentRanksRet[r] = parentRanksRet[r - 1];
     // Insert our rank
@@ -1345,8 +1389,7 @@ static ncclResult_t commGetSplitInfo(struct ncclComm* comm, struct ncclComm* par
   *myRankRet = myRank;
 
 exit:
-  free(colors);
-  free(keys);
+  free(info);
   return ret;
 fail:
   goto exit;
@@ -1361,7 +1404,9 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
   int cudaDev = job->cudaDev;
   int* parentRanks = NULL;
   int cudaArch;
-  uint64_t timers[TIMERS_INIT_COUNT];
+  double sum_timers = 0;
+  uint64_t timers[TIMERS_INIT_COUNT] = {0};
+  unsigned long long commIdHash;
 
   timers[TIMER_INIT_TOTAL] = clockNano();
   CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail);
@@ -1379,34 +1424,42 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
   }
   timers[TIMER_INIT_KERNELS] = clockNano() - timers[TIMER_INIT_KERNELS];
 
-  timers[TIMER_INIT_BOOTSTRAP] = clockNano();
   if (job->parent) {
     NCCLCHECKGOTO(ncclCalloc(&parentRanks, job->parent->nRanks), res, fail);
     NCCLCHECKGOTO(commGetSplitInfo(comm, job->parent, job->color, job->key, &job->nranks, &job->myrank, parentRanks), res, fail);
     // Negative color does not create a new comm object. We needed to take part in the allgather, but we're done now.
     if (job->color == NCCL_SPLIT_NOCOLOR) goto exit;
-    snprintf((char*)&job->commId, sizeof(job->commId), "%016lx-%d", job->parent->commHash, job->color);
+    timers[TIMER_INIT_ALLOC] = clockNano();
     NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail);
-    NCCLCHECKGOTO(bootstrapSplit((struct ncclBootstrapHandle*)&job->commId, comm, job->parent, job->color, job->key, parentRanks), res, fail);
+    timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC];
+    // obtain a unique hash for the comm by hashing the parent's commHash (64-bit value, printed as 16 hex digits) followed by the color
+    ncclUniqueId tmpId;
+    memset(&tmpId,0,sizeof(ncclUniqueId));// must set 0 here to avoid undefined bits
+    snprintf((char*)&tmpId, NCCL_UNIQUE_ID_BYTES, "%016lx-%d", job->parent->commHash, job->color);
+    comm->commHash = getHash(tmpId.internal, NCCL_UNIQUE_ID_BYTES);
+    INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d- Init START", job->funcName,
+         comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key);
+    timers[TIMER_INIT_BOOTSTRAP] = clockNano();
+    NCCLCHECKGOTO(bootstrapSplit(comm->commHash, comm, job->parent, job->color, job->key, parentRanks), res, fail);
+    timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP];
+    // debug info, no commId was used
+    commIdHash = 0;
   } else {
+    timers[TIMER_INIT_ALLOC] = clockNano();
     NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail);
-    NCCLCHECKGOTO(bootstrapInit((struct ncclBootstrapHandle*)&job->commId, comm), res, fail);
+    timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC];
+    // obtain a unique hash using the first commId
+    comm->commHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES);
+    commIdHash = hashUniqueId(job->commId[0]);
+    INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", job->funcName,
+         comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash);
+    timers[TIMER_INIT_BOOTSTRAP] = clockNano();
+    NCCLCHECKGOTO(bootstrapInit(job->nId, (struct ncclBootstrapHandle*)job->commId, comm), res, fail);
+    timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP];
   }
-  timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP];
-
   comm->cudaArch = cudaArch;
-  comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES);
-
-  if (job->parent) {
-    INFO(NCCL_INIT,"ncclCommSplit comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d commId 0x%llx - Init START",
-    comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key, (unsigned long long)hashUniqueId(job->commId));
-  } else {
-    INFO(NCCL_INIT,"ncclCommInitRank comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START",
-    comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId));
-  }
 
   NCCLCHECKGOTO(initTransportsRank(comm, job->parent, timers), res, fail);
-
   NCCLCHECKGOTO(ncclTunerPluginLoad(comm), res, fail);
   if (comm->tuner) {
     NCCLCHECK(comm->tuner->init(comm->nRanks, comm->nNodes, ncclDebugLog, &comm->tunerContext));
@@ -1420,23 +1473,25 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
   if (job->parent) {
     /* unlink child abort flag. */
     __atomic_store_n(&job->parent->childAbortFlag, NULL, __ATOMIC_RELEASE);
-    TRACE_CALL("ncclCommSplit(%p, %d, %d, %p, %d, %d)",
-                job->parent, job->color, job->key, comm, comm->rank, comm->nRanks);
+    TRACE_CALL("ncclCommSplit(%p, %d, %d, %p, %d, %d)", job->parent, job->color, job->key, comm, comm->rank, comm->nRanks);
+    INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d - Init COMPLETE", job->funcName,
+         comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key);
   } else {
-    TRACE_CALL("ncclCommInitRank(%p, %d, 0x%llx, %d, %d)",
-                comm, comm->nRanks, (unsigned long long)hashUniqueId(job->commId), comm->rank, comm->cudaDev);
+    // the name for the replay tool is ncclCommInitRank for all the variations
+    TRACE_CALL("ncclCommInitRank(%p, %d, 0x%llx, %d, %d)", comm, comm->nRanks, commIdHash, comm->rank, comm->cudaDev);
+    INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init COMPLETE", job->funcName,
+         comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash);
   }
-
-  if (job->parent) {
-    INFO(NCCL_INIT,"ncclCommSplit comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d commId 0x%llx - Init COMPLETE",
-    comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key, (unsigned long long)hashUniqueId(job->commId));
-  } else {
-    INFO(NCCL_INIT,"ncclCommInitRank comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init COMPLETE",
-    comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId));
-  }
-  INFO(NCCL_INIT|NCCL_PROFILE,"Init timings: rank %d nranks %d total %.2f (kernels %.2f, bootstrap %.2f, allgathers %.2f, topo %.2f, graphs %.2f, connections %.2f, rest %.2f)", comm->rank, comm->nRanks, timers[TIMER_INIT_TOTAL]/1e9,
-    timers[TIMER_INIT_KERNELS]/1e9, timers[TIMER_INIT_BOOTSTRAP]/1e9, timers[TIMER_INIT_ALLGATHER]/1e9, timers[TIMER_INIT_TOPO]/1e9, timers[TIMER_INIT_GRAPHS]/1e9, timers[TIMER_INIT_CONNECT]/1e9,
-    (timers[TIMER_INIT_TOTAL]-timers[TIMER_INIT_KERNELS]-timers[TIMER_INIT_BOOTSTRAP]-timers[TIMER_INIT_ALLGATHER]-timers[TIMER_INIT_TOPO]-timers[TIMER_INIT_GRAPHS]-timers[TIMER_INIT_CONNECT])/1e9);
+  sum_timers = 0.0;
+  for (int it = 1; it < TIMERS_INIT_COUNT; ++it)
+    sum_timers += (timers[it] / 1e9);
+  INFO(NCCL_INIT | NCCL_PROFILE,
+       "Init timings - %s: rank %d nranks %d total %.2f (kernels %.2f, alloc %.2f, bootstrap %.2f, allgathers %.2f, topo %.2f, graphs %.2f, "
+       "connections %.2f, rest %.2f)",
+       job->funcName, comm->rank, comm->nRanks,
+       timers[TIMER_INIT_TOTAL] / 1e9, timers[TIMER_INIT_KERNELS] / 1e9, timers[TIMER_INIT_ALLOC] / 1e9,
+       timers[TIMER_INIT_BOOTSTRAP] / 1e9, timers[TIMER_INIT_ALLGATHER] / 1e9, timers[TIMER_INIT_TOPO] / 1e9,
+       timers[TIMER_INIT_GRAPHS] / 1e9, timers[TIMER_INIT_CONNECT] / 1e9, timers[TIMER_INIT_TOTAL] / 1e9 - sum_timers);
 exit:
   if (job->newcomm) {
     /* assign it to user pointer. */
@@ -1621,17 +1676,24 @@ fail:
   goto exit;
 }
 
-static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev, ncclConfig_t *config) {
-  ncclResult_t res = ncclSuccess;
-  ncclComm_t comm = NULL;
-  struct ncclCommInitRankAsyncJob *job = NULL;
-  const char* env = ncclGetEnv("NCCL_COMM_ID");
-  if (env && myrank == 0) {
-    INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
-    NCCLCHECKGOTO(bootstrapCreateRoot((struct ncclBootstrapHandle*)&commId, true), res, fail);
-  }
+// Destructor passed to ncclAsyncLaunch for init jobs: frees the job's commId array, then the job itself.
+static void ncclCommInitJobFree(void* _job) {
+  free(((struct ncclCommInitRankAsyncJob*)_job)->commId);
+  free(_job);
+}
 
+static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId, ncclUniqueId* commId, int myrank, int cudaDev, ncclConfig_t *config, const char funcName[]) {
+  if (nId <= 0 || nId > nranks) {
+    WARN("improper usage of ncclCommInitRank: nId = %d, nranks=%d", nId, nranks);
+    return ncclInvalidArgument;
+  }
+  ncclResult_t res = ncclSuccess;
+  const char* commIdEnv = NULL;
+  ncclComm_t comm = NULL;
+  struct ncclCommInitRankAsyncJob* job = NULL;
+  // first call ncclInit, this will setup the environment
   NCCLCHECKGOTO(ncclInit(), res, fail);
+
   if (ncclDebugLevel > NCCL_LOG_WARN || (ncclDebugLevel != NCCL_LOG_NONE && myrank == 0)) {
     static pthread_once_t once = PTHREAD_ONCE_INIT;
     pthread_once(&once, showVersion);
@@ -1659,19 +1721,37 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni
   *newcomm = comm;
 
   NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
+  job->nId = nId;
   job->comm = comm;
   job->nranks = nranks;
-  job->commId = commId; // C++ struct assignment
   job->myrank = myrank;
   job->cudaDev = cudaDev;
-  NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, NULL, free, comm), res, fail);
+  snprintf(job->funcName, NCCL_COMMINIT_FUNCNAME_LEN, "%s", funcName);
+  // need to copy the commIds to allow async commInit and to avoid alignment issues when casting from ncclUniqueId to ncclBootstrapHandle
+  // ncclUniqueId and ncclBootstrapHandle don't have the same alignment requirements.
+  // Therefore the array of Ids coming from the user might not be properly aligned to be cast into a ncclBootstrapHandle
+  // copying into allocated memory guarantees that the memory is properly aligned for any object, removing that issue
+  NCCLCHECKGOTO(ncclCalloc(&job->commId, nId), res, fail);
+  memcpy(job->commId, commId, nId * NCCL_UNIQUE_ID_BYTES);
+
+  commIdEnv = ncclGetEnv("NCCL_COMM_ID");
+  if (commIdEnv && myrank == 0) {
+    INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commIdEnv);
+    if (nId > 1) {
+      INFO(NCCL_INIT | NCCL_ENV, "NCCL_COMM_ID cannot be used with more than one ncclUniqueId");
+      job->nId = 1;
+    }
+    // start the bootstrap root before bootstrapping, use only the first handle
+    NCCLCHECKGOTO(bootstrapCreateRoot((struct ncclBootstrapHandle*)&job->commId[0], true), res, fail);
+  }
+  NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, NULL, ncclCommInitJobFree, comm), res, fail);
 
 exit:
   return ncclGroupErrCheck(res);
 fail:
   if (comm) {
     free(comm->abortFlag);
-    if (comm->abortFlagDev) ncclCudaHostFree((void*)comm->abortFlagDev);
+    if (comm->abortFlagDev) (void)ncclCudaHostFree((void*)comm->abortFlagDev);
     free(comm->abortFlagRefCount);
     free(comm);
   }
@@ -1703,7 +1783,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
   NvtxParamsCommInitRank payload{myrank, nranks, cudaDev};
   NVTX3_FUNC_WITH_PARAMS(CommInitRank, CommInitRankSchema, payload)
 
-  NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, &config));
+  NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, 1, &commId, myrank, cudaDev, &config, __func__));
   return ncclSuccess;
 }
 
@@ -1713,6 +1793,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
   int totalnDev;
   int *gpuFlags = NULL;
   ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
+  int oldDev = 0;
 
   constexpr nvtxPayloadSchemaEntry_t CommInitAllSchema[] = {
     {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of devices"}
@@ -1722,6 +1803,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
   // Load the CUDA driver and dlsym hooks (can fail on old drivers)
   (void)ncclCudaLibraryInit();
 
+  CUDACHECK(cudaGetDevice(&oldDev));
   NCCLCHECKGOTO(PtrCheck(comms, "CommInitAll", "comms"), ret, fail);
   if (ndev < 0) {
     WARN("Invalid device count requested : %d", ndev);
@@ -1735,7 +1817,8 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
     for (int i = 0; i < ndev; ++i) {
       /* invalid device check. */
       if (devlist[i] < 0 || devlist[i] >= totalnDev) {
-        ret = ncclUnhandledCudaError;
+        WARN("Invalid device %d (totalnDev=%d)", devlist[i], totalnDev);
+        ret = ncclInvalidArgument;
         goto fail;
       }
 
@@ -1756,13 +1839,18 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
   NCCLCHECKGOTO(ncclGroupStart(), ret, fail);
   for (int i=0; i<ndev; i++) {
     // Ignore return codes .. we need to call ncclGroupEnd to clean up anyway
-    ncclCommInitRankDev(comms+i, ndev, uniqueId, i, devlist ? devlist[i] : i, &config);
+    int dev = devlist ? devlist[i] : i;
+    CUDACHECKGOTO(cudaSetDevice(dev), ret, fail);
+    ncclCommInitRankDev(comms+i, ndev,1, &uniqueId, i, dev, &config, __func__);
   }
   NCCLCHECKGOTO(ncclGroupEnd(), ret, fail);
 
-fail:
+exit:
+  cudaSetDevice(oldDev);
   free(gpuFlags);
   return ret;
+fail:
+  goto exit;
 }
 
 ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState) {
@@ -1777,7 +1865,6 @@ ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState) {
 
 NCCL_API(ncclResult_t, ncclCommInitRankConfig, ncclComm_t* comm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config);
 ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config) {
-  NVTX3_FUNC_RANGE_IN(nccl_domain);
   int cudaDev;
   ncclResult_t ret = ncclSuccess;
   ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
@@ -1785,13 +1872,46 @@ ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueI
   NCCLCHECK(ncclGroupStartInternal());
 
   (void)ncclCudaLibraryInit();
-  CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, fail);
+  CUDACHECK(cudaGetDevice(&cudaDev));
+
+  NvtxParamsCommInitRank payload{myrank, nranks, cudaDev};
+  NVTX3_FUNC_WITH_PARAMS(CommInitRankConfig, CommInitRankSchema, payload)
 
   if (config == NULL)
     internalConfigPtr = &internalConfig;
   else
     internalConfigPtr = config;
-  NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, internalConfigPtr), ret, fail);
+  NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, 1, &commId, myrank, cudaDev, internalConfigPtr, __func__), ret, fail);
+
+exit:
+  ncclGroupErrCheck(ret);
+  NCCLCHECK(ncclGroupEndInternal());
+  if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommGetAsyncError(*newcomm, &ret);
+  return ret;
+fail:
+  if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommSetAsyncError(*newcomm, ret);
+  goto exit;
+}
+
+NCCL_API(ncclResult_t, ncclCommInitRankScalable, ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, ncclConfig_t* config);
+ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, ncclConfig_t* config) {
+  int cudaDev;
+  ncclResult_t ret = ncclSuccess;
+  ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
+  ncclConfig_t *internalConfigPtr = NULL;
+  NCCLCHECK(ncclGroupStartInternal());
+
+  (void)ncclCudaLibraryInit();
+  CUDACHECK(cudaGetDevice(&cudaDev));
+
+  NvtxParamsCommInitRank payload{myrank, nranks, cudaDev};
+  NVTX3_FUNC_WITH_PARAMS(CommInitRankScalable, CommInitRankSchema, payload)
+
+  if (config == NULL)
+    internalConfigPtr = &internalConfig;
+  else
+    internalConfigPtr = config;
+  NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, nId, commId, myrank, cudaDev, internalConfigPtr, __func__), ret, fail);
 
 exit:
   ncclGroupErrCheck(ret);
@@ -1818,13 +1938,25 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) {
   TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d asyncResult %d", comm, comm->rank, *comm->abortFlag, comm->asyncResult);
 
   if (comm->initState == ncclSuccess) {
-    NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->hostStream), ret, fail);
-    NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream), ret, fail);
+    if ((ret = ncclStrongStreamSynchronize(&comm->sharedRes->hostStream)) != ncclSuccess) {
+      WARN("commDestroySync: comm %p rank %d sync hostStream error %d\n", comm, comm->rank, ret);
+    }
+    if ((ret = ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream)) != ncclSuccess) {
+      WARN("commDestroySync: comm %p rank %d sync deviceStream error %d\n", comm, comm->rank, ret);
+    }
+
+    NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), ret, fail);
     NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail);
     // And keep polling until all graphs referencing us die.
     while (comm->persistentRefs != 0) {
       NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/true), ret, fail);
-    }  
+    }
+    while (!ncclIntruQueueEmpty(&comm->legacyRegCleanupQueue)) {
+      struct ncclCommCallback* cb = ncclIntruQueueDequeue(&comm->legacyRegCleanupQueue);
+      if (cb->fn(comm, cb) != ncclSuccess) {
+        WARN("Legacy IPC cleanup callback failed comm %p (rank = %d) cb %p", comm, comm->rank, cb);
+      }
+    }
   }
 
   if ((ret = ncclProxyStop(comm)) != ncclSuccess) {
@@ -1886,14 +2018,15 @@ ncclResult_t ncclCommFinalize(ncclComm_t comm) {
   /* launch async thread to finalize comm. */
   NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
   job->comm = comm;
-  NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commDestroySync, NULL, free, comm), ret, fail);
+  NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commDestroySync, NULL, free, comm), ret, fail);
 
 exit:
   ncclGroupErrCheck(ret);
   NCCLCHECK(ncclGroupEndInternal());
-  if (comm && !comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)) };
+  if (comm && !comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); }
   return ret;
 fail:
+  free(job);
   if (comm && !comm->config.blocking) (void) ncclCommSetAsyncError(comm, ret);
   goto exit;
 }
@@ -1940,13 +2073,15 @@ static ncclResult_t commReclaim(struct ncclAsyncJob* job_) {
         nextIntraComm = nextIntraComm->intraNext;
 
         if ((ret = commCleanup(curIntraComm)) != ncclSuccess) {
+          // We pass a freed pointer, but we don't dereference; we merely print its value, so it's OK.
+          // coverity[pass_freed_arg]
           WARN("commReclaim: cleanup comm %p rank %d failed in destroy/abort, error %d", curIntraComm, curRank, ret);
         }
       }
     }
   }
 
-  return ret;
+  return ncclSuccess;
 }
 
 NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
@@ -1975,12 +2110,11 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
   NCCLCHECK(ncclCommEnsureReady(comm));
   NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
   job->comm = comm;
-  NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commReclaim, NULL, free, comm), res, fail);
+  NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail);
 
 exit:
   return res;
 fail:
-  free(job);
   goto exit;
 }
 
@@ -1991,15 +2125,6 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
     return ncclSuccess;
   }
 
-  int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev;
-  struct ncclCommFinalizeAsyncJob *job = NULL;
-  ncclResult_t res = ncclSuccess;
-
-  NvtxParamsCommInitRank payload{rank, nranks, cudaDev};
-  NVTX3_FUNC_WITH_PARAMS(CommAbort, CommInitRankSchema, payload)
-
-  TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId);
-
   // Ask anything that might still be running on the device to quit
   if (comm->childAbortFlag != nullptr) {
     __atomic_store_n(comm->childAbortFlag, 1, __ATOMIC_RELEASE);
@@ -2010,30 +2135,61 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
   comm->destroyFlag = 1;
   /* init thread must be joined before we destroy the comm,
    * and we should ignore the init error here. */
-  ncclCommEnsureReady(comm);
+  (void)ncclCommEnsureReady(comm);
+
+  // once the comm is ready, we can access ranks etc
+  int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev;
+  struct ncclCommFinalizeAsyncJob *job = NULL;
+  ncclResult_t res = ncclSuccess;
+
+  NvtxParamsCommInitRank payload{rank, nranks, cudaDev};
+  NVTX3_FUNC_WITH_PARAMS(CommAbort, CommInitRankSchema, payload)
+
+  TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId);
 
   NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
   job->comm = comm;
-  NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commReclaim, NULL, free, comm), res, fail);
+  NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail);
 
 exit:
   return ncclSuccess;
 fail:
-  free(job);
   goto exit;
 }
 
+struct NvtxParamsCommSplit { // NVTX payload filled in by ncclCommSplit and described by CommSplitSchema
+  int rank; // caller's rank in the communicator being split
+  int nranks; // number of ranks in the communicator being split
+  int cudaDev; // CUDA device of the communicator being split
+  int color; // color argument passed to ncclCommSplit
+  int key; // key argument passed to ncclCommSplit
+};
+constexpr nvtxPayloadSchemaEntry_t CommSplitSchema[] = { // one entry per NvtxParamsCommSplit member, in declaration order
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Rank"}, // trailing members omitted (zero-initialized); presumably offset 0, i.e. `rank` - confirm against the NVTX entry layout
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of ranks", nullptr, 0, offsetof(NvtxParamsCommSplit, nranks)},
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "CUDA device", nullptr, 0, offsetof(NvtxParamsCommSplit, cudaDev)},
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "color", nullptr, 0, offsetof(NvtxParamsCommSplit, color)},
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "key", nullptr, 0, offsetof(NvtxParamsCommSplit, key)},
+};
+
 NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config);
 ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) {
   struct ncclCommInitRankAsyncJob *job = NULL;
   struct ncclComm* childComm = NCCL_COMM_NULL;
   ncclResult_t res = ncclSuccess;
 
+  NvtxParamsCommSplit payload{comm->rank, comm->nRanks, comm->cudaDev, color, key};
+  NVTX3_FUNC_WITH_PARAMS(CommSplit, CommSplitSchema, payload)
+
+  int oldDev;
+  CUDACHECK(cudaGetDevice(&oldDev));
+
   NCCLCHECK(ncclGroupStartInternal());
   NCCLCHECKGOTO(CommCheck(comm, "CommSplit", "comm"), res, fail);
   NCCLCHECKGOTO(PtrCheck(newcomm, "CommSplit", "newcomm"), res, fail);
   NCCLCHECKGOTO(ncclCommEnsureReady(comm), res, fail);
 
+  CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), res, fail);
   /* *newcomm should be NCCL_COMM_NULL until comm split fully complete. */
   *newcomm = NCCL_COMM_NULL;
   if (color == NCCL_SPLIT_NOCOLOR) {
@@ -2073,10 +2229,12 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc
   job->color = color;
   job->key = key;
   job->cudaDev = comm->cudaDev;
-  NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, NULL, free, comm), res, fail);
+  snprintf(job->funcName, NCCL_COMMINIT_FUNCNAME_LEN, "%s", __func__);
+  NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, NULL, free, comm), res, fail);
 
 exit:
-  ncclGroupErrCheck(res);
+  cudaSetDevice(oldDev);
+  (void)ncclGroupErrCheck(res);
   NCCLCHECK(ncclGroupEndInternal());
   return res;
 fail:
@@ -2179,7 +2337,7 @@ ncclResult_t  ncclMemAlloc(void **ptr, size_t size) {
   CUmemAccessDesc accessDesc = {};
   CUmemGenericAllocationHandle handle;
   int cudaDev;
-  int flag = 0;
+  int flag;
   int dcnt;
   int mcSupport = 0;
 
@@ -2193,12 +2351,18 @@ ncclResult_t  ncclMemAlloc(void **ptr, size_t size) {
     CUCHECK(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, currentDev));
 
   if (mcSupport) {
+    int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
+    // Query device to see if FABRIC handle support is available
+    flag = 0;
+    (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));;
+    if (flag) requestedHandleTypes |= CU_MEM_HANDLE_TYPE_FABRIC;
     memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
     memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-    memprop.requestedHandleTypes = ncclCuMemHandleType;
+    memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes;
     memprop.location.id = currentDev;
     // Query device to see if RDMA support is available
-    CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev));
+    flag = 0;
+    CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev));
     if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1;
     CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
 
@@ -2207,14 +2371,25 @@ ncclResult_t  ncclMemAlloc(void **ptr, size_t size) {
     mcprop.size = size;
     /* device cnt is a dummy value right now, it might affect mc granularity in the future. */
     mcprop.numDevices = dcnt;
-    mcprop.handleTypes = ncclCuMemHandleType;
+    mcprop.handleTypes = requestedHandleTypes;
     mcprop.flags = 0;
     CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
 
     /* only size needs to be aligned to mcGran */
     ALIGN_SIZE(size, mcGran);
-    /* Allocate the physical memory on the device */
-    CUCHECK(cuMemCreate(&handle, size, &memprop, 0));
+    /* Allocate the physical memory on the device. When FABRIC handles were requested, probe cuMemCreate()
+     * with them first and drop the FABRIC bit if the driver reports it as not permitted/not supported. */
+    CUresult probeErr = CUDA_ERROR_UNKNOWN; /* anything but CUDA_SUCCESS: forces the checked call below */
+    if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC) {
+      probeErr = CUPFN(cuMemCreate(&handle, size, &memprop, 0));
+      if (probeErr == CUDA_ERROR_NOT_PERMITTED || probeErr == CUDA_ERROR_NOT_SUPPORTED) {
+        requestedHandleTypes &= ~CU_MEM_HANDLE_TYPE_FABRIC;
+        memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes;
+      }
+    }
+    if (probeErr != CUDA_SUCCESS) { /* no probe, FABRIC stripped, or unexpected probe error: never continue without a valid handle */
+      CUCHECK(cuMemCreate(&handle, size, &memprop, 0));
+    }
     /* Reserve a virtual address range */
     CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, size, memGran, 0, 0));
     /* Map the virtual address range to the physical allocation */
@@ -2234,6 +2409,9 @@ ncclResult_t  ncclMemAlloc(void **ptr, size_t size) {
 
 fallback:
 #endif
+  // Coverity is right to complain that we may pass a NULL ptr to cudaMalloc.  That's deliberate though:
+  // we want CUDA to return an error to the caller.
+  // coverity[var_deref_model]
   CUDACHECKGOTO(cudaMalloc(ptr, size), ret, fail);
 
 exit:
@@ -2272,7 +2450,7 @@ fallback:
   CUDACHECKGOTO(cudaFree(ptr), ret, fail);
 
 exit:
-  cudaSetDevice(saveDevice);
+  CUDACHECK(cudaSetDevice(saveDevice));
   return ret;
 fail:
   goto exit;
diff --git a/src/misc/argcheck.cc b/src/misc/argcheck.cc
index 6ed5db27a7..b1906845b6 100644
--- a/src/misc/argcheck.cc
+++ b/src/misc/argcheck.cc
@@ -53,6 +53,10 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
     return ncclInvalidArgument;
   }
 
+  // ncclMaxRedOp < info->op will always be false due to the sizes of
+  // the datatypes involved, and that's by design.  We keep the check though
+  // just as a reminder.
+  // coverity[result_independent_of_operands]
   if (info->op < 0 || ncclMaxRedOp < info->op) {
     WARN("%s : invalid reduction operation %d", info->opName, info->op);
     return ncclInvalidArgument;
diff --git a/src/misc/cudawrap.cc b/src/misc/cudawrap.cc
index d44c063559..03e3bde992 100644
--- a/src/misc/cudawrap.cc
+++ b/src/misc/cudawrap.cc
@@ -11,7 +11,7 @@
 
 // This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage
 NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", -2);
-
+NCCL_PARAM(CuMemHostEnable, "CUMEM_HOST_ENABLE", 0);
 // Handle type used for cuMemCreate()
 CUmemAllocationHandleType ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
 
@@ -49,6 +49,14 @@ int ncclCuMemEnable() {
   return  param >= 0 ? param : (param == -2 && ncclCuMemSupported);
 }
 
+// Gate for cuMem-based host memory allocation: CUMEM_HOST_ENABLE param on CUDA runtime 12.2+, otherwise always 0.
+int ncclCuMemHostEnable() {
+#if CUDART_VERSION >= 12020
+  return ncclParamCuMemHostEnable();
+#endif
+  return 0;
+}
+
 #define DECLARE_CUDA_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr
 
 #if CUDART_VERSION >= 11030
@@ -81,6 +89,7 @@ DECLARE_CUDA_PFN(cuMemRelease);
 DECLARE_CUDA_PFN(cuMemRetainAllocationHandle);
 DECLARE_CUDA_PFN(cuMemSetAccess);
 DECLARE_CUDA_PFN(cuMemUnmap);
+DECLARE_CUDA_PFN(cuMemGetAllocationPropertiesFromHandle);
 /* ncclMemAlloc/Free */
 DECLARE_CUDA_PFN(cuPointerGetAttribute);
 #if CUDA_VERSION >= 11070
@@ -107,7 +116,7 @@ bool ncclCudaLaunchBlocking = false;
 
 #if CUDART_VERSION >= 12000
 #define LOAD_SYM(symbol, ignore) do {                                   \
-    cudaDriverEntryPointQueryResult driverStatus;                       \
+    cudaDriverEntryPointQueryResult driverStatus = cudaDriverEntryPointSymbolNotFound; \
     res = cudaGetDriverEntryPoint(#symbol, (void **) (&pfn_##symbol), cudaEnableDefault, &driverStatus); \
     if (res != cudaSuccess || driverStatus != cudaDriverEntryPointSuccess) { \
       if (!ignore) {                                                    \
@@ -157,6 +166,7 @@ static ncclResult_t cudaPfnFuncLoader(void) {
   LOAD_SYM(cuMemRetainAllocationHandle, 1);
   LOAD_SYM(cuMemSetAccess, 1);
   LOAD_SYM(cuMemUnmap, 1);
+  LOAD_SYM(cuMemGetAllocationPropertiesFromHandle, 1);
 /* ncclMemAlloc/Free */
   LOAD_SYM(cuPointerGetAttribute, 1);
 #if CUDA_VERSION >= 11070
@@ -208,6 +218,20 @@ static void initOnceFunc() {
   // Determine whether we support the cuMem APIs or not
   ncclCuMemSupported = ncclIsCuMemSupported();
 
+#if 12020 <= CUDART_VERSION && CUDART_VERSION <= 12030
+  /* To use cuMem* for host memory allocation, we need to create a context on each
+   * visible device. This is a workaround needed in CUDA 12.3, which is fixed in 12.4. */
+  if (ncclCuMemSupported && ncclCuMemHostEnable()) {
+    int deviceCnt, saveDevice;
+    cudaGetDevice(&saveDevice);
+    cudaGetDeviceCount(&deviceCnt);
+    for (int i = 0; i < deviceCnt; ++i) {
+      cudaSetDevice(i);
+      cudaFree(NULL);
+    }
+    cudaSetDevice(saveDevice);
+  }
+#endif
   initResult = ret;
   return;
 error:
diff --git a/src/misc/ipcsocket.cc b/src/misc/ipcsocket.cc
index db61b31495..2d17f47e69 100644
--- a/src/misc/ipcsocket.cc
+++ b/src/misc/ipcsocket.cc
@@ -41,6 +41,7 @@ ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, v
   int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash);
   if (len > (sizeof(cliaddr.sun_path) - 1)) {
     WARN("UDS: Cannot bind provided name to socket. Name too large");
+    close(fd);
     return ncclInternalError;
   }
 #ifndef USE_ABSTRACT_SOCKET
@@ -66,7 +67,7 @@ ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, v
   // Mark socket as non-blocking
   if (handle->abortFlag) {
     int flags;
-    EQCHECK(flags = fcntl(fd, F_GETFL), -1);
+    SYSCHECK(flags = fcntl(fd, F_GETFL), "fcntl");
     SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
   }
 
@@ -186,20 +187,16 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen,
   cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
 #endif
 
-  TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d to UDS socket %s", hdr, hdrLen, temp);
+  TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d fd %d to UDS socket %s", hdr, hdrLen, sendFd, temp);
 
-  if (sendFd != -1) {
-    TRACE(NCCL_INIT, "UDS: Sending fd %d to UDS socket %s", sendFd, temp);
+  msg.msg_control = control_un.control;
+  msg.msg_controllen = sizeof(control_un.control);
 
-    msg.msg_control = control_un.control;
-    msg.msg_controllen = sizeof(control_un.control);
-
-    cmptr = CMSG_FIRSTHDR(&msg);
-    cmptr->cmsg_len = CMSG_LEN(sizeof(int));
-    cmptr->cmsg_level = SOL_SOCKET;
-    cmptr->cmsg_type = SCM_RIGHTS;
-    memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
-  }
+  cmptr = CMSG_FIRSTHDR(&msg);
+  cmptr->cmsg_len = CMSG_LEN(sizeof(int));
+  cmptr->cmsg_level = SOL_SOCKET;
+  cmptr->cmsg_type = SCM_RIGHTS;
+  memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
 
   msg.msg_name = (void *)&cliaddr;
   msg.msg_namelen = sizeof(struct sockaddr_un);
diff --git a/src/misc/nvmlwrap.cc b/src/misc/nvmlwrap.cc
index a2b0be0df9..f441af80b1 100644
--- a/src/misc/nvmlwrap.cc
+++ b/src/misc/nvmlwrap.cc
@@ -102,6 +102,10 @@ ncclResult_t ncclNvmlEnsureInitialized() {
     for(Symbol sym: symbols) {
       *sym.ppfn = dlsym(libhandle, sym.name);
     }
+    // Coverity complains that we never dlclose this object, but that's
+    // deliberate, since we want the loaded object to remain in memory until
+    // the process terminates, so that we can use its code.
+    // coverity[leaked_storage]
   }
   #endif
 
diff --git a/src/misc/param.cc b/src/misc/param.cc
index 2248be9804..eb50cfeedc 100644
--- a/src/misc/param.cc
+++ b/src/misc/param.cc
@@ -37,7 +37,7 @@ void setEnvFile(const char* fileName) {
     while (line[s] != '\0' && line[s] != '=') s++;
     if (line[s] == '\0') continue;
     strncpy(envVar, line, std::min(1023,s));
-    envVar[s] = '\0';
+    envVar[std::min(1023,s)] = '\0';
     s++;
     strncpy(envValue, line+s, 1023);
     envValue[1023]='\0';
@@ -48,17 +48,28 @@ void setEnvFile(const char* fileName) {
   fclose(file);
 }
 
-void initEnv() {
+static void initEnvFunc() {  // Feeds the NCCL configuration files to setEnvFile(); run once through initEnv()
   char confFilePath[1024];
-  const char * userDir = userHomeDir();
-  if (userDir) {
-    sprintf(confFilePath, "%s/.nccl.conf", userDir);
+  const char* userFile = getenv("NCCL_CONF_FILE");  // optional explicit path to a configuration file
+  if (userFile && strlen(userFile) > 0) {
+    snprintf(confFilePath, sizeof(confFilePath), "%s", userFile);  // bounded copy; longer paths are truncated
     setEnvFile(confFilePath);
+  } else {  // NCCL_CONF_FILE unset or empty: use <home>/.nccl.conf if a home directory is known
+    const char* userDir = userHomeDir();
+    if (userDir) {
+      snprintf(confFilePath, sizeof(confFilePath), "%s/.nccl.conf", userDir);
+      setEnvFile(confFilePath);
+    }
   }
-  sprintf(confFilePath, "/etc/nccl.conf");
+  snprintf(confFilePath, sizeof(confFilePath), "/etc/nccl.conf");  // the system-wide file is processed in every case
   setEnvFile(confFilePath);
 }
 
+void initEnv() {  // Runs initEnvFunc() at most once per process, however many callers there are
+  static pthread_once_t once = PTHREAD_ONCE_INIT;
+  pthread_once(&once, initEnvFunc);
+}
+
 void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) {
   static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
   pthread_mutex_lock(&mutex);
@@ -80,8 +91,7 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6
   pthread_mutex_unlock(&mutex);
 }
 
-const char *ncclGetEnv(const char *name) {
-  static pthread_once_t once = PTHREAD_ONCE_INIT;
-  pthread_once(&once, initEnv);
+const char* ncclGetEnv(const char* name) {  // getenv() wrapper that first makes sure the config files were processed
+  initEnv();  // one-time guard now lives inside initEnv()
   return getenv(name);
 }
diff --git a/src/misc/profiler.cc b/src/misc/profiler.cc
index 785d616b8b..9a4adf5795 100644
--- a/src/misc/profiler.cc
+++ b/src/misc/profiler.cc
@@ -1,115 +1,524 @@
 /*************************************************************************
- * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
+#include "param.h"
+#include "checks.h"
+#include "comm.h"
+#include "enqueue.h"
+#include "utils.h"
+#include "proxy.h"
 #include "profiler.h"
 
-//#define PROFILE_PROXY 1
-#ifdef PROFILE_PROXY
-#include "timer.h"
-#include "alloc.h"
+static pthread_mutex_t profilerLock = PTHREAD_MUTEX_INITIALIZER;
+static int profilerPluginRefCount;
+static void* profilerPluginLib;
+static ncclProfiler_t* ncclProfiler;
 
-static const char* profilingStateSendStr[] = { "BufferWait", "GPUWait", "SendWait", "", "End" };
-static const char* profilingStateRecvStr[] = { "BufferWait", "RecvWait", "FlushWait", "GPUWait", "End" };
-static const char* profilingEventStr[] = { "SendRecv", "Sleep", "Idle", "Append" };
-struct ncclProxyProfileEvent {
-  double timestamp[6];
-  uint64_t opCount;
-  int peer;
-  int step;
-  uint16_t channel;
-  uint8_t type; // send / recv
-  uint8_t opIndex;
-};
+#define MAX_STR_LEN 256
+#define NCCL_PROFILER_PLUGIN_SYMBOL "ncclProfiler_v1"
 
-struct ncclProxyProfileEvent* profilingEvents = NULL;
-int profilingIndex = 0;
-double profilingStart = 0;
-#define MAX_EVENTS 200000
-
-ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) {
-  if (profilingEvents == NULL) {
-    NCCLCHECK(ncclCalloc(&profilingEvents, MAX_EVENTS));
-    profilingStart = gettime();
+static void* tryOpenLib(char* name, int *err, char* errStr) {  // dlopen() name; on failure copy dlerror() to errStr and set *err
+  if (nullptr == name || strlen(name) == 0) {
+    return nullptr;
   }
-  struct ncclProxyProfileEvent* event = NULL;
-  if (state%8 == 0) {
-    if (profilingIndex == MAX_EVENTS) return ncclSuccess;
-    args->subs[sub].profilingEvents[step%NCCL_STEPS] = event = profilingEvents+profilingIndex++;
-    if (state == ncclProxyProfileBegin) {
-      // Proxy operation information
-      event->opCount = args->opCount;
-      event->channel = args->subs[sub].channelId;
-      event->peer = args->subs[sub].peer;
-      event->type = args->pattern;
-      event->step = step;
-      event->opIndex = (((uint64_t)args)/sizeof(struct ncclProxyArgs))%256;
-    } else event->peer = -state;
+  *err = 0;  // reset so an ENOENT from an earlier attempt is not reported for this one
+  if (strcasecmp(name, "STATIC_PLUGIN") == 0) {  // whole-string match: a bare prefix such as "S" must not qualify
+    name = nullptr;  // dlopen(NULL) yields a handle for the main program
+  }
+
+  void *handle = dlopen(name, RTLD_NOW | RTLD_LOCAL);
+  if (nullptr == handle) {
+    strncpy(errStr, dlerror(), MAX_STR_LEN);
+    errStr[MAX_STR_LEN] = 0;  // errStr must provide MAX_STR_LEN + 1 bytes
+    if (name && strstr(errStr, name) && strstr(errStr, "No such file or directory")) {  // name is NULL for STATIC_PLUGIN
+      *err = ENOENT;
+    }
+  }
+
+  return handle;
+}
+
+static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) {
+  if (openErr != ENOENT) {  // anything other than "file not found" is logged as-is
+    INFO(NCCL_ENV, "PROFILER/Plugin: %s", openErrStr);
+    return nameList;
+  }
+  // Library not found: append " <name>" to the list, shrink the remaining length, return the new write position.
+  snprintf(nameList, *nameListLen, " %s", name);
+  *nameListLen -= strlen(name) + 1;
+  return nameList + strlen(name) + 1;
+}
+
+static void* openProfilerPluginLib(char* couldNotFindNames, int len) {  // Open the profiler plugin library; nullptr if none could be opened
+  int openErr = 0;  // start from a defined value in case tryOpenLib() leaves it untouched
+  void *pluginLib;
+  char profilerPluginLibName[PATH_MAX];
+  char openErrStr[MAX_STR_LEN + 1] = { 0 };
+
+  const char *envProfilerPluginName = getenv("NCCL_PROFILER_PLUGIN");
+  if (envProfilerPluginName && strlen(envProfilerPluginName)) {
+    snprintf(profilerPluginLibName, PATH_MAX, "%s", envProfilerPluginName);
+    pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr);
+    if (pluginLib) {
+      INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: Plugin name set by env to %s", profilerPluginLibName);
+      return pluginLib;
+    }
+
+    couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName);
+    pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr);  // NOTE(review): retries the identical name; confirm whether a different name was intended
+    if (pluginLib) {
+      INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: Plugin name set by env to %s", profilerPluginLibName);
+      return pluginLib;
+    }
+    couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName);
   } else {
-    event = (struct ncclProxyProfileEvent*)args->subs[sub].profilingEvents[step%NCCL_STEPS];
-    if (state == ncclProxyProfileEnd) args->subs[sub].profilingEvents[step%NCCL_STEPS] = NULL;
-    if (state == ncclProxyProfileAppendEnd) event->opCount = args->opCount;
+    snprintf(profilerPluginLibName, PATH_MAX, "libnccl-profiler.so");  // default library name when the env var is unset or empty
+    pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr);
+    if (pluginLib) {
+      return pluginLib;
+    }
+    couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName);
   }
-  // Timestamp
-  event->timestamp[state%8] = gettime()-profilingStart;
+
+  return nullptr;
+}
+
+enum {
+  profilerPluginLoadFailed = -1,  // a load attempt failed; ncclProfilerPluginLoad() returns immediately in this state
+  profilerPluginLoadReady = 0,  // nothing loaded; the next ncclProfilerPluginLoad() attempts a load
+  profilerPluginLoadSuccess = 1,  // library opened and plugin symbol resolved
+};
+static int profilerPluginStatus = profilerPluginLoadReady;
+static pid_t pid;  // pid of the process that loaded the plugin; copied into proxy ops by ncclProfilerAddPidToProxyOp()
+
+#define MAX_PLUGIN_LOAD 2  // multiplier for the PATH_MAX-sized "could not find" name buffer
+
+static ncclResult_t ncclProfilerPluginLoad(void) {  // Load the plugin or take another reference; returns ncclSuccess on every path
+  if (profilerPluginLoadFailed == profilerPluginStatus) {  // checked before taking profilerLock; an earlier failure short-circuits
+    return ncclSuccess;
+  }
+
+  char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 };
+  pthread_mutex_lock(&profilerLock);
+  if (profilerPluginLoadSuccess == profilerPluginStatus) {
+    ++profilerPluginRefCount;  // already loaded: only count the additional user
+    goto exit;
+  }
+
+  profilerPluginLib = openProfilerPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX);
+  if (profilerPluginLib == nullptr) {
+    if (strlen(couldNotFindNames)) {
+      INFO(NCCL_ENV, "PROFILER/Plugin: Could not find:%s.", couldNotFindNames);
+    }
+    goto fail;
+  }
+
+  ncclProfiler = (ncclProfiler_t*)dlsym(profilerPluginLib, NCCL_PROFILER_PLUGIN_SYMBOL);
+  if (ncclProfiler == nullptr) {
+    INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find " NCCL_PROFILER_PLUGIN_SYMBOL ".");
+    goto fail;
+  }
+
+  ++profilerPluginRefCount;
+  profilerPluginStatus = profilerPluginLoadSuccess;
+
+  // Store the pid of the process loading the profiler.
+  // This is attached to the proxyOp event descriptor
+  // so the plugin can figure out if the parent event
+  // is in the same address space or not
+  pid = getpid();
+
+exit:
+  pthread_mutex_unlock(&profilerLock);
+  return ncclSuccess;
+fail:
+  if (profilerPluginLib) dlclose(profilerPluginLib);  // NOTE(review): the closed handle stays in profilerPluginLib
+  profilerPluginStatus = profilerPluginLoadFailed;
+  goto exit;  // leave through the common unlock path
+}
+
+static ncclResult_t ncclProfilerPluginUnload(void) {  // Drop one plugin reference; close the library when the count reaches zero
+  pthread_mutex_lock(&profilerLock);
+  if (0 == (--profilerPluginRefCount)) {  // ncclProfiler may already be NULL here: ncclProfilerPluginInit() clears it when the plugin's init() fails
+    INFO(NCCL_ENV, "PROFILER/Plugin: Closing profiler plugin %s", ncclProfiler ? ncclProfiler->name : "(init failed)");
+    dlclose(profilerPluginLib);
+    profilerPluginLib = nullptr;
+    ncclProfiler = nullptr;
+    profilerPluginStatus = profilerPluginLoadReady;  // a later ncclProfilerPluginLoad() may load again
+  }
+  pthread_mutex_unlock(&profilerLock);
   return ncclSuccess;
 }
 
-void ncclProfilingDump() {
-  static int dumpDone = 0;
-  if (dumpDone) return;
-  dumpDone = 1;
-  const char* str = ncclGetEnv("NCCL_PROXY_PROFILE");
-  if (!str) { free(profilingEvents); return; }
-  FILE* f = fopen(str, "w");
-  fprintf(f, "[\n");
+#define ENABLE_TIMER 0
+#include "timer.h"
 
-  for (int i=0; i<profilingIndex; i++) {
-    struct ncclProxyProfileEvent* e = profilingEvents+i;
-    const int sendrecv = e->peer >= 0;
-    const char* typeStr = sendrecv ? (e->type == ncclPatternSend ? "Send" : "Recv") :
-      profilingEventStr[-(e->peer/8)];
+#if ENABLE_TIMER  // self-timing of the profiler entry points; compiled out while ENABLE_TIMER is 0
+static int64_t elapsedCount;  // each <event>Count is bumped by TIME_START_EVENT(<event>)
+static int64_t initCount, finalizeCount;
+static int64_t groupStartCount, groupStopCount;
+static int64_t taskStartCount, taskStopCount;
+static int64_t proxyOpStartCount, proxyOpStopCount;
+static int64_t proxyStepStartCount, proxyStepStopCount;
+static int64_t proxyCtrlStartCount, proxyCtrlStopCount;
+static int64_t proxyOpRecordCount, proxyStepRecordCount, proxyCtrlRecordCount;
+
+static double elapsedTs[2];  // each <event>Ts holds [0] = last start timestamp, [1] = accumulated duration
+static double initTs[2], finalizeTs[2];
+static double groupStartTs[2], groupStopTs[2];
+static double taskStartTs[2], taskStopTs[2];
+static double proxyOpStartTs[2], proxyOpStopTs[2];
+static double proxyStepStartTs[2], proxyStepStopTs[2];
+static double proxyCtrlStartTs[2], proxyCtrlStopTs[2];
+static double proxyOpRecordTs[2], proxyStepRecordTs[2], proxyCtrlRecordTs[2];
+
+#define TIME_START_EVENT(event) do { \
+  (event ## Count)++; \
+  (event ## Ts)[0] = gettime(); \
+} while(0)
+
+#define TIME_STOP_EVENT(event) do { \
+  double val = gettime() - (event ## Ts)[0]; \
+  (event ## Ts)[1] += val; \
+} while(0)
+
+#define TIME_PRINT_EVENTS(name) do { \
+  printf("%s ", name); \
+  if (elapsedCount)         printf("[elapsed] %g/%ld = %g ", elapsedTs[1], elapsedCount, elapsedTs[1]/elapsedCount); \
+  if (initCount)            printf("[init] %g/%ld = %g ", initTs[1], initCount, initTs[1]/initCount); \
+  if (finalizeCount)        printf("[finalize] %g/%ld = %g ", finalizeTs[1], finalizeCount, finalizeTs[1]/finalizeCount); \
+  if (groupStartCount)      printf("[groupStart] %g/%ld = %g ", groupStartTs[1], groupStartCount, groupStartTs[1]/groupStartCount); \
+  if (groupStopCount)       printf("[groupStop] %g/%ld = %g ", groupStopTs[1], groupStopCount, groupStopTs[1]/groupStopCount); \
+  if (taskStartCount)       printf("[taskStart] %g/%ld = %g ", taskStartTs[1], taskStartCount, taskStartTs[1]/taskStartCount); \
+  if (taskStopCount)        printf("[taskStop] %g/%ld = %g ", taskStopTs[1], taskStopCount, taskStopTs[1]/taskStopCount); \
+  if (proxyOpStartCount)    printf("[proxyOpStart] %g/%ld = %g ", proxyOpStartTs[1], proxyOpStartCount, proxyOpStartTs[1]/proxyOpStartCount); \
+  if (proxyOpStopCount)     printf("[proxyOpStop] %g/%ld = %g ", proxyOpStopTs[1], proxyOpStopCount, proxyOpStopTs[1]/proxyOpStopCount); \
+  if (proxyStepStartCount)  printf("[proxyStepStart] %g/%ld = %g ", proxyStepStartTs[1], proxyStepStartCount, proxyStepStartTs[1]/proxyStepStartCount); \
+  if (proxyStepStopCount)   printf("[proxyStepStop] %g/%ld = %g ", proxyStepStopTs[1], proxyStepStopCount, proxyStepStopTs[1]/proxyStepStopCount); \
+  if (proxyCtrlStartCount)  printf("[proxyCtrlStart] %g/%ld = %g ", proxyCtrlStartTs[1], proxyCtrlStartCount, proxyCtrlStartTs[1]/proxyCtrlStartCount); \
+  if (proxyCtrlStopCount)   printf("[proxyCtrlStop] %g/%ld = %g ", proxyCtrlStopTs[1], proxyCtrlStopCount, proxyCtrlStopTs[1]/proxyCtrlStopCount); \
+  if (proxyOpRecordCount)   printf("[proxyOpRecord] %g/%ld = %g ", proxyOpRecordTs[1], proxyOpRecordCount, proxyOpRecordTs[1]/proxyOpRecordCount); \
+  if (proxyStepRecordCount) printf("[proxyStepRecord] %g/%ld = %g ", proxyStepRecordTs[1], proxyStepRecordCount, proxyStepRecordTs[1]/proxyStepRecordCount); \
+  if (proxyCtrlRecordCount) printf("[proxyCtrlRecord] %g/%ld = %g", proxyCtrlRecordTs[1], proxyCtrlRecordCount, proxyCtrlRecordTs[1]/proxyCtrlRecordCount); \
+  printf("\n"); \
+} while(0)
+#else  // timing disabled: the three macros expand to empty statements
+#define TIME_START_EVENT(event) do {} while(0)
+#define TIME_STOP_EVENT(event)  do {} while(0)
+#define TIME_PRINT_EVENTS(name) do {} while(0)
+#endif
 
 
-    if (sendrecv) {
-      int state = ncclProxyProfileBegin;
-      const char** stateStr = e->type == ncclPatternSend ? profilingStateSendStr : profilingStateRecvStr;
-      fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f, \"args\": { \"opCount\": %ld, \"proxyOpIndex\":%d } },\n",
-          typeStr, e->peer, e->step, i, e->channel, e->timestamp[state], e->opCount, e->opIndex);
+static int eActivationMask;       // Set by profiler
+static int eActivationMaskGroup;  // Cached for current group
 
-      while (state<ncclProxyProfileEnd) {
-        if (e->timestamp[state]) {
-          const char* name = stateStr[state];
-          fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n",
-              name, i, e->channel, e->timestamp[state]);
-          state++;
-          while (e->timestamp[state] == 0) state++;
-          fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n",
-              name, i, e->channel, e->timestamp[state]);
-        }
-      }
-
-      fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n",
-          typeStr, e->peer, e->step, i, e->channel, e->timestamp[state]);
-    } else {
-      if (e->peer == -ncclProxyProfileAppend) {
-      fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f, \"args\": { \"added\": %ld } },\n",
-          typeStr, i, e->timestamp[0], e->opCount);
-      } else {
-        fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n",
-          typeStr, i, e->timestamp[0]);
-      }
-      fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n",
-          typeStr, i, e->timestamp[1]);
+ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm) {  // Load the plugin if needed and run its init() for this communicator
+  TIME_START_EVENT(elapsed);
+  TIME_START_EVENT(init);
+  ncclProfilerPluginLoad();  // leaves ncclProfiler NULL when no plugin could be loaded
+  if (__builtin_expect(ncclProfiler != NULL, 0)) {
+    int err = ncclProfiler->init(&comm->profilerContext, &eActivationMask);  // plugin fills the per-comm context and the activation mask
+    if (err) {
+      WARN("Profiler init failed with error (%d). Continue without profiler.", err);
+      ncclProfiler = NULL;  // NOTE(review): clears the file-level pointer without touching the ref count or load status
     }
   }
-  fprintf(f, "{} ]\n");
-  fclose(f);
-  free(profilingEvents);
+  TIME_STOP_EVENT(init);
+  return ncclSuccess;  // profiler failures are never reported to the caller
+}
+
+ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm) {  // Run the plugin's finalize() for this communicator and drop one plugin reference
+  TIME_START_EVENT(finalize);
+  if (__builtin_expect(ncclProfiler != NULL, 0)) {
+    ncclProfiler->finalize(comm->profilerContext);
+  }
+  ncclProfilerPluginUnload();  // called even when no plugin is active
+  TIME_STOP_EVENT(finalize);
+  TIME_STOP_EVENT(elapsed);  // pairs with TIME_START_EVENT(elapsed) in ncclProfilerPluginInit()
+  TIME_PRINT_EVENTS("Profiler");
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan) {  // Snapshot the activation mask and start the plan's group event
+  TIME_START_EVENT(groupStart);
+  eActivationMaskGroup = __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED);  // snapshot reused by the task-event functions below
+  if (__builtin_expect(ncclProfiler != NULL, 0)) {
+    if (eActivationMaskGroup & (ncclProfileColl | ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep)) {  // a group event is started only if one of these event types is enabled
+      ncclProfilerEventDescr_v1_t eDescr = { 0 };
+      eDescr.type = ncclProfileGroup;
+      ncclProfiler->startEvent(plan->comm->profilerContext, &plan->groupEventHandle, &eDescr);  // handle is stored in the plan
+    }
+  }
+  TIME_STOP_EVENT(groupStart);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProfilerStopGroupEvent(struct ncclKernelPlan* plan) {  // Stop the plan's group event, if one was started
+  TIME_START_EVENT(groupStop);
+  if (__builtin_expect(ncclProfiler != NULL, 0) && plan->groupEventHandle) {
+    ncclProfiler->stopEvent(plan->groupEventHandle);
+  }
+  TIME_STOP_EVENT(groupStop);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) {  // Start one event per collective and per p2p task queued in the plan
+  TIME_START_EVENT(taskStart);
+  if (__builtin_expect(ncclProfiler != NULL, 0)) {
+    int enable = eActivationMaskGroup & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileColl);  // NOTE(review): ncclProfileP2p is not part of this mask although p2p tasks are handled below
+    if (plan->groupEventHandle && enable) {
+      struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue);
+      while (ct) {
+        ncclProfilerEventDescr_t eDescr = { 0 };
+        eDescr.type = ncclProfileColl;
+        eDescr.parentObj = plan->groupEventHandle;  // task events are children of the group event
+        eDescr.rank = plan->comm->rank;
+        eDescr.coll.name = plan->comm->commName;
+        eDescr.coll.commHash = plan->comm->commHash;
+        eDescr.coll.seqNumber = plan->comm->seqNumber[ct->func]++;  // per-function counter, advanced for every event started
+        eDescr.coll.func = ct->func;
+        eDescr.coll.sendBuff = ct->sendbuff;
+        eDescr.coll.recvBuff = ct->recvbuff;
+        eDescr.coll.count = ct->count;
+        eDescr.coll.root = ct->root;
+        eDescr.coll.datatype = ct->datatype;
+        eDescr.coll.op = ct->opHost;
+        eDescr.coll.trafficBytes = ct->trafficBytes;
+        eDescr.coll.nMaxChannels = ct->nMaxChannels;
+        eDescr.coll.nWarps = ct->nWarps;
+        eDescr.coll.algo = ct->algorithm;
+        eDescr.coll.proto = ct->protocol;
+        eDescr.coll.isCollnet = ct->isCollnet;
+        eDescr.coll.isNvls = ct->isNvls;
+        ncclProfiler->startEvent(plan->comm->profilerContext, &ct->eventHandle, &eDescr);
+
+        // update collective task with group event activation mask
+        ct->eActivationMask = eActivationMaskGroup;
+        ct = ct->next;
+      }
+      struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue);
+      while (pt) {
+        ncclProfilerEventDescr_t eDescr = { 0 };
+        eDescr.type = ncclProfileP2p;
+        eDescr.parentObj = plan->groupEventHandle;
+        eDescr.rank = plan->comm->rank;
+        eDescr.p2p.name = plan->comm->commName;
+        eDescr.p2p.commHash = plan->comm->commHash;
+        eDescr.p2p.func = pt->func;
+        eDescr.p2p.buff = pt->buff;
+        eDescr.p2p.count = pt->count;
+        eDescr.p2p.datatype = pt->datatype;
+        eDescr.p2p.peer = pt->root;  // the task's root field is reported as the peer
+        ncclProfiler->startEvent(plan->comm->profilerContext, &pt->eventHandle, &eDescr);
+
+        // update p2p task with group event activation mask
+        pt->eActivationMask = eActivationMaskGroup;
+        pt = pt->next;
+      }
+    }
+  }
+  TIME_STOP_EVENT(taskStart);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan) {  // Stop the events of every collective and p2p task in the plan
+  TIME_START_EVENT(taskStop);
+  if (__builtin_expect(ncclProfiler != NULL, 0)) {
+    int enable = eActivationMaskGroup & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileColl);  // same condition as ncclProfilerStartTaskEvents()
+    if (plan->groupEventHandle && enable) {
+      struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue);
+      while (ct) {
+        ncclProfiler->stopEvent(ct->eventHandle);
+        ct = ct->next;
+      }
+      struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue);
+      while (pt) {
+        ncclProfiler->stopEvent(pt->eventHandle);
+        pt = pt->next;
+      }
+    }
+  }
+  TIME_STOP_EVENT(taskStop);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args) {  // Start the proxy-op event for send sub-operation s
+  TIME_START_EVENT(proxyOpStart);
+  struct ncclProxySubArgs* sub = &args->subs[s];
+  if (__builtin_expect(ncclProfiler != NULL, 0)) {
+    if (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileProxyOp)) {  // op events are also started when only step events are enabled
+      ncclProfilerEventDescr_t eDescr = { 0 };
+      eDescr.type = ncclProfileProxyOp;
+      eDescr.parentObj = sub->taskEventHandle;
+      eDescr.rank = sub->rank;
+      eDescr.proxyOp.pid = args->pid;
+      eDescr.proxyOp.channelId = sub->channelId;
+      eDescr.proxyOp.peer = sub->peer;
+      eDescr.proxyOp.nSteps = sub->nsteps;
+      eDescr.proxyOp.chunkSize = args->chunkSize;
+      eDescr.proxyOp.isSend = 1;  // send direction
+      ncclProfiler->startEvent(args->profilerContext, &sub->opEventHandle, &eDescr);
+    }
+  }
+  TIME_STOP_EVENT(proxyOpStart);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProfilerStartRecvProxyOpEvent(int s, struct ncclProxyArgs* args) {  // Start the proxy-op event for receive sub-operation s
+  TIME_START_EVENT(proxyOpStart);
+  struct ncclProxySubArgs* sub = &args->subs[s];
+  if (__builtin_expect(ncclProfiler != NULL, 0)) {
+    if (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileProxyOp)) {  // op events are also started when only step events are enabled
+      ncclProfilerEventDescr_t eDescr = { 0 };
+      eDescr.type = ncclProfileProxyOp;
+      eDescr.parentObj = sub->taskEventHandle;
+      eDescr.rank = sub->rank;
+      eDescr.proxyOp.pid = args->pid;
+      eDescr.proxyOp.channelId = sub->channelId;
+      eDescr.proxyOp.peer = sub->peer;
+      eDescr.proxyOp.nSteps = sub->nsteps;
+      eDescr.proxyOp.chunkSize = args->chunkSize;
+      eDescr.proxyOp.isSend = 0;  // receive direction; the only difference from the send variant
+      ncclProfiler->startEvent(args->profilerContext, &sub->opEventHandle, &eDescr);
+    }
+  }
+  TIME_STOP_EVENT(proxyOpStart);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProfilerStopProxyOpEvent(int s, struct ncclProxyArgs* args) {  // Stop the proxy-op event of sub-operation s, if one is open
+  TIME_START_EVENT(proxyOpStop);
+  struct ncclProxySubArgs* sub = &args->subs[s];
+  if (__builtin_expect(ncclProfiler != NULL, 0) && sub->opEventHandle) {
+    ncclProfiler->stopEvent(sub->opEventHandle);
+    sub->opEventHandle = NULL;  // cleared so the event is not stopped twice
+  }
+  TIME_STOP_EVENT(proxyOpStop);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProfilerStartSendProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) {  // Start step events for steps in [stepLo, stepHi)
+  TIME_START_EVENT(proxyStepStart);
+  struct ncclProxySubArgs* sub = &args->subs[s];
+  if (__builtin_expect(ncclProfiler != NULL, 0)) {
+    if (sub->opEventHandle && (sub->eActivationMask & ncclProfileProxyStep)) {  // step events need an open parent op event
+      for (uint64_t step = stepLo; step < stepHi; step++) {
+        ncclProfilerEventDescr_t eDescr = { 0 };
+        eDescr.type = ncclProfileProxyStep;
+        eDescr.parentObj = sub->opEventHandle;
+        eDescr.rank = sub->rank;
+        eDescr.proxyStep.step = step;
+        ncclProfiler->startEvent(args->profilerContext, &sub->stepEventHandles[step%NCCL_STEPS], &eDescr);  // handle slot is indexed by step modulo NCCL_STEPS
+      }
+    }
+  }
+  TIME_STOP_EVENT(proxyStepStart);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProfilerStartRecvProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) {  // Start step events for steps in [stepLo, stepHi); body matches the send variant
+  TIME_START_EVENT(proxyStepStart);
+  struct ncclProxySubArgs* sub = &args->subs[s];
+  if (__builtin_expect(ncclProfiler != NULL, 0)) {
+    if (sub->opEventHandle && (sub->eActivationMask & ncclProfileProxyStep)) {  // step events need an open parent op event
+      for (uint64_t step = stepLo; step < stepHi; step++) {
+        ncclProfilerEventDescr_t eDescr = { 0 };
+        eDescr.type = ncclProfileProxyStep;
+        eDescr.parentObj = sub->opEventHandle;
+        eDescr.rank = sub->rank;
+        eDescr.proxyStep.step = step;
+        ncclProfiler->startEvent(args->profilerContext, &sub->stepEventHandles[step%NCCL_STEPS], &eDescr);  // handle slot is indexed by step modulo NCCL_STEPS
+      }
+    }
+  }
+  TIME_STOP_EVENT(proxyStepStart);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProfilerStopProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) {  // Stop the open step events for steps in [stepLo, stepHi)
+  TIME_START_EVENT(proxyStepStop);
+  struct ncclProxySubArgs* sub = &args->subs[s];
+  if (__builtin_expect(ncclProfiler != NULL, 0)) {
+    for (uint64_t step = stepLo; step < stepHi; step++) {
+      if (sub->stepEventHandles[step%NCCL_STEPS]) {  // slots without an open event are skipped
+        ncclProfiler->stopEvent(sub->stepEventHandles[step%NCCL_STEPS]);
+        sub->stepEventHandles[step%NCCL_STEPS] = NULL;  // free the slot for reuse
+      }
+    }
+  }
+  TIME_STOP_EVENT(proxyStepStop);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle) {
+  // Start a proxy-control event; *eHandle is set to NULL when no event is started.
+  TIME_START_EVENT(proxyCtrlStart);
+  bool started = false;
+  if (__builtin_expect(ncclProfiler != NULL, 0)) {
+    // The mask is re-read for every control event so the profiling mode may change between them.
+    if (__atomic_load_n(&eActivationMask, __ATOMIC_RELAXED) & ncclProfileProxyCtrl) {
+      ncclProfilerEventDescr_t ctrlDescr = { 0 };
+      ctrlDescr.type = ncclProfileProxyCtrl;
+      ncclProfiler->startEvent(profilerContext, eHandle, &ctrlDescr);
+      started = true;
+    }
+  }
+  if (!started) *eHandle = NULL;
+  TIME_STOP_EVENT(proxyCtrlStart);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle) {  // Stop a proxy-control event; a NULL handle is ignored
+  TIME_START_EVENT(proxyCtrlStop);
+  if (__builtin_expect(ncclProfiler != NULL, 0) && eHandle) {
+    ncclProfiler->stopEvent(eHandle);
+  }
+  TIME_STOP_EVENT(proxyCtrlStop);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState) {  // Report a state change on the open proxy-op event of sub-operation s
+  TIME_START_EVENT(proxyOpRecord);
+  struct ncclProxySubArgs* sub = &args->subs[s];
+  if (__builtin_expect(ncclProfiler != NULL, 0) && sub->opEventHandle) {
+    ncclProfilerEventStateArgs_t a = { 0 };
+    a.proxyOp.steps = steps;
+    a.proxyOp.transSize = transSize;
+    ncclProfiler->recordEventState(sub->opEventHandle, eState, &a);  // steps and transSize travel with the state
+  }
+  TIME_STOP_EVENT(proxyOpRecord);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProfilerRecordProxyStepEventStates(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi, ncclProfilerEventState_t eState) {  // Report eState on every open step event in [stepLo, stepHi)
+  TIME_START_EVENT(proxyStepRecord);
+  struct ncclProxySubArgs* sub = &args->subs[s];
+  if (__builtin_expect(ncclProfiler != NULL, 0) && sub->opEventHandle) {
+    for (uint64_t step = stepLo; step < stepHi; step++) {
+      if (sub->stepEventHandles[step%NCCL_STEPS]) {
+        ncclProfiler->recordEventState(sub->stepEventHandles[step%NCCL_STEPS], eState, 0);  // no state arguments are passed for step events
+      }
+    }
+  }
+  TIME_STOP_EVENT(proxyStepRecord);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProfilerRecordProxyCtrlEventState(void* eHandle, int appended, ncclProfilerEventState_t eState) {  // Report a state change on a proxy-control event
+  TIME_START_EVENT(proxyCtrlRecord);
+  if (__builtin_expect(ncclProfiler != NULL, 0) && eHandle && __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED) & ncclProfileProxyCtrl) {  // the activation mask is re-read on every call
+    ncclProfilerEventStateArgs_t args = { 0 };
+    args.proxyCtrl.appendedProxyOps = appended;
+    ncclProfiler->recordEventState(eHandle, eState, &args);
+  }
+  TIME_STOP_EVENT(proxyCtrlRecord);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op) {  // Stamp the proxy op with the pid recorded when the plugin was loaded
+  op->pid = pid;  // file-level pid, assigned in ncclProfilerPluginLoad()
+  return ncclSuccess;
 }
-#else
-ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; }
-void ncclProfilingDump() {}
-#endif
diff --git a/src/misc/shmutils.cc b/src/misc/shmutils.cc
index a481643378..daf3b338db 100644
--- a/src/misc/shmutils.cc
+++ b/src/misc/shmutils.cc
@@ -4,7 +4,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "shm.h"
+#include "shmutils.h"
 #include "comm.h"
 #include "checks.h"
 #include <sys/types.h>
@@ -75,7 +75,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
         goto fail;
       }
     } else {
-      SYSCHECKGOTO(fd = open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), ret, fail);
+      SYSCHECKGOTO(fd = open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), "open", ret, fail);
     }
 
   retry_fallocate:
@@ -90,7 +90,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
     }
     INFO(NCCL_ALLOC, "Allocated %ld bytes of shared memory in %s", realShmSize, shmPath);
   } else {
-    SYSCHECKGOTO(fd = open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), ret, fail);
+    SYSCHECKGOTO(fd = open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), "open", ret, fail);
   }
 
   hptr = (char*)mmap(NULL, realShmSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
@@ -114,7 +114,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
   }
 
   if (devShmPtr) {
-    CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterMapped), ret, fail);
+    CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterPortable | cudaHostRegisterMapped), ret, fail);
     CUDACHECKGOTO(cudaHostGetDevicePointer(&dptr, (void*)hptr, 0), ret, fail);
   }
 
@@ -129,7 +129,7 @@ fail:
        shmPath, shmSize, strerror(errno), errno);
   if (tmphandle) {
     shmHandleInit(fd, shmPath, shmSize, realShmSize, hptr, dptr, create, tmphandle);
-    ncclShmClose((ncclShmHandle_t)tmphandle);
+    (void)ncclShmClose((ncclShmHandle_t)tmphandle);
     tmphandle = NULL;
   }
   hptr = NULL;
@@ -182,7 +182,7 @@ ncclResult_t ncclShmUnlink(ncclShmHandle_t handle) {
 
 ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize) {
   ncclResult_t ret = ncclSuccess;
-  int curRound = shmem->round;
+  int curRound;
   size_t mycnt;
 
   if (comm == NULL || shmem == NULL || sendbuff == NULL || recvbuff == NULL || shmem->maxTypeSize < typeSize) {
@@ -190,6 +190,7 @@ ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff
     goto exit;
   }
 
+  curRound = shmem->round;
   memcpy((char*)shmem->ptr[curRound] + comm->localRank * typeSize, sendbuff, typeSize);
   /* sync among local ranks */
   mycnt = __atomic_add_fetch(shmem->cnt[curRound], 1, __ATOMIC_ACQ_REL);
diff --git a/src/misc/socket.cc b/src/misc/socket.cc
index 9ade0e41de..93e577e05d 100644
--- a/src/misc/socket.cc
+++ b/src/misc/socket.cc
@@ -284,6 +284,7 @@ ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char
       sin6.sin6_scope_id = 0;                          // should be global scope, set to 0
     } else {
       WARN("Net : unsupported IP family");
+      freeaddrinfo(p);
       return ncclInvalidArgument;
     }
 
@@ -408,7 +409,7 @@ ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress*
 
 static ncclResult_t socketTryAccept(struct ncclSocket* sock) {
   socklen_t socklen = sizeof(union ncclSocketAddress);
-  sock->fd = accept(sock->acceptFd, &sock->addr.sa, &socklen);
+  sock->fd = accept(sock->acceptFd, (struct sockaddr*)&sock->addr, &socklen);
   if (sock->fd != -1) {
     sock->state = ncclSocketStateAccepted;
   } else if (errno != EAGAIN && errno != EWOULDBLOCK) {
@@ -501,8 +502,9 @@ static ncclResult_t socketPollConnect(struct ncclSocket* sock) {
   } else if (ret < 0) {
     WARN("socketPollConnect poll() failed with error %s", strerror(errno));
     return ncclRemoteError;
-  } else {
-    EQCHECK(ret == 1 && (pfd.revents & POLLOUT), 0);
+  } else if (ret != 1 || (pfd.revents & POLLOUT) == 0) {
+    WARN("socketPollConnect poll() returned %d%s", ret, (pfd.revents & POLLOUT) ? "" : ", no POLLOUT events");
+    return ncclSystemError;
   }
 
   /* check socket status */
@@ -734,13 +736,17 @@ ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* ad
   /* Set socket as non-blocking if async or if we need to be able to abort */
   if ((sock->asyncFlag || sock->abortFlag) && sock->fd >= 0) {
     int flags;
-    EQCHECKGOTO(flags = fcntl(sock->fd, F_GETFL), -1, ret, fail);
-    SYSCHECKGOTO(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), ret, fail);
+    SYSCHECKGOTO(flags = fcntl(sock->fd, F_GETFL), "fcntl", ret, fail);
+    SYSCHECKGOTO(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), "fcntl", ret, fail);
   }
 
 exit:
   return ret;
 fail:
+  if (sock->fd != -1) {
+    close(sock->fd);
+    sock->fd = -1;
+  }
   goto exit;
 }
 
diff --git a/src/misc/tuner.cc b/src/misc/tuner.cc
index 608062bcc6..f1a9756f13 100644
--- a/src/misc/tuner.cc
+++ b/src/misc/tuner.cc
@@ -77,6 +77,8 @@ static void* tryOpenLib(const char* name, int* err, char* errStr) {
   if (nullptr == handle) {
     strncpy(errStr, dlerror(), MAX_STR_LEN);
     errStr[MAX_STR_LEN] = '\0';
+    // "handle" and "name" won't be NULL at the same time.
+    // coverity[var_deref_model]
     if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) {
       *err = ENOENT;
     }
diff --git a/src/misc/utils.cc b/src/misc/utils.cc
index 12504bc997..bb59947e46 100644
--- a/src/misc/utils.cc
+++ b/src/misc/utils.cc
@@ -65,15 +65,7 @@ ncclResult_t getHostName(char* hostname, int maxlen, const char delim) {
   return ncclSuccess;
 }
 
-uint64_t getHash(const char* string, int n) {
-  // Based on DJB2a, result = result * 33 ^ char
-  uint64_t result = 5381;
-  for (int c = 0; c < n; c++) {
-    result = ((result << 5) + result) ^ string[c];
-  }
-  return result;
-}
-
+static uint64_t hostHashValue = 0;
 /* Generate a hash of the unique identifying string for this host
  * that will be unique for both bare-metal and container instances
  * Equivalent of a hash of;
@@ -83,7 +75,7 @@ uint64_t getHash(const char* string, int n) {
  * This string can be overridden by using the NCCL_HOSTID env var.
  */
 #define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
-uint64_t getHostHash(void) {
+static void getHostHashOnce() {
   char hostHash[1024];
   const char *hostId;
 
@@ -103,8 +95,8 @@ uint64_t getHostHash(void) {
         strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1);
         free(p);
       }
+      fclose(file);
     }
-    fclose(file);
   }
 
   // Make sure the string is terminated
@@ -112,7 +104,12 @@ uint64_t getHostHash(void) {
 
   TRACE(NCCL_INIT,"unique hostname '%s'", hostHash);
 
-  return getHash(hostHash, strlen(hostHash));
+  hostHashValue = getHash(hostHash, strlen(hostHash));
+}
+uint64_t getHostHash(void) {  // returns hostHashValue, which getHostHashOnce() computes on the first call
+  static pthread_once_t hostHashOnce = PTHREAD_ONCE_INIT;
+  (void)pthread_once(&hostHashOnce, getHostHashOnce);  // result ignored; hostHashValue stays 0 if the routine never ran
+  return hostHashValue;
 }
 
 /* Generate a hash of the unique identifying string for this process
diff --git a/src/nccl.h.in b/src/nccl.h.in
index 9efdf9fc1b..431ecb5546 100644
--- a/src/nccl.h.in
+++ b/src/nccl.h.in
@@ -168,6 +168,13 @@ ncclResult_t pncclCommAbort(ncclComm_t comm);
 ncclResult_t  ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
 ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
 
+/* Creates a new communicator (multi thread/process version), similar to ncclCommInitRankConfig.
+ * Allows the use of more than one ncclUniqueId (up to one per rank), indicated by nId, to accelerate the init operation.
+ * The number of ncclUniqueIds and their order must be the same for every rank.
+ */
+ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commIds, ncclConfig_t* config);
+ncclResult_t pncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commIds, ncclConfig_t* config);
+
 /* Returns a string for each error code. */
 const char*  ncclGetErrorString(ncclResult_t result);
 const char* pncclGetErrorString(ncclResult_t result);
diff --git a/src/net.cc b/src/net.cc
index 0f5d336ea5..97a8c73816 100644
--- a/src/net.cc
+++ b/src/net.cc
@@ -355,6 +355,8 @@ static void* tryOpenLib(char* name, int* err, char* errStr) {
   if (nullptr == handle) {
     strncpy(errStr, dlerror(), MAX_STR_LEN);
     errStr[MAX_STR_LEN] = '\0';
+    // "handle" and "name" won't be NULL at the same time.
+    // coverity[var_deref_model]
     if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) {
       *err = ENOENT;
     }
@@ -422,11 +424,10 @@ static int netPluginStatus = netPluginLoadReady;
 
 ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) {
   char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 };
-  if (netPluginLoadFailed == netPluginStatus) {
-    return ncclSuccess;
-  }
-
   pthread_mutex_lock(&netPluginLock);
+  if (netPluginLoadFailed == netPluginStatus) {
+    goto exit;
+  }
   if (netPluginLoadSuccess == netPluginStatus) {
     ++netPluginRefCount;
     goto exit;
diff --git a/src/proxy.cc b/src/proxy.cc
index eef71a5653..5e657c0a4a 100644
--- a/src/proxy.cc
+++ b/src/proxy.cc
@@ -8,18 +8,21 @@
 #include "info.h"
 #include "collectives.h"
 #include "socket.h"
-#include "shm.h"
+#include "shmutils.h"
 #include "profiler.h"
 #define ENABLE_TIMER 0
 #include "timer.h"
+#include "profiler.h"
 #include "transport.h"
 
 #include <sys/syscall.h>
 #include <assert.h>
 #include <unistd.h>
 #include <sys/time.h>
+#include <sched.h>
 
 enum { proxyRecv=0, proxySend=1 };
+void* ncclProxyServiceUDS(void* _args);
 
 static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
   if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true;
@@ -67,8 +70,10 @@ static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, voi
         return ncclInternalError;
       }
 
-      memcpy(elem->respBuff, respBuff, respSize);
-      free(respBuff);
+      if (respSize > 0) {
+        memcpy(elem->respBuff, respBuff, respSize);
+        free(respBuff);
+      }
       elem->done = true;
       elem->res  = res;
       return ncclSuccess;
@@ -360,12 +365,17 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr
   sub->nsteps = op->nsteps;
   sub->nbytes = op->nbytes;
   sub->offset = 0;
-  sub->peer = op->root;
+  sub->peer = op->peer;
   sub->reg = op->reg;
   sub->sendMhandle = op->sendMhandle;
   sub->recvMhandle = op->recvMhandle;
   sub->sendbuff = op->sendbuff;
   sub->recvbuff = op->recvbuff;
+  sub->eActivationMask = op->eActivationMask;
+  sub->taskEventHandle = op->taskEventHandle;
+  sub->rank = op->rank;
+  args->pid = op->pid;
+  args->profilerContext = op->profilerContext;
   args->nsubs = subIndex+1;
   if (subIndex) {
     if ((args->sliceSteps != op->sliceSteps) ||
@@ -527,6 +537,7 @@ static ncclResult_t SaveProxy(struct ncclComm* comm, struct ncclChannel* channel
 
   if (justInquire) *justInquire = true;
   else {
+    op->peer = peer;
     NCCLCHECK(ncclLocalOpAppend(comm, &connector->proxyConn, op));
   }
   return ncclSuccess;
@@ -588,6 +599,64 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool
       NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.treeDown[2], op, 0, justInquire));
       NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeUp, op, 0, justInquire));
     } break;
+  case ncclPatternPatUp: {
+      // Run full algorithm to count the number of steps for each peer.
+      // Per-dimension counters live on the stack: nothing to free, so early NCCLCHECK returns cannot leak them.
+      int nstepsSend[8*sizeof(int)] = { 0 }, nstepsRecv[8*sizeof(int)] = { 0 };
+      const int rank = comm->rank, nranks = comm->nRanks;
+      if ((size_t)log2Up(nranks) > 8*sizeof(int)) return ncclInternalError;
+      const ssize_t size = op->nbytes/comm->nRanks;
+      PatRSAlgorithm<char> algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks);
+      int last = 0;
+      while (last == 0) {
+        int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, postRecv, postSend, nelem;
+        size_t inpIx, outIx;
+        algo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend, last);
+        if (recvDim != -1 && postRecv) nstepsRecv[recvDim]++;
+        if (sendDim != -1 && postSend) nstepsSend[sendDim]++;
+      }
+      for (int i=0; i<log2Up(nranks); i++) {
+        if (nstepsSend[i]) {
+          int sendPeer = (rank + (1<<i)) % nranks;
+          op->nsteps = nstepsSend[i];
+          NCCLCHECK(SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire));
+        }
+        if (nstepsRecv[i]) {
+          int recvPeer = (rank - (1<<i) + nranks) % nranks;
+          op->nsteps = nstepsRecv[i];
+          NCCLCHECK(SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire));
+        }
+      }
+    } break;
+  case ncclPatternPatDown: {
+      // Run full algorithm to count the number of steps for each peer.
+      // Per-dimension counters live on the stack: nothing to free, so early NCCLCHECK returns cannot leak them.
+      int nstepsSend[8*sizeof(int)] = { 0 }, nstepsRecv[8*sizeof(int)] = { 0 };
+      const int rank = comm->rank, nranks = comm->nRanks;
+      if ((size_t)log2Up(nranks) > 8*sizeof(int)) return ncclInternalError;
+      const ssize_t size = op->nbytes/comm->nRanks;
+      PatAGAlgorithm<char> algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks);
+      int last = 0;
+      while (last == 0) {
+        int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem;
+        size_t inpIx, outIx;
+        algo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend, last);
+        if (recvDim != -1 && postRecv) nstepsRecv[recvDim]++;
+        if (sendDim != -1 && postSend) nstepsSend[sendDim]++;
+      }
+      for (int i=0; i<log2Up(nranks); i++) {
+        if (nstepsSend[i]) {
+          int sendPeer = (rank - (1<<i) + nranks) % nranks;
+          op->nsteps = nstepsSend[i];
+          NCCLCHECK(SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire));
+        }
+        if (nstepsRecv[i]) {
+          int recvPeer = (rank + (1<<i)) % nranks;
+          op->nsteps = nstepsRecv[i];
+          NCCLCHECK(SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire));
+        }
+      }
+    } break;
   case ncclPatternSend:
   case ncclPatternRecv: {
       if (op->root == comm->rank) return ncclSuccess;
@@ -657,9 +726,9 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int
   if (state->opsPool == NULL) return ncclInternalError;
   struct ncclProxyOpsPool* pool = state->opsPool;
 
-  struct ncclProxyArgs profArgs; // Only used for profiling purposes
   if (state->nextOps != -1) goto process_nextops;
 
+  void* eHandle;
   // If we have ops to progress, no need to block waiting for something to arrive or even wait for the lock
   // to be available. Exit, continue progress, and come back later.
   if (state->active != NULL && (pool->nextOps == -1 || pthread_mutex_trylock(&pool->mutex) != 0)) return ncclSuccess;
@@ -667,10 +736,11 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int
   if (state->active == NULL) {
     pthread_mutex_lock(&pool->mutex);
     while (pool->nextOps == -1 && !state->stop) {
-      struct ncclProxyArgs profArgs; // Only used for profiling purposes
-      ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileSleep);
+      ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle);
+      ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlSleep);
       pthread_cond_wait(&pool->cond, &pool->mutex);
-      ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileWakeup);
+      ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlWakeup);
+      ncclProfilerStopProxyCtrlEvent(eHandle);
     }
     if (state->stop) { // We might have been woken up to stop.
       pthread_mutex_unlock(&pool->mutex);
@@ -684,7 +754,8 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int
   if (state->nextOps == -1) return ncclInternalError;
 
 process_nextops:
-  ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileAppend);
+  ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle);
+  ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlAppend);
   TIME_START(2);
   int freeOp[NCCL_MAX_LOCAL_RANKS];
   int freeOpEnd[NCCL_MAX_LOCAL_RANKS];
@@ -720,6 +791,10 @@ process_nextops:
     if (freeOp[i] == -1) continue;
     int newFree = freeOp[i];
     int oldFree = pool->freeOps[i];
+    // Coverity gets confused by the complex code structure here.  The previous "for" loop ensures that freeOpEnd[i]
+    // is initialized so long as freeOp[i] is initialized (is not -1).  In the current loop we filter out uninitialized
+    // freeOp[i], hence ensuring that freeOpEnd[i] is also initialized.
+    // coverity[uninit_use:FALSE]
     pool->ops[freeOpEnd[i]].next = oldFree;
     if (oldFree == -1) {
       // Nothing for the main thread to consume, we can set it.
@@ -735,8 +810,8 @@ process_nextops:
       }
     }
   }
-  profArgs.opCount = *added;
-  ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileAppendEnd);
+  ncclProfilerRecordProxyCtrlEventState(eHandle, *added, ncclProfilerProxyCtrlAppendEnd);
+  ncclProfilerStopProxyCtrlEvent(eHandle);
   TIME_STOP(2);
   return ncclSuccess;
 }
@@ -758,6 +833,7 @@ static int setProxyThreadContext(struct ncclProxyState* proxyState) {
       if (CUPFN(cuCtxCreate) == nullptr || CUPFN(cuCtxDestroy) == nullptr || CUPFN(cuCtxSetCurrent) == nullptr) {
         WARN("Unable to create thread context due to old driver, disabling.");
         createThreadContext = 0;
+        goto exit;
       }
     }
   }
@@ -767,15 +843,17 @@ static int setProxyThreadContext(struct ncclProxyState* proxyState) {
                             NULL, 0, CU_CTX_SCHED_SPIN|CU_CTX_MAP_HOST, proxyState->cudaDev)) != CUDA_SUCCESS) {
         WARN("Failed to create CUDA context on device %d", proxyState->cudaDev);
         createThreadContext = 0;
+        goto exit;
       }
     } else {
       if (CUPFN(cuCtxSetCurrent(proxyState->cudaCtx)) != CUDA_SUCCESS) {
         WARN("Failed to set CUDA context on device %d", proxyState->cudaDev);
-        return 0;
+        goto exit;
       }
-      return 1;
     }
+    return 1;
   }
+exit:
 #endif
   return 0;
 }
@@ -787,12 +865,14 @@ NCCL_PARAM(ProgressAppendOpFreq, "PROGRESS_APPENDOP_FREQ", 8);
 void* ncclProxyProgress(void *proxyState_) {
   struct ncclProxyState* proxyState = (struct ncclProxyState*)proxyState_;
   if (setProxyThreadContext(proxyState)) {
-    INFO(NCCL_INIT, "[Proxy Progress] Created CUDA context on device %d", proxyState->cudaDev);
+    INFO(NCCL_INIT, "[Proxy Progress] Set CUDA context on device %d", proxyState->cudaDev);
   } else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) {
     WARN("[Proxy Progress] Failed to set CUDA device %d", proxyState->cudaDev);
   }
   // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
 
+  INFO(NCCL_INIT, "[Proxy Progress] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu());
+
   struct ncclProxyProgressState* state = &proxyState->progressState;
   state->nextOps = -1;
   const int sig = ncclParamProxyDumpSignal();
@@ -809,9 +889,7 @@ void* ncclProxyProgress(void *proxyState_) {
    * ncclParamProgressAppendOpFreq(). If they are equal, we will append proxy ops. This will decrease the
    * frequency of calling ncclProxyGetPostedOps() and reduce the perf impact. */
   int proxyOpAppendCounter = 0;
-  struct ncclProxyArgs profArgs; // Only used for profiling purposes
-  while ((state->stop == 0 || (state->stop == 1 && state->active)) &&
-         __atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) == 0) {
+  while (state->stop == 0 || (state->stop == 1 && state->active)) {
     int idle = 1;
     ncclResult_t ret = progressOps(proxyState, state, state->active, &idle);
     if (ret != ncclSuccess) {
@@ -819,8 +897,11 @@ void* ncclProxyProgress(void *proxyState_) {
       INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret);
       continue;
     }
-    if (lastIdle == 0 && idle == 1) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileIdle);
-    if (lastIdle == 1 && idle == 0) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileActive);
+    void* eHandle;
+    ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle);
+    if (lastIdle == 0 && idle == 1) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlIdle);
+    if (lastIdle == 1 && idle == 0) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlActive);
+    ncclProfilerStopProxyCtrlEvent(eHandle);
     if (idle || (++proxyOpAppendCounter == ncclParamProgressAppendOpFreq())) {
       int added = 0;
       proxyOpAppendCounter = 0;
@@ -860,7 +941,7 @@ ncclResult_t ncclProxyStart(struct ncclComm* comm) {
 static ncclResult_t ncclProxyProgressCreate(struct ncclProxyState* proxyState) {
   struct ncclProxyProgressState* state = &proxyState->progressState;
   if (!state->thread) {
-    pthread_create(&state->thread, NULL, ncclProxyProgress, proxyState);
+    PTHREADCHECK(pthread_create(&state->thread, NULL, ncclProxyProgress, proxyState), "pthread_create");
     ncclSetThreadName(state->thread, "NCCL Progress%2d", proxyState->tpLocalnRanks);
   }
   return ncclSuccess;
@@ -875,7 +956,7 @@ ncclResult_t ncclProxyProgressDestroy(struct ncclProxyState* proxyState) {
     state->stop = 1;
     pthread_cond_signal(&state->opsPool->cond);
     pthread_mutex_unlock(&state->opsPool->mutex);
-    pthread_join(state->thread, NULL);
+    PTHREADCHECK(pthread_join(state->thread, NULL), "pthread_join");
   }
 
   // Free off any memory allocated for the proxy arg pools
@@ -885,7 +966,6 @@ ncclResult_t ncclProxyProgressDestroy(struct ncclProxyState* proxyState) {
     state->pools = next;
   }
 
-  ncclProfilingDump();
   TIME_PRINT("Proxy");
   return ncclSuccess;
 }
@@ -962,23 +1042,17 @@ struct ncclProxyInitResp {
   char devShmPath[6]; // "XXXXXX" - May or may not be set
 };
 
-ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int tpProxyRank, struct ncclProxyConnector* proxyConn) {
+ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn) {
   struct ncclSocket* sock;
-  int ready, proxyRank = -1;
+  int ready;
   struct ncclProxyState* sharedProxyState = comm->proxyState;
+  int tpProxyRank = comm->topParentRanks[proxyRank];
 
-  // Keep one connection per local rank
-  for (int i = 0; i < comm->localRanks; ++i) {
-    /* find the proxy rank in comm. */
-    if (comm->topParentRanks[comm->localRankToRank[i]] == tpProxyRank) {
-      proxyRank = comm->localRankToRank[i];
-      break;
-    }
-  }
   proxyConn->sameProcess = comm->peerInfo[proxyRank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
   // Keep one connection per local rank
   proxyConn->connection = NULL;
   proxyConn->tpRank = tpProxyRank;
+  proxyConn->rank = proxyRank;
   if (sharedProxyState->peerSocks == NULL) {
     NCCLCHECK(ncclCalloc(&sharedProxyState->peerSocks, comm->sharedRes->tpNLocalRanks));
     NCCLCHECK(ncclCalloc(&sharedProxyState->proxyOps, comm->sharedRes->tpNLocalRanks));
@@ -1020,68 +1094,93 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
       proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1;
     }
   }
+  proxyConn->initialized = true;
   INFO(NCCL_NET|NCCL_PROXY, "Connected to proxy localRank %d -> connection %p", proxyConn->tpLocalRank, proxyConn->connection);
   return ncclSuccess;
 }
 
 // UDS support
-ncclResult_t ncclProxyCallBlockingUDS(struct ncclComm* comm, int tpRank, int type, void* reqBuff, int reqSize, void* respBuff, int respSize, int *respFd) {
+ncclResult_t ncclProxyCallBlockingUDS(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize, int* reqFd, int *respFd) {
   ncclResult_t res = ncclSuccess;
   struct ncclIpcSocket ipcSock = { 0 };
   void *opId;
   NCCLCHECK(getRandomData(&opId, sizeof(opId)));
+  int reqFdtmp = -1;
 
   int rank = comm->topParentLocalRanks[comm->localRank];
   struct ncclProxyState* sharedProxyState = comm->proxyState;
-  uint64_t pidHash = sharedProxyState->peerAddressesUDS[tpRank];
+  uint64_t pidHash = sharedProxyState->peerAddressesUDS[proxyConn->tpRank];
 
   INFO(NCCL_PROXY, "ProxyCall UDS comm %p rank %d tpRank %d(%lx) reqSize %d respSize %d respFd %p opId %p",
-       comm, rank, tpRank, pidHash, reqSize, respSize, respFd, opId);
+       comm, rank, proxyConn->tpRank, pidHash, reqSize, respSize, respFd, opId);
 
   // cuMem: Create a UDS socket to receive the response
   NCCLCHECK(ncclIpcSocketInit(&ipcSock, rank, (uint64_t)opId, comm->abortFlag));
 
+  if (reqFd) {
+    reqFdtmp = *reqFd;
+  } else {
+    // give a dummy fd for the other side of UDS socket
+    NCCLCHECK(ncclIpcSocketGetFd(&ipcSock, &reqFdtmp));
+  }
+
   ncclIpcHdr hdr;
   hdr.type = type;
   hdr.rank = rank;
   hdr.reqSize = reqSize;
   hdr.respSize = respSize;
   hdr.opId = opId;
+
   assert(reqSize <= sizeof(hdr.data));
   memcpy(&hdr.data, reqBuff, reqSize);
-  NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &hdr, sizeof(hdr), -1, tpRank, pidHash), res, error);
+  NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &hdr, sizeof(hdr), reqFdtmp, proxyConn->tpRank, pidHash), res, error);
   NCCLCHECKGOTO(ncclIpcSocketRecvMsg(&ipcSock, respBuff, respSize, respFd), res, error);
   NCCLCHECKGOTO(ncclIpcSocketClose(&ipcSock), res, error);
 
   INFO(NCCL_PROXY, "ProxyCall UDS comm %p rank %d tpRank %d(%lx) reqSize %d respSize %d respFd %d opId %p - DONE",
-       comm, rank, tpRank, pidHash, reqSize, respSize, (respFd ? *respFd : -1), opId);
+       comm, rank, proxyConn->tpRank, pidHash, reqSize, respSize, (respFd ? *respFd : -1), opId);
 
   return res;
 
 error:
   NCCLCHECK(ncclIpcSocketClose(&ipcSock));
-  WARN("ncclProxyCallBlockingUDS call to tpRank %d(%lx) failed : %d", tpRank, pidHash, res);
+  WARN("ncclProxyCallBlockingUDS call to tpRank %d(%lx) failed : %d", proxyConn->tpRank, pidHash, res);
   return res;
 }
 
 // cuMem API support
 // The request/response is sent out-of-band using ncclIpcSocket for this specific command
-ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int tpRank, void *handle, int* convertedFd) {
+ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int proxyRank, void *handle, int* convertedFd) {
   ncclResult_t ret = ncclSuccess;
 
   // Request the allocation of a UDS fd for the handle
-  NCCLCHECKGOTO(ncclProxyCallBlockingUDS(comm, tpRank, ncclProxyMsgGetFd, handle, sizeof(CUmemGenericAllocationHandle), NULL, 0, convertedFd), ret, error);
+  if (comm->gproxyConn[proxyRank].initialized == false) {
+    NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, proxyRank, &comm->gproxyConn[proxyRank]), ret, error);
+  }
+  NCCLCHECKGOTO(ncclProxyCallBlockingUDS(comm, &comm->gproxyConn[proxyRank], ncclProxyMsgGetFd, handle, sizeof(CUmemGenericAllocationHandle), NULL, 0, NULL, convertedFd), ret, error);
 
   // We have now received the converted fd over UDS
-  INFO(NCCL_PROXY, "UDS: ClientGetFd handle 0x%lx tpRank %d returned fd %d", *(uint64_t*)handle, tpRank, *convertedFd);
+  INFO(NCCL_PROXY, "UDS: ClientGetFd handle 0x%lx tpRank %d returned fd %d sameProcess %d", *(uint64_t*)handle, comm->topParentRanks[proxyRank], *convertedFd, comm->gproxyConn[proxyRank].sameProcess);
 
   return ret;
 
 error:
-  WARN("ncclProxyClientGetFd call to tpRank %d handle 0x%lx failed : %d", tpRank, *(uint64_t*)handle, ret);
+  WARN("ncclProxyClientGetFd call to tpRank %d handle 0x%lx failed : %d", comm->topParentRanks[proxyRank], *(uint64_t*)handle, ret);
   return ret;
 }
 
+ncclResult_t ncclProxyClientQueryFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int localFd, int* rmtFd) {
+  ncclResult_t ret = ncclSuccess;  // sends localFd with a QueryFd request; the int reply is written to *rmtFd
+  NCCLCHECKGOTO(ncclProxyCallBlockingUDS(comm, proxyConn, ncclProxyMsgQueryFd, NULL, 0, (void*)rmtFd, sizeof(int), &localFd, NULL), ret, fail);
+  // We have now received the converted fd over UDS. Logged on success only: after a failure *rmtFd may be unset.
+  INFO(NCCL_PROXY, "UDS: ClientQueryFd localFd %d tpRank %d remote fd %d sameProcess %d", localFd, proxyConn->tpRank, *rmtFd, proxyConn->sameProcess);
+exit:
+  return ret;
+fail:
+  WARN("ncclProxyClientQueryFdBlocking call to tpRank %d localFd %d failed : %d", proxyConn->tpRank, localFd, ret);
+  goto exit;
+}
+
 const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "GetFd" };
 ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId) {
   struct ncclSocket* sock;
@@ -1091,7 +1190,6 @@ ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector
   if (sharedProxyState->peerSocks == NULL) return ncclInternalError;
 
   sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank;
-  if (sock == NULL) return ncclInternalError;
 
   NCCLCHECKGOTO(ncclSocketSend(sock, &type, sizeof(int)), ret, error);
   NCCLCHECKGOTO(ncclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error);
@@ -1267,6 +1365,22 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr
   return ncclSuccess;
 }
 
+static ncclResult_t proxyQueryFd(struct ncclProxyState* proxyState, int rank, void *opId, int rmtFd) {
+#if CUDART_VERSION >= 11030
+  struct ncclIpcSocket ipcSock = { 0 };
+  uint64_t hash = (uint64_t) opId;
+  ncclResult_t ret = ncclSuccess;
+  // Reply to `rank` over a temporary UDS socket: rmtFd is sent as the int payload and also as the fd argument.
+  NCCLCHECKGOTO(ncclIpcSocketInit(&ipcSock, proxyState->tpRank, hash^1, proxyState->abortFlag), ret, exit);
+  NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &rmtFd, sizeof(int), rmtFd, rank, hash), ret, exit);
+exit:
+  NCCLCHECK(ncclIpcSocketClose(&ipcSock));
+  return ret;  // propagate init/send failures (as proxyGetFd does) instead of masking them with ncclSuccess
+#else
+  return ncclInternalError;
+#endif
+}
+
 // cuMem API support
 static ncclResult_t proxyGetFd(struct ncclProxyState* proxyState, int rank, void *opId, uint64_t handle) {
 #if CUDART_VERSION >= 11030
@@ -1286,7 +1400,7 @@ static ncclResult_t proxyGetFd(struct ncclProxyState* proxyState, int rank, void
 error:
   NCCLCHECK(ncclIpcSocketClose(&ipcSock));
   // We can now safely close the exported fd
-  (void) close(fd);
+  SYSCHECK(close(fd), "close");
   return ret;
 #else
   return ncclInternalError;
@@ -1352,30 +1466,37 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclP
 }
 
 static ncclResult_t proxyServiceInitOp(int type, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclProxyState* proxyState, int* asyncOpCount) {
+  ncclResult_t ret = ncclSuccess;
   struct ncclSocket* sock = &peer->sock;
   struct ncclProxyAsyncOp* asyncOp;
   NCCLCHECK(ncclCalloc(&asyncOp, 1));
 
   asyncOp->type = type;
-  NCCLCHECK(ncclSocketRecv(sock, &asyncOp->connection, sizeof(void*)));
+  NCCLCHECKGOTO(ncclSocketRecv(sock, &asyncOp->connection, sizeof(void*)), ret, fail);
 
-  NCCLCHECK(ncclSocketRecv(sock, &asyncOp->reqSize, sizeof(int)));
-  NCCLCHECK(ncclSocketRecv(sock, &asyncOp->respSize, sizeof(int)));
+  NCCLCHECKGOTO(ncclSocketRecv(sock, &asyncOp->reqSize, sizeof(int)), ret, fail);
+  NCCLCHECKGOTO(ncclSocketRecv(sock, &asyncOp->respSize, sizeof(int)), ret, fail);
   if (asyncOp->reqSize) {
-    NCCLCHECK(ncclCalloc(&asyncOp->reqBuff, asyncOp->reqSize));
-    NCCLCHECK(ncclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize));
+    NCCLCHECKGOTO(ncclCalloc(&asyncOp->reqBuff, asyncOp->reqSize), ret, fail);
+    NCCLCHECKGOTO(ncclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize), ret, fail);
   }
 
   // Store opId for completion response
-  NCCLCHECK(ncclSocketRecv(sock, &asyncOp->opId, sizeof(asyncOp->opId)));
+  NCCLCHECKGOTO(ncclSocketRecv(sock, &asyncOp->opId, sizeof(asyncOp->opId)), ret, fail);
 
-  if (asyncOp->respSize) NCCLCHECK(ncclCalloc(&asyncOp->respBuff, asyncOp->respSize));
+  if (asyncOp->respSize) NCCLCHECKGOTO(ncclCalloc(&asyncOp->respBuff, asyncOp->respSize), ret, fail);
 
   asyncProxyOpEnqueue(peer, asyncOp);
 
   (*asyncOpCount)++;
   NCCLCHECK(proxyProgressAsync(asyncOp, proxyState, asyncOpCount, peer, connectionPool));
-  return ncclSuccess;
+exit:
+  return ret;
+fail:
+  if (asyncOp->reqBuff) free(asyncOp->reqBuff);
+  if (asyncOp->respBuff) free(asyncOp->respBuff);
+  free(asyncOp);
+  goto exit;
 }
 
 #include <poll.h>
@@ -1395,6 +1516,12 @@ static bool proxyMatchOpType(int type) {
   }
 }
 
+enum {
+  PROXY_RUNNING = 0,
+  PROXY_STOP = 1,
+  PROXY_ABORT = 2
+};
+
 void* ncclProxyService(void* _args) {
   struct ncclProxyState* proxyState =  (struct ncclProxyState*) _args;
   // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
@@ -1405,6 +1532,8 @@ void* ncclProxyService(void* _args) {
   }
   // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
 
+  INFO(NCCL_INIT, "[Proxy Service] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu());
+
   // Prepare poll descriptor
   struct ncclProxyConnectionPool connectionPool;
   connectionPool.pools = NULL;
@@ -1426,13 +1555,13 @@ void* ncclProxyService(void* _args) {
 
   int maxnpeers = 0;
   int npeers = 0;
-  int stop = 0;
+  int stop = PROXY_RUNNING;
   int asyncOpCount = 0;
-  while (stop == 0 || (stop == 1 && npeers > 0)) {
+  while (stop == PROXY_RUNNING || npeers > 0) {
     /* Even if local comm aborts, we cannot let proxy thread exit if we still have peer
      * connections. Need to wait until all other related comms call abort and safely exit
      * together, or we could face segmentation fault. */
-    if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) != 0) stop = 1;
+    if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) != 0) stop = PROXY_ABORT;
     /* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */
     int ret;
     do {
@@ -1474,10 +1603,14 @@ void* ncclProxyService(void* _args) {
       if (pollfds[s].fd == -1) continue;
 
       // Progress all ops for this ncclProxyLocalPeer
+      if (stop == PROXY_ABORT && ncclCuMemEnable() && ncclCuMemHostEnable() && !proxyState->directMode) closeConn = 1;
       ncclProxyAsyncOp* op = peer->asyncOps;
       while (op != nullptr) {
         ncclProxyAsyncOp* opnext = op->next; /* in case op is freed in proxyProgressAsync */
         type = op->type;
+        // Coverity gets confused here by complex code structure.  Yes, connectionPool.pools gets dereferenced, and
+        // while calling proxyProgressAsync() connectionPool.pools is NULL, but that changes before it's dereferenced.
+        // coverity[var_deref_model:FALSE]
         res = proxyProgressAsync(op, proxyState, &asyncOpCount, peer, &connectionPool);
         if (res == ncclSuccess || res == ncclInProgress) {
           op = opnext;
@@ -1494,14 +1627,15 @@ void* ncclProxyService(void* _args) {
         int closed;
         res = ncclSocketTryRecv(sock, &type, sizeof(int), &closed, false /*blocking*/);
         if (res != ncclSuccess && res != ncclInProgress) {
-          WARN("[Service thread] Could not receive type from localRank %d, res=%u, closed=%d", peer->tpLocalRank, res, closed);
+          if (!__atomic_load_n(proxyState->abortFlag, __ATOMIC_RELAXED))
+            WARN("[Service thread] Could not receive type from localRank %d, res=%u, closed=%d", peer->tpLocalRank, res, closed);
           closeConn = 1;
         } else if (closed) {
           INFO(NCCL_INIT|NCCL_NET|NCCL_PROXY, "[Service thread] Connection closed by localRank %d", peer->tpLocalRank);
           closeConn = 1;
         } else if (res == ncclSuccess) { // We received something from the sock
           if (type == ncclProxyMsgStop) {
-            stop = 1;
+            stop = PROXY_STOP;
             closeConn = 1;
           } else if (type == ncclProxyMsgClose) {
             closeConn = 1;
@@ -1518,12 +1652,13 @@ void* ncclProxyService(void* _args) {
         closeConn = 1;
       }
       if (res != ncclSuccess && res != ncclInProgress) {
-        WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", proxyState->tpRank, ncclProxyMsgTypeStr[type], peer->tpRank, res);
+        if (!__atomic_load_n(proxyState->abortFlag, __ATOMIC_RELAXED))
+          WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", proxyState->tpRank, ncclProxyMsgTypeStr[type], peer->tpRank, res);
         closeConn = 1;
       }
 
       if (closeConn) {
-        ncclSocketClose(sock);
+        (void)ncclSocketClose(sock);
 
         if (op != nullptr) {
           asyncProxyOpDequeue(peer, op);
@@ -1540,10 +1675,10 @@ void* ncclProxyService(void* _args) {
     WARN("[Proxy Service] proxyDestroy failed");
   }
   for (int s=0; s<maxnpeers; s++) {
-    ncclSocketClose(&peers[s].sock);
+    (void)ncclSocketClose(&peers[s].sock);
   }
   ncclProxyFreeConnections(&connectionPool, proxyState);
-  ncclSocketClose(proxyState->listenSock);
+  (void)ncclSocketClose(proxyState->listenSock);
   free(proxyState->listenSock);
   proxyOpsFree(proxyState);
   return NULL;
@@ -1553,12 +1688,17 @@ void* ncclProxyService(void* _args) {
 // Process a request on the UDS socket
 static ncclResult_t proxyUDSRecvReq(struct ncclProxyState* proxyState, int reqFd) {
   ncclIpcHdr hdr;
-  NCCLCHECK(ncclIpcSocketRecvMsg(&proxyState->ipcSock, &hdr, sizeof(hdr), NULL));
+  int rmtFd = -1;
+
+  NCCLCHECK(ncclIpcSocketRecvMsg(&proxyState->ipcSock, &hdr, sizeof(hdr), &rmtFd));
   if (hdr.type == ncclProxyMsgGetFd) {
     // cuMem API support
     uint64_t handle = *(uint64_t*)hdr.data;
     INFO(NCCL_PROXY, "proxyUDSRecvReq::ncclProxyMsgGetFd rank %d opId %p handle=0x%lx", hdr.rank, hdr.opId, handle);
     return proxyGetFd(proxyState, hdr.rank, hdr.opId, handle);
+  } else if (hdr.type == ncclProxyMsgQueryFd) {
+    INFO(NCCL_PROXY, "proxyUDSRecvReq::proxyQueryFd rank %d opId %p rmtFd %d", hdr.rank, hdr.opId, rmtFd);
+    return proxyQueryFd(proxyState, hdr.rank, hdr.opId, rmtFd);
   }
 
   return ncclInternalError;
@@ -1570,11 +1710,13 @@ void* ncclProxyServiceUDS(void* _args) {
   struct pollfd pollfds[1];
 
   if (setProxyThreadContext(proxyState)) {
-    INFO(NCCL_INIT, "[Proxy Service UDS] Created CUDA context on device %d", proxyState->cudaDev);
+    INFO(NCCL_INIT, "[Proxy Service UDS] Set CUDA context on device %d", proxyState->cudaDev);
   } else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) {
     WARN("[Proxy Service UDS] Failed to set CUDA device %d", proxyState->cudaDev);
   }
 
+  INFO(NCCL_INIT, "[Proxy Service UDS] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu());
+
   if (ncclIpcSocketGetFd(&proxyState->ipcSock, &pollfds[0].fd) != ncclSuccess) {
     WARN("[Proxy Service UDS] Get listenSock fd fails");
     return NULL;
@@ -1593,7 +1735,7 @@ void* ncclProxyServiceUDS(void* _args) {
     }
 
     // Check for stop/abort
-    if (proxyState->stop || *proxyState->abortFlag) break;
+    if (__atomic_load_n(&proxyState->stop, __ATOMIC_ACQUIRE) || __atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE)) break;
 
     if (pollfds[0].revents) {
       // A request was seen on the UDS fd
@@ -1638,14 +1780,16 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
     proxyState->dmaBufSupport = comm->dmaBufSupport;
     proxyState->ncclNet = comm->ncclNet;
     proxyState->ncclCollNet = comm->ncclCollNet;
+    proxyState->profilerContext = comm->profilerContext;
+    proxyState->directMode = comm->directMode;
     memcpy(proxyState->buffSizes, comm->buffSizes, sizeof(comm->buffSizes));
 
-    pthread_create(&comm->proxyState->thread, NULL, ncclProxyService, comm->proxyState);
+    PTHREADCHECK(pthread_create(&comm->proxyState->thread, NULL, ncclProxyService, comm->proxyState), "pthread_create");
     ncclSetThreadName(comm->proxyState->thread, "NCCL Service %2d", comm->cudaDev);
 
     // UDS support
     INFO(NCCL_PROXY, "UDS: Creating service thread comm %p rank %d", comm, comm->rank);
-    pthread_create(&comm->proxyState->threadUDS, NULL, ncclProxyServiceUDS, comm->proxyState);
+    PTHREADCHECK(pthread_create(&comm->proxyState->threadUDS, NULL, ncclProxyServiceUDS, comm->proxyState), "pthread_create");
     ncclSetThreadName(comm->proxyState->threadUDS, "NCCL UDS Service %2d", comm->cudaDev);
   }
   return ncclSuccess;
@@ -1658,17 +1802,17 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
     if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) {
       if (comm->proxyState->threadUDS) {
         // UDS support
-        comm->proxyState->stop = 1;
+        __atomic_store_n(&comm->proxyState->stop, 1, __ATOMIC_RELEASE);
       }
 
-      if (sharedProxyState->peerAddresses) {
+      if (*comm->abortFlag == 0 && sharedProxyState->peerAddresses) {
         struct ncclSocket sock;
         int type = ncclProxyMsgStop;
         ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag);
         if (ncclSocketConnect(&sock) == ncclSuccess) {
-          ncclSocketSend(&sock, &type, sizeof(int));
+          (void)ncclSocketSend(&sock, &type, sizeof(int));
         }
-        ncclSocketClose(&sock);
+        (void)ncclSocketClose(&sock);
       }
 
       if (sharedProxyState->peerSocks) {
@@ -1686,7 +1830,7 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
               }
             }
             int type = ncclProxyMsgClose;
-            ncclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int));
+            (void)ncclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int));
             NCCLCHECK(ncclSocketClose(sharedProxyState->peerSocks + i));
           }
         }
@@ -1700,13 +1844,15 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
 ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
   struct ncclProxyState* sharedProxyState = comm->sharedRes->proxyState;
 
-  assert(sharedProxyState->refCount == 0);
-  free(sharedProxyState->peerAddresses);
-  free(sharedProxyState->peerAddressesUDS);
-  free(sharedProxyState->peerSocks);
-  free(sharedProxyState->proxyOps);
-  free(sharedProxyState->sharedDevMems);
-  expectedProxyResponseFree(sharedProxyState);
-  free(sharedProxyState);
+  if (sharedProxyState) {
+    assert(sharedProxyState->refCount == 0);
+    free(sharedProxyState->peerAddresses);
+    free(sharedProxyState->peerAddressesUDS);
+    free(sharedProxyState->peerSocks);
+    free(sharedProxyState->proxyOps);
+    free(sharedProxyState->sharedDevMems);
+    expectedProxyResponseFree(sharedProxyState);
+    free(sharedProxyState);
+  }
   return ncclSuccess;
 }
diff --git a/src/register.cc b/src/register.cc
index 90d429fe40..c4ca4b4a0c 100644
--- a/src/register.cc
+++ b/src/register.cc
@@ -26,8 +26,8 @@ ncclResult_t ncclNetDeregister(struct ncclComm* comm, struct ncclReg* reg) {
 
 ncclResult_t ncclNetRegister(struct ncclComm* comm, void* addr, size_t size, struct ncclReg* reg) {
   struct ncclRegCache* cache = &comm->regCache;
-  int netCount;
-  NCCLCHECK(ncclTopoGetNetCount(comm->topo, &netCount));
+  int netCount = 0;
+  if (comm->topo != NULL) NCCLCHECK(ncclTopoGetNetCount(comm->topo, &netCount));
   if (netCount == 0) return ncclSuccess;
 
   ncclResult_t ret = ncclSuccess;
@@ -105,7 +105,11 @@ ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, s
 NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1);
 
 ncclResult_t ncclRegister(struct ncclComm* comm, void* data, size_t size, void** handle) {
-  if (!ncclParamLocalRegister()) return ncclSuccess;
+  if (!ncclParamLocalRegister()) {
+    *handle = NULL;
+    return ncclSuccess;
+  }
+  INFO(NCCL_REG, "register comm %p buffer %p size %zi", comm, data, size);
   struct ncclRegCache* cache = &comm->regCache;
   uintptr_t pageSize = cache->pageSize;
   uintptr_t addr = (uintptr_t)data & -pageSize;
@@ -166,6 +170,10 @@ ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) {
   struct ncclReg* reg = (struct ncclReg*)handle;
   struct ncclRegCache* cache = &comm->regCache;
   int slot;
+  int saveDev;
+  if (handle == NULL) goto exit;
+  CUDACHECK(cudaGetDevice(&saveDev));
+  CUDACHECK(cudaSetDevice(comm->cudaDev));
   for (slot=0; slot<cache->population && cache->slots[slot] != reg; slot++);
   if (slot == cache->population) {
     WARN("Deregister: Could not find handle");
@@ -178,10 +186,19 @@ ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) {
     reg->regAddr = (CUdeviceptr)NULL;
   }
   if (reg->state & COLLNET_REG_COMPLETE) {
-    NCCLCHECK(ncclCollnetDeregBuffer(comm, reg->proxyconn, reg->collnetHandle));
+    NCCLCHECK(ncclCollnetDeregBuffer(comm, reg->collnetProxyconn, reg->collnetHandle));
+  }
+  if (reg->state & IPC_REG_COMPLETE) {
+    for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; ++i)
+      if (reg->ipcInfos[i])
+        NCCLCHECK(ncclIpcDeregBuffer(comm, reg->ipcInfos[i]));
+    if (reg->regIpcAddrs.hostPeerRmtAddrs) free(reg->regIpcAddrs.hostPeerRmtAddrs);
+    if (reg->regIpcAddrs.devPeerRmtAddrs) NCCLCHECK(ncclCudaFree(reg->regIpcAddrs.devPeerRmtAddrs));
   }
   free(reg);
   memmove(cache->slots+slot, cache->slots+slot+1, (cache->population-slot-1)*sizeof(struct ncclReg*));
   cache->population -= 1;
+  CUDACHECK(cudaSetDevice(saveDev));
+exit:
   return ncclSuccess;
 }
diff --git a/src/transport.cc b/src/transport.cc
index 5df47065bf..eeee7a24bf 100644
--- a/src/transport.cc
+++ b/src/transport.cc
@@ -28,7 +28,7 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
     struct ncclTransport *transport = ncclTransports[t];
     struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
     int ret = 0;
-    NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo));
+    NCCLCHECK(transport->canConnect(&ret, comm, graph, myInfo, peerInfo));
     if (ret) {
       connector->transportComm = transportComm;
       NCCLCHECK(transportComm->setup(comm, graph, myInfo, peerInfo, connect, connector, channelId, connIndex));
@@ -70,25 +70,52 @@ NCCL_PARAM(ConnectRoundMaxPeers, "CONNECT_ROUND_MAX_PEERS", 128);
 NCCL_PARAM(ReportConnectProgress, "REPORT_CONNECT_PROGRESS", 0);
 #include <sys/time.h>
 
+ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2pSupport, bool* directMode) {
+  bool supportFlag = true;
+  bool directFlag = false;
+  if (comm->localRanks == 1) {
+    supportFlag = false;
+  } else {
+    for (int i = 0; i < comm->localRanks; ++i) {
+      for (int j = i + 1; j < comm->localRanks; ++j) {
+        int ipeer = comm->localRankToRank[i];
+        int jpeer = comm->localRankToRank[j];
+        struct ncclPeerInfo* ipeerInfo = &comm->peerInfo[ipeer];
+        struct ncclPeerInfo* jpeerInfo = &comm->peerInfo[jpeer];
+        int canConnect = 0;
+        NCCLCHECK(ncclTransports[0]->canConnect(&canConnect, comm, NULL, ipeerInfo, jpeerInfo));
+        if (!canConnect && supportFlag == true) {
+          supportFlag = false;
+        }
+        if (ipeerInfo->hostHash == jpeerInfo->hostHash && ipeerInfo->pidHash == jpeerInfo->pidHash) directFlag = true;
+        if (!supportFlag && directFlag) break;
+      }
+    }
+  }
+  *intraNodeP2pSupport = supportFlag;
+  *directMode = directFlag;
+  return ncclSuccess;
+}
+
 ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) {
   // Stream used during transport setup; need for P2P pre-connect + CUDA Graph
   ncclResult_t ret = ncclSuccess;
   int highestType = TRANSPORT_UNDEFINED;  // track highest transport type
   struct ncclConnect** data; // Store intermediate send/recvData structs for connect
-  struct ncclConnect** recvData; // Points to entries inside data for given recv connection within a channel
-  struct ncclConnect** sendData; // Points to entries inside data for given send connection within a channel
+  struct ncclConnect** recvData = NULL; // Points to entries inside data for given recv connection within a channel
+  struct ncclConnect** sendData = NULL; // Points to entries inside data for given send connection within a channel
   int done = 0;
-
   int maxPeers = ncclParamConnectRoundMaxPeers();
-  NCCLCHECK(ncclCalloc(&data, maxPeers));
-  NCCLCHECK(ncclCalloc(&recvData, maxPeers));
-  NCCLCHECK(ncclCalloc(&sendData, maxPeers));
 
   struct timeval timeStart, timeLast;
   gettimeofday(&timeStart, NULL);
   timeLast = timeStart; // struct copy
   bool timeReported = false;
 
+  NCCLCHECK(ncclCalloc(&data, maxPeers));
+  NCCLCHECKGOTO(ncclCalloc(&recvData, maxPeers), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&sendData, maxPeers), ret, fail);
+
   NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
   // First time initialization
   for (int i=1; i<comm->nRanks; i++) {
@@ -104,7 +131,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
     // The next M entries contain sendData, connection information for send connections
     // It's not guaranteed that each entry of data has the same number of total or send/recv specific connections
     int p = i-(done+1);
-    if (recvMask || sendMask) NCCLCHECK(ncclCalloc(data+p, 2*MAXCHANNELS));
+    if (recvMask || sendMask) NCCLCHECKGOTO(ncclCalloc(data+p, 2*MAXCHANNELS), ret, fail);
     recvData[p] = data[p];
     int sendChannels = 0, recvChannels = 0;
     int type;
@@ -163,7 +190,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
               struct ncclConnector* conn = comm->channels[c].peers[sendPeer]->send + connIndex;
               // This connector hasn't completed connection yet
               if (conn->connected == 0) {
-                NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData[p] + sendDataOffset++, 1, comm->rank, conn), ret, fail);
+                NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData[p] + sendDataOffset, 1, comm->rank, conn), ret, fail);
                 if (ret == ncclSuccess) {
                   conn->connected = 1;
                   /* comm->channels[c].devPeers[sendPeer]->send[connIndex] is a device memory access. */
@@ -172,6 +199,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
                   allChannelsConnected = false;
                 }
               }
+              sendDataOffset++;
             }
             TIME_STOP(3);
 
@@ -181,7 +209,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
               struct ncclConnector* conn = comm->channels[c].peers[recvPeer]->recv + connIndex;
               // This connector hasn't completed connection yet
               if (conn->connected == 0) {
-                NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData[p] + recvDataOffset++, 1, comm->rank, conn), ret, fail);
+                NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData[p] + recvDataOffset, 1, comm->rank, conn), ret, fail);
                 if (ret == ncclSuccess) {
                   conn->connected = 1;
                   /* comm->channels[c].devPeers[recvPeer]->recv[connIndex] is a device memory access. */
@@ -190,6 +218,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
                   allChannelsConnected = false;
                 }
               }
+              recvDataOffset++;
             }
             TIME_STOP(4);
           }
@@ -198,7 +227,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
             data[p] = NULL;
           }
         }
-	if (ncclParamReportConnectProgress() && comm->rank == 0) {
+	if (ncclParamReportConnectProgress() && comm->rank == 0 && done > 0) {
           struct timeval now;
           gettimeofday(&now, NULL);
           if (((now.tv_sec - timeLast.tv_sec)*1.0 + (now.tv_usec-timeLast.tv_usec)*1e-6) > 1) {
@@ -236,34 +265,31 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
     int bootstrapTag = (i << 8) + (1 << 7) + (graph ? graph->id + 1 : 0);
     int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
     int sendPeer = (comm->rank + i) % comm->nRanks;
-    int flag = 0;
 
     if (recvPeer != sendPeer) {
-      if (comm->connectSend[sendPeer] != 0UL)
-        NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
-      if (comm->connectRecv[recvPeer] != 0UL)
-        NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
-
-      if (comm->connectSend[sendPeer] != 0UL)
-        NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
-      if (comm->connectRecv[recvPeer] != 0UL)
-        NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
+      if (comm->connectSend[sendPeer] != 0UL) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, NULL, 0), ret, fail);
+      if (comm->connectRecv[recvPeer] != 0UL) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, NULL, 0), ret, fail);
+      if (comm->connectSend[sendPeer] != 0UL) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, NULL, 0), ret, fail);
+      if (comm->connectRecv[recvPeer] != 0UL) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, NULL, 0), ret, fail);
     } else {
       if (comm->connectSend[sendPeer] != 0UL || comm->connectRecv[recvPeer] != 0UL) {
-        NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
-        NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
+        NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, NULL, 0), ret, fail);
+        NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, NULL, 0), ret, fail);
       }
     }
     comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0UL;
   }
 
-  free(data);
-  free(sendData);
-  free(recvData);
-
   if (highestTransportType != NULL) *highestTransportType = highestType;
   TIME_PRINT("P2P Setup/Connect");
 exit:
+  for(int i=0; i<maxPeers; ++i){
+    if(data[i]) free(data[i]);
+  }
+  free(data);
+  if (sendData) free(sendData);
+  if (recvData) free(recvData);
+
   NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream));
   NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream));
   return ret;
@@ -275,8 +301,8 @@ extern struct ncclTransport collNetTransport;
 
 // All ranks must participate in collNetSetup call
 // We do not NCCLCHECK this call because we would fall back to P2P network in case CollNet setup fails
-int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect) {
-  int fail = 1;
+bool ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect) {
+  ncclResult_t ret = ncclSuccess;
   int rank = comm->rank;
   int nranks = comm->nRanks;
   int nMasters = comm->nNodes;
@@ -297,24 +323,23 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
   struct ncclTransportComm* transportComm = (type == collNetRecv) ? &(collNetTransport.recv) : &(collNetTransport.send);
   conn->transportComm = transportComm;
   // setup
-  struct ncclConnect myConnect;
-  if (isMaster) {
-    NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type));
-  }
-  // prepare connect handles
-  ncclResult_t res;
+  struct ncclConnect myConnect = { 0 };
   struct {
     int isMaster;
     ncclConnect connect;
   } *allConnects = NULL;
   ncclConnect *masterConnects = NULL;
+  if (isMaster) {
+    NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type));
+  }
+  // prepare connect handles
   NCCLCHECK(ncclCalloc(&masterConnects, nMasters));
   if (type == collNetRecv) {  // recv side: AllGather
     // all ranks must participate
-    NCCLCHECK(ncclCalloc(&allConnects, nranks));
+    NCCLCHECKGOTO(ncclCalloc(&allConnects, nranks), ret, cleanup);
     allConnects[rank].isMaster = isMaster;
     memcpy(&(allConnects[rank].connect), &myConnect, sizeof(struct ncclConnect));
-    NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allConnects, sizeof(*allConnects)), res, cleanup);
+    NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allConnects, sizeof(*allConnects)), ret, cleanup);
     // consolidate
     int c = 0;
     for (int r = 0; r < nranks; r++) {
@@ -328,21 +353,20 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
   }
   // connect
   if (isMaster) {
-    NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, comm->node, conn), res, cleanup);
+    NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, comm->node, conn), ret, cleanup);
     struct ncclDevChannelPeer* devRoot;
-    CUDACHECKGOTO(cudaMemcpy(&devRoot, channel->devPeers + nranks, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost), res, cleanup);
+    CUDACHECKGOTO(cudaMemcpy(&devRoot, channel->devPeers + nranks, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost), ret, cleanup);
     struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? devRoot->recv + type : devRoot->send + type;
-    CUDACHECKGOTO(cudaMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice), res, cleanup);
+    CUDACHECKGOTO(cudaMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice), ret, cleanup);
   }
   if (isMaster && type == collNetRecv) {
     memcpy(connect, masterConnects+comm->node, sizeof(struct ncclConnect));
     TRACE(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, comm->node, nMasters, masterPeer);
   }
-  fail = 0;
 cleanup:
   if (allConnects != NULL) free(allConnects);
   if (masterConnects != NULL) free(masterConnects);
-  return fail;
+  return ret != ncclSuccess;
 }
 
 ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail) {
diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc
index ae1fe0fb56..7d2f298ae6 100644
--- a/src/transport/coll_net.cc
+++ b/src/transport/coll_net.cc
@@ -18,15 +18,15 @@ int64_t ncclParamGdrCopySyncEnable();
 int64_t ncclParamGdrCopyFlushEnable();
 
 struct collNetRecvConnectInfo {
-  int rank;
-  int nranks;
   collNetHandle_t collNetHandle;
 };
+static_assert(sizeof(collNetRecvConnectInfo) <= CONNECT_SIZE, "Collnet Recv Connect info is too large");
 
 struct collNetSendConnectInfo {
   void* mhandles[NCCL_NUM_PROTOCOLS];
   void* reqFifo;
 };
+static_assert(sizeof(collNetSendConnectInfo) <= CONNECT_SIZE, "Collnet Send Connect info is too large");
 
 #define COLLNET_GROUP_NSUBS 8
 #define COLLNET_MAX_GROUPS (NCCL_PROXY_MAX_SUBS/COLLNET_GROUP_NSUBS)
@@ -135,7 +135,7 @@ struct recvResources {
   int collNetRank;
 };
 
-static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+static ncclResult_t canConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
   // This transport cannot be used for p2p
   *ret = 0;
   return ncclSuccess;
@@ -154,15 +154,14 @@ struct setupReq {
 static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
   struct setupReq req = { 0 };
 
-  int proxyRank, tpProxyRank;
+  int proxyRank;
   int64_t netId;
   NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank));
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr));
   send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
 
   send->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
-  tpProxyRank = comm->topParentRanks[myInfo->rank];
-  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, tpProxyRank, &send->proxyConn));
+  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn));
   ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount);
   req.collNet = comm->collNetSharedRes;
   NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
@@ -175,7 +174,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
 static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
   struct setupReq req = { 0 };
 
-  int proxyRank, tpProxyRank;
+  int proxyRank;
   int64_t netId;
   NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank));
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 0, &req.useGdr));
@@ -184,8 +183,8 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
 
   recv->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
-  tpProxyRank = comm->topParentRanks[myInfo->rank];
-  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, tpProxyRank, &recv->proxyConn));
+  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn));
+  static_assert(sizeof(collNetRecvConnectInfo) <= sizeof(struct ncclConnect), "Collnet Recv Connect info is too big");
   struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
   ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount);
   req.collNet = comm->collNetSharedRes;
@@ -442,6 +441,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
 static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
   if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
   struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff;
+  static_assert(sizeof(collNetSendConnectInfo) <= sizeof(struct ncclConnect), "Collnet Send Connect info is too big");
   struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank);
 
   struct sendResources* resources = (struct sendResources*)(connection->transportResources);
@@ -1039,7 +1039,7 @@ ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* u
         NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyconn, ncclProxyMsgRegister, &info, sizeof(struct collnetRegInfo), &handle, sizeof(void*)), ret, fail);
         if (handle) {
           regRecord->state |= COLLNET_REG_COMPLETE;
-          regRecord->proxyconn = proxyconn;
+          regRecord->collnetProxyconn = proxyconn;
           *outHandle = regRecord->collnetHandle = handle;
           *outRegBufFlag = 1;
         }
@@ -1091,7 +1091,7 @@ ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* u
   record->size = buffSize;
   *outHandle = record->mhandle = handle;
   *outRegBufFlag = 1;
-  ncclIntruQueueEnqueue(cleanupQueue, &record->base);
+  ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)record);
   *nCleanupQueueElts += 1;
 
 exit:
@@ -1214,23 +1214,6 @@ ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm) {
   }
   NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 1, &highestTransportType1), ret, fail);
 
-  // Exchange highest intra-node transport type among ranks
-  // because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer
-  if (highestTransportType0 != TRANSPORT_UNDEFINED && highestTransportType1 != TRANSPORT_UNDEFINED) {
-    int highestTypes[NCCL_MAX_LOCAL_RANKS] = { TRANSPORT_UNDEFINED };
-
-    comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1;
-    NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int)), ret, fail);
-    for (int i = 0; i < comm->localRanks; i++) {
-      if (highestTypes[i] > comm->intraHighestTransportType)
-        comm->intraHighestTransportType = highestTypes[i];
-    }
-    if (comm->collNetSharedRes->intraHighestTransportType < comm->intraHighestTransportType)
-      comm->collNetSharedRes->intraHighestTransportType = comm->intraHighestTransportType;
-  } else if (comm->intraHighestTransportType == TRANSPORT_UNDEFINED) {
-    // reuse previous shared intraHighestTransportType
-    comm->intraHighestTransportType = comm->collNetSharedRes->intraHighestTransportType;
-  }
   INFO(NCCL_INIT, "rank %d Connected CollNet", comm->rank);
 
 exit:
diff --git a/src/transport/generic.cc b/src/transport/generic.cc
index a0efaab5c7..7fd7e59fbc 100644
--- a/src/transport/generic.cc
+++ b/src/transport/generic.cc
@@ -34,3 +34,26 @@ exit:
 fail:
   goto exit;
 }
+
+ncclResult_t ncclTransportPatConnect(struct ncclComm* comm) {
+  // Connect this rank to one peer in each direction for every power-of-two distance below nRanks.
+  ncclResult_t ret = ncclSuccess;
+  if (comm == NULL || comm->nRanks <= 1) return ret; // nothing to connect
+  for (int dist = 1; dist < comm->nRanks; dist <<= 1) {
+    int up = (comm->rank + dist) % comm->nRanks;
+    int down = (comm->rank + comm->nRanks - dist) % comm->nRanks;
+    for (int ch = 0; ch < comm->nChannels; ++ch) { // ReduceScatter
+      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, ch, 1, &up, 1, &down, 0), ret, fail);
+    }
+    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_TREE], 0), ret, fail);
+    for (int ch = 0; ch < comm->nChannels; ++ch) { // AllGather: same two peers, arguments swapped
+      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, ch, 1, &down, 1, &up, 0), ret, fail);
+    }
+    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_TREE], 0), ret, fail);
+  }
+  INFO(NCCL_INIT, "Connected binomial trees");
+exit:
+  return ret;
+fail:
+  goto exit;
+}
diff --git a/src/transport/net.cc b/src/transport/net.cc
index d5a585d428..00eca607d9 100644
--- a/src/transport/net.cc
+++ b/src/transport/net.cc
@@ -10,10 +10,11 @@
 #include "proxy.h"
 #include "collectives.h"
 #include "gdrwrap.h"
-#include "shm.h"
+#include "shmutils.h"
 #include "p2p.h"
 #include "profiler.h"
 #include "transport.h"
+#include "shm.h"
 
 static_assert(sizeof(ncclNetHandle_t) <= CONNECT_SIZE, "NET Connect info is too large");
 
@@ -62,9 +63,8 @@ struct connectMapMem{
   char* cpuPtr;
   int size;
   ncclIpcDesc ipcDesc;
-  char shmPath[PATH_MAX];
-  ncclShmHandle_t attachHandle;
-  ncclShmHandle_t createHandle;
+  ncclShmIpcDesc_t attachDesc;
+  ncclShmIpcDesc_t createDesc;
 };
 
 struct connectMap {
@@ -142,11 +142,11 @@ struct recvNetResources {
 };
 
 /* Determine if two peers can communicate with NET */
-static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+static ncclResult_t canConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
   *ret = 1;
   if (info1->hostHash == info2->hostHash) {
     // If on the same host, check intra-node net is not disabled.
-    NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, ret));
+    NCCLCHECK(ncclTopoCheckNet(comm->topo, info1->rank, info2->rank, ret));
   }
   return ncclSuccess;
 }
@@ -173,9 +173,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
  * information for this peer */
 static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
   struct setupReq req = { 0 };
-  int tpProxyRank;
 
-  send->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
+  send->conn.shared = req.shared = graph || connIndex == 0 ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
   req.channelId = channelId;
   req.connIndex = connIndex;
 
@@ -185,8 +184,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr));
   send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
 
-  tpProxyRank = comm->topParentRanks[proxyRank];
-  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &send->proxyConn));
+  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, proxyRank, &send->proxyConn));
   req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
   req.tpRank = comm->topParentRanks[myInfo->rank];
   req.tpRemoteRank = comm->topParentRanks[peerInfo->rank];
@@ -199,7 +197,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
     INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev,
         proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
   }
-  *((int*)connectInfo) = tpProxyRank;
+  *((int*)connectInfo) = comm->topParentRanks[proxyRank];
   return ncclSuccess;
 }
 
@@ -212,12 +210,12 @@ NCCL_PARAM(GdrCopyFlushEnable, "GDRCOPY_FLUSH_ENABLE", 0);
 static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
   struct setupReq req = { 0 };
 
-  recv->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
+  recv->conn.shared = req.shared = graph || connIndex == 0 ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
   req.channelId = channelId;
   req.connIndex = connIndex;
 
   // Use myInfo->rank as the receiver uses its own NIC
-  int proxyRank, tpProxyRank;
+  int proxyRank;
   int64_t netId;
   NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &netId, &req.netDev, &proxyRank));
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 0, &req.useGdr));
@@ -226,8 +224,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
 
   // We don't support PXN on receive yet
-  tpProxyRank = comm->topParentRanks[myInfo->rank];
-  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, tpProxyRank, &recv->proxyConn));
+  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, myInfo->rank, &recv->proxyConn));
 
   req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
   req.tpRank = comm->topParentRanks[myInfo->rank];
@@ -238,26 +235,24 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   return ncclSuccess;
 }
 
-static ncclResult_t netMapShm(struct connectMapMem* mem) {
-  mem->cpuPtr = NULL;
-  mem->gpuPtr = NULL;
-  NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, -1, &mem->attachHandle));
+static ncclResult_t netMapShm(struct ncclComm *comm, struct connectMapMem* mem) {
+  NCCLCHECK(ncclShmImportShareableBuffer(comm, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, &mem->attachDesc));
   return ncclSuccess;
 }
-static ncclResult_t netCreateShm(struct connectMapMem* mem) {
-  mem->shmPath[0] = '\0'; // Let ncclShmOpen create a tmp file
-  NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, NULL, 1, &mem->createHandle));
+
+static ncclResult_t netCreateShm(struct ncclProxyState* proxyState, struct connectMapMem* mem) {
+  NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, mem->size, false, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr));
   return ncclSuccess;
 }
 
 static ncclResult_t netDumpMap(struct connectMap* map) {
   printf("Dump map same process %d shared %d\n", map->sameProcess, map->shared);
   struct connectMapMem *mem = map->mems+NCCL_NET_MAP_HOSTMEM;
-  printf("Mem 0: Host mem %s (%x B) CPU %p GPU %p\n", mem->shmPath, mem->size, mem->cpuPtr, mem->gpuPtr);
+  printf("Mem 0: Host mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
   mem = map->mems+NCCL_NET_MAP_DEVMEM;
   printf("Mem 1: Vid  mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
   mem = map->mems+NCCL_NET_MAP_SHARED_HOSTMEM;
-  printf("Mem 2: Shared Host mem %s (%x B) CPU %p GPU %p\n", mem->shmPath, mem->size, mem->cpuPtr, mem->gpuPtr);
+  printf("Mem 2: Shared Host mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
   mem = map->mems+NCCL_NET_MAP_SHARED_DEVMEM;
   printf("Mem 3: Shared Vid mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
   printf("SendMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n",
@@ -328,10 +323,10 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
       }
     }
   } else if (!(map->sameProcess && map->cudaDev == comm->cudaDev)) {
-    if (!map->sameProcess) NCCLCHECK(netMapShm(map->mems+NCCL_NET_MAP_HOSTMEM));
+    if (!map->sameProcess) NCCLCHECK(netMapShm(comm, map->mems + NCCL_NET_MAP_HOSTMEM));
     if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
       map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr = NULL;
-      NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank,
+      NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.rank,
                                              map->mems[NCCL_NET_MAP_DEVMEM].size,
                                              &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
                                              (void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
@@ -341,7 +336,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
       void** sharedDevMemPtr = comm->proxyState->sharedDevMems + send->proxyConn.tpLocalRank;
       if (*sharedDevMemPtr == NULL) {
         map->mems[NCCL_NET_MAP_SHARED_DEVMEM].gpuPtr = NULL;
-        NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank,
+        NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.rank,
                                                map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size,
                                                &map->mems[NCCL_NET_MAP_SHARED_DEVMEM].ipcDesc,
                                                sharedDevMemPtr));
@@ -463,24 +458,19 @@ static ncclResult_t sendFree(struct ncclConnector* send) {
   if (map) {
     int cudaDev;
     CUDACHECK(cudaGetDevice(&cudaDev));
-    if (map->sameProcess && map->cudaDev == cudaDev) {
-      // Our own GPU, so it wasn't mapped in
-      free(map);
-      return ncclSuccess;
-    }
-    if (!map->sameProcess || ncclCuMemEnable()) {
-      if (!map->sameProcess) NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].attachHandle));
-      if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
-        if (ncclCuMemEnable()) {
-          // cuMem API support
-          NCCLCHECK(ncclP2pFreeShareableBuffer(&map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc));
-          NCCLCHECK(ncclCuMemFree(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
-        } else {
-          // Legacy CUDA IPC support
-          CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
-        }
+    if (map->cudaDev != cudaDev && map->mems[NCCL_NET_MAP_DEVMEM].size) {
+      if (ncclCuMemEnable()) {
+        // cuMem API support
+        NCCLCHECK(ncclP2pFreeShareableBuffer(&map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc));
+        NCCLCHECK(ncclCuMemFree(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
+      } else {
+        // Legacy CUDA IPC support
+        CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
       }
     }
+    if (!map->sameProcess) {
+      NCCLCHECK(ncclShmIpcClose(&map->mems[NCCL_NET_MAP_HOSTMEM].attachDesc));
+    }
     free(map);
   }
 
@@ -518,7 +508,7 @@ static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int
 
   if (cuda && state->cudaBuff == NULL) {
     if (sameProcess == 0 || ncclCuMemEnable()) {
-      NCCLCHECK(ncclP2pAllocateShareableBuffer(state->size, &state->ipcDesc, (void**)&state->cudaBuff));
+      NCCLCHECK(ncclP2pAllocateShareableBuffer(state->size, 0, &state->ipcDesc, (void**)&state->cudaBuff));
     } else {
       NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size));
     }
@@ -527,7 +517,7 @@ static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int
     NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, state->size));
   }
   if (cpuPtr) *cpuPtr = cuda ? state->cudaBuff : state->hostBuff;
-  if (gpuPtr) *gpuPtr = sameProcess ? *cpuPtr : NULL;
+  if (gpuPtr) *gpuPtr = (cpuPtr && sameProcess) ? *cpuPtr : NULL;
   if (ipcDesc) memcpy(ipcDesc, &state->ipcDesc, sizeof(state->ipcDesc));
   return ncclSuccess;
 }
@@ -543,7 +533,7 @@ static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int chan
 static ncclResult_t sharedNetBuffersDestroy(struct ncclProxyState* proxyState, int tpLocalRank, int type, struct ncclProxyConnection* connection) {
   if (proxyState->progressState.localPeers == NULL) NCCLCHECK(ncclInternalError);
   struct ncclProxyPeer* peer = proxyState->progressState.localPeers[tpLocalRank];
-  if (peer == NULL) NCCLCHECK(ncclInternalError;)
+  if (peer == NULL) NCCLCHECK(ncclInternalError);
   struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv;
   if (state->size == 0) NCCLCHECK(ncclInternalError);
   if (ncclAtomicRefCountDecrement(&state->refcount) == 0) {
@@ -746,7 +736,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
     if (resources->shared == 0) {
       if (!map->sameProcess || ncclCuMemEnable()) {
         ALIGN_SIZE(map->mems[NCCL_NET_MAP_DEVMEM].size, CUDA_IPC_MIN);
-        NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
+        NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, 0, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
                                                  (void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
       } else {
         NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size));
@@ -758,7 +748,11 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
     NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
     map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
   } else {
-    NCCLCHECK(netCreateShm(map->mems+NCCL_NET_MAP_HOSTMEM));
+    NCCLCHECK(netCreateShm(proxyState, map->mems+NCCL_NET_MAP_HOSTMEM));
+    void* sendMem = (void*)NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem);
+    void* recvMem = (void*)NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem);
+    memset(sendMem, 0, sizeof(struct ncclSendMem));
+    memset(recvMem, 0, sizeof(struct ncclRecvMem));
   }
   if (ncclGdrCopy && map->sameProcess && ncclParamGdrCopySyncEnable()) {
     uint64_t *cpuPtr, *gpuPtr;
@@ -896,7 +890,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
   if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
     if (resources->shared == 0) {
       if (ncclCuMemEnable()) {
-        NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
+        NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, 0, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
                                                  (void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
       } else {
         NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size));
@@ -968,7 +962,7 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
     if (resources->map.sameProcess) {
       NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
     } else {
-      NCCLCHECK(ncclShmClose(mems[NCCL_NET_MAP_HOSTMEM].createHandle));
+      NCCLCHECK(ncclShmIpcClose(&mems[NCCL_NET_MAP_HOSTMEM].createDesc));
     }
     NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
     if (!resources->map.sameProcess || ncclCuMemEnable()) {
@@ -1050,7 +1044,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
       // Set step base for next op
       resources->step = sub->base + sub->nsteps;
       sub->posted = sub->transmitted = sub->done = 0;
-      for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin);
+      ncclProfilerStartSendProxyOpEvent(s, args);
       if (sub->reg && sub->nbytes > 0) {
         NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, sub->recvbuff, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
       } else {
@@ -1072,6 +1066,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
       char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
       // Post buffers to the GPU
       if (sub->posted < sub->nsteps && sub->posted < sub->done + maxDepth) {
+        ncclProfilerStartSendProxyStepEvents(s, args, sub->posted, sub->posted+args->sliceSteps);
         int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
         if (resources->shared) {
           if (!sub->reg) {
@@ -1087,9 +1082,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
           if (sub->reg == 0 || sub->posted == args->sliceSteps) *sendHead = sub->base + sub->posted - NCCL_STEPS;
           if (resources->gdcSync) wc_store_fence(); // Flush out WC write
         } else sub->posted += args->sliceSteps;
-        for (uint64_t step=sub->posted-args->sliceSteps; step<sub->posted; step++) {
-          ncclProfilingRecord(args, s, step, ncclProxyProfileSendGPUWait);
-        }
+        ncclProfilerRecordProxyOpEventState(s, args, sub->posted, sub->transSize, ncclProfilerProxyOpSendPosted);
+        ncclProfilerRecordProxyStepEventStates(s, args, sub->posted-args->sliceSteps, sub->posted, ncclProfilerProxyStepSendGPUWait);
         args->idle = 0;
         continue;
       }
@@ -1130,12 +1124,18 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
             buff = sub->reg ? (char*)sub->recvbuff : localBuff+resources->recvMem->connFifo[buffSlot].offset;
           }
           if (ready) {
+            ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted + args->sliceSteps, sub->transSize, ncclProfilerProxyOpSendRemFifoWait);
             // Data is ready, try to send.
+            // Coverity complains about the size here as pointing to an out-of-scope temporary.  Which is nonsense,
+            // since size is a plain integer.
+            // coverity[use_invalid:FALSE]
             NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->mhandle, sub->requests+buffSlot));
             if (sub->requests[buffSlot] != NULL) {
               TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p, size %d, proto %d, myRank %d, channelId %d", sub->transmitted, buffSlot, sub->requests[buffSlot], size, p, proxyState->tpRank, sub->channelId);
               sub->transmitted += args->sliceSteps;
-              for (uint64_t step=sub->transmitted-args->sliceSteps; step<sub->transmitted; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileSendWait);
+              ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpSendTransmitted);
+              ncclProfilerRecordProxyStepEventStates(s, args, sub->transmitted-args->sliceSteps, sub->transmitted, ncclProfilerProxyStepSendWait);
+              sub->transSize += size;
               args->idle = 0;
               continue;
             }
@@ -1165,7 +1165,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
           __sync_synchronize();
           TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
           sub->done += args->sliceSteps;
-          for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileEnd);
+          ncclProfilerStopProxyStepEvents(s, args, sub->done-args->sliceSteps, sub->done);
+          ncclProfilerRecordProxyOpEventState(s, args, sub->done, sub->transSize, ncclProfilerProxyOpSendDone);
 
           if (resources->shared == 0) {
             volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
@@ -1188,6 +1189,9 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
       }
     }
     if (args->done == args->nsubs) {
+      for (int s=0; s<args->nsubs; s++) {
+        ncclProfilerStopProxyOpEvent(s, args);
+      }
       args->state = ncclProxyOpNone;
     }
   }
@@ -1229,7 +1233,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
       resources->step = sub->base + sub->nsteps;
       sub->posted = sub->received = sub->transmitted = sub->done = 0;
       for (int i=0; i<groupSize; i++) sub[-i].groupSize = groupSize;
-      for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin);
+      ncclProfilerStartRecvProxyOpEvent(s, args);
       if (sub->reg && sub->nbytes > 0) {
         // Register buffer
         NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, sub->recvbuff, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
@@ -1254,6 +1258,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
         struct ncclProxySubArgs* sub = subGroup + i;
         if (sub->posted < sub->nsteps) {
           if (sub->posted >= sub->done + maxDepth) { subCount = 0; break; }
+          ncclProfilerStartRecvProxyStepEvents(s+i, args, sub->posted, sub->posted+args->sliceSteps);
           struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
           if (sub->reg) maxDepth = 1;
           int stepSize = resources->buffSizes[p] / NCCL_STEPS;
@@ -1294,7 +1299,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
           for (int i=0; i<subGroup->groupSize; i++) {
             struct ncclProxySubArgs* sub = subGroup+i;
             sub->posted += args->sliceSteps;
-            for (uint64_t step=sub->posted-args->sliceSteps; step<sub->posted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvWait);
+            ncclProfilerRecordProxyOpEventState(s+i, args, sub->posted, sub->transSize, ncclProfilerProxyOpRecvPosted);
+            ncclProfilerRecordProxyStepEventStates(s+i, args, sub->posted-args->sliceSteps, sub->posted, ncclProfilerProxyStepRecvWait);
           }
           args->idle = 0;
         }
@@ -1337,7 +1343,9 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
               }
             }
             sub->received += args->sliceSteps;
-            for (uint64_t step=sub->received-args->sliceSteps; step<sub->received; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvFlushWait);
+            sub->transSize += sizes[i];
+            ncclProfilerRecordProxyOpEventState(s+i, args, sub->received, sub->transSize, ncclProfilerProxyOpRecvReceived);
+            ncclProfilerRecordProxyStepEventStates(s+i, args, sub->received-args->sliceSteps, sub->received, ncclProfilerProxyStepRecvFlushWait);
             if (step < sub->nsteps) {
               struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
               if (resources->useGdr) needFlush |= resources->needFlush;
@@ -1393,7 +1401,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
             struct ncclProxySubArgs* sub = subGroup + i;
 
             sub->transmitted += args->sliceSteps;
-            for (uint64_t step=sub->transmitted-args->sliceSteps; step<sub->transmitted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvGPUWait);
+            ncclProfilerRecordProxyOpEventState(s+i, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpRecvTransmitted);
+            ncclProfilerRecordProxyStepEventStates(s+i, args, sub->transmitted-args->sliceSteps, sub->transmitted, ncclProfilerProxyStepRecvGPUWait);
             if (step < sub->nsteps) {
               __sync_synchronize();
               struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
@@ -1431,7 +1440,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
               subGroup->recvRequestsCache[sub->done%NCCL_STEPS] = NULL;
             }
             sub->done += args->sliceSteps;
-            for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileEnd);
+            ncclProfilerStopProxyStepEvents(s+i, args, sub->done-args->sliceSteps, sub->done);
+            ncclProfilerRecordProxyOpEventState(s+i, args, sub->done, sub->transSize, ncclProfilerProxyOpRecvDone);
             args->idle = 0;
             if (sub->done == sub->nsteps) {
               struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
@@ -1447,6 +1457,9 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
     }
     if (args->done == args->nsubs) {
       args->state = ncclProxyOpNone;
+      for (int s=0; s<args->nsubs; s++) {
+        ncclProfilerStopProxyOpEvent(s, args);
+      }
     }
   }
   return ncclSuccess;
diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc
index be8a8a37b6..d828c9801b 100644
--- a/src/transport/net_ib.cc
+++ b/src/transport/net_ib.cc
@@ -49,6 +49,11 @@ struct alignas(64) ncclIbMergedDev {
   int devs[NCCL_IB_MAX_DEVS_PER_NIC]; // Points to an index in ncclIbDevs
   int speed;
   char devName[MAX_MERGED_DEV_NAME]; // Up to NCCL_IB_MAX_DEVS_PER_NIC * name size, and a character for each '+'
+  int dmaBufSupported;               //  0 = uninit, 1 = yes, -1 = no
+};
+
+struct ncclIbStats {
+  int fatalErrorCount; // number of fatal async events recorded; written and read with __atomic builtins by the ncclIbStats* helpers
 };
 
 static int ncclNIbDevs = -1;
@@ -69,6 +74,7 @@ struct alignas(64) ncclIbDev {
   struct ncclIbMrCache mrCache;
   int ar; // ADAPTIVE_ROUTING
   struct ibv_port_attr portAttr;
+  struct ncclIbStats stats;
 };
 
 #define MAX_IB_DEVS 32
@@ -80,7 +86,7 @@ static int ncclIbRelaxedOrderingEnabled = 0;
 NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", -1);
 NCCL_PARAM(IbRoutableFlidIbGidIndex, "IB_ROUTABLE_FLID_GID_INDEX", 1);
 NCCL_PARAM(IbRoceVersionNum, "IB_ROCE_VERSION_NUM", 2);
-NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 18);
+NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 20);
 NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7);
 NCCL_PARAM(IbPkey, "IB_PKEY", 0);
 NCCL_PARAM(IbUseInline, "IB_USE_INLINE", 0);
@@ -90,6 +96,32 @@ NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192);
 NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2);
 NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2);
 NCCL_PARAM(IbFifoTc, "IB_FIFO_TC", 0);
+NCCL_PARAM(IbAsyncEvents,"IB_RETURN_ASYNC_EVENTS",1);
+NCCL_PARAM(IbEceEnable,"IB_ECE_ENABLE",1);
+
+static ncclResult_t ncclIbStatsInit(struct ncclIbStats* stat) { // reset the fatal-error counter; always returns ncclSuccess
+  __atomic_store_n(&stat->fatalErrorCount, 0, __ATOMIC_RELAXED); // relaxed store: no ordering with other memory is requested
+  return ncclSuccess;
+}
+static void ncclIbStatsFatalError(struct ncclIbStats* stat){ // record one more fatal event on `stat`
+  __atomic_fetch_add(&stat->fatalErrorCount, 1, __ATOMIC_RELAXED); // the counter is read back by ncclIbStatsCheckFatalCount
+}
+static ncclResult_t ncclIbStatsCheckFatalCount(struct ncclIbStats* stat, const char* funcName) {
+  // Returns ncclSystemError once a fatal event has been counted on `stat`; funcName only labels the warning.
+  if (!ncclParamIbAsyncEvents()) return ncclSuccess; // check disabled through the IbAsyncEvents ("IB_RETURN_ASYNC_EVENTS") parameter
+  if (__atomic_load_n(&stat->fatalErrorCount, __ATOMIC_RELAXED) == 0) return ncclSuccess;
+  WARN("communicator encountered a fatal error (detected in %s)", funcName); // no trailing "\n", like the other WARN calls
+  return ncclSystemError;
+}
+static void ncclIbQpFatalError(struct ibv_qp* qp) { // count a fatal event against the stats reached through this QP
+  ncclIbStatsFatalError((struct ncclIbStats*)qp->qp_context); // NOTE(review): assumes qp_context was set to a struct ncclIbStats* when the QP was created (not visible here) - confirm
+}
+static void ncclIbCqFatalError(struct ibv_cq* cq) { // count a fatal event against the stats reached through this CQ
+  ncclIbStatsFatalError((struct ncclIbStats*)cq->cq_context); // NOTE(review): assumes cq_context was set to a struct ncclIbStats* when the CQ was created (not visible here) - confirm
+}
+static void ncclIbDevFatalError(struct ncclIbDev* dev) { // count a fatal event against the device-wide stats
+  ncclIbStatsFatalError(&dev->stats); // `stats` is the struct ncclIbStats member embedded in struct ncclIbDev
+}
 
 pthread_t ncclIbAsyncThread;
 static void* ncclIbAsyncThreadMain(void* args) {
@@ -98,9 +130,53 @@ static void* ncclIbAsyncThreadMain(void* args) {
     struct ibv_async_event event;
     if (ncclSuccess != wrap_ibv_get_async_event(dev->context, &event)) { break; }
     char *str;
+    struct ibv_cq* cq = event.element.cq;    // only valid if CQ error
+    struct ibv_qp* qp = event.element.qp;    // only valid if QP error
+    struct ibv_srq* srq = event.element.srq; // only valid if SRQ error
     if (ncclSuccess != wrap_ibv_event_type_str(&str, event.event_type)) { break; }
-    if (event.event_type != IBV_EVENT_COMM_EST)
-      WARN("NET/IB : %s:%d Got async event : %s", dev->devName, dev->portNum, str);
+    switch (event.event_type) {
+    case IBV_EVENT_DEVICE_FATAL:
+      // the above is device fatal error
+      WARN("NET/IB : %s:%d async fatal event: %s", dev->devName, dev->portNum, str);
+      ncclIbDevFatalError(dev);
+      break;
+    case IBV_EVENT_CQ_ERR:
+      // the above is a CQ fatal error
+      WARN("NET/IB : %s:%d async fatal event on CQ (%p): %s", dev->devName, dev->portNum, cq, str);
+      ncclIbCqFatalError(cq);
+      break;
+    case IBV_EVENT_QP_FATAL:
+    case IBV_EVENT_QP_REQ_ERR:
+    case IBV_EVENT_QP_ACCESS_ERR:
+      // the above are QP fatal errors
+      WARN("NET/IB : %s:%d async fatal event on QP (%p): %s", dev->devName, dev->portNum, qp, str);
+      ncclIbQpFatalError(qp);
+      break;
+    case IBV_EVENT_SRQ_ERR:
+      // SRQ are not used in NCCL
+      WARN("NET/IB : %s:%d async fatal event on SRQ, unused for now (%p): %s", dev->devName, dev->portNum, srq, str);
+      break;
+    case IBV_EVENT_PATH_MIG_ERR:
+    case IBV_EVENT_PORT_ERR:
+    case IBV_EVENT_PATH_MIG:
+    case IBV_EVENT_PORT_ACTIVE:
+    case IBV_EVENT_SQ_DRAINED:
+    case IBV_EVENT_LID_CHANGE:
+    case IBV_EVENT_PKEY_CHANGE:
+    case IBV_EVENT_SM_CHANGE:
+    case IBV_EVENT_QP_LAST_WQE_REACHED:
+    case IBV_EVENT_CLIENT_REREGISTER:
+    case IBV_EVENT_SRQ_LIMIT_REACHED:
+      // the above are non-fatal
+      WARN("NET/IB : %s:%d Got async error event: %s", dev->devName, dev->portNum, str);
+      break;
+    case IBV_EVENT_COMM_EST:
+      break;
+    default:
+      WARN("NET/IB : %s:%d unknown event type (%d)", dev->devName, dev->portNum, event.event_type);
+      break;
+    }
+    // acknowledgment needs to happen last to avoid use-after-free
     if (ncclSuccess != wrap_ibv_ack_async_event(&event)) { break; }
   }
   return NULL;
@@ -140,11 +216,11 @@ static void* envIbAddrRange(sa_family_t af, int* mask) {
   char addrString[128] = { 0 };
   snprintf(addrString, 128, "%s", env);
   char *addrStrPtr = addrString;
-  char *maskStrPtr = strstr(addrString, "/") + 1;
+  char *maskStrPtr = strstr(addrString, "/");
   if (NULL == maskStrPtr) {
     return NULL;
   }
-  *(maskStrPtr - 1) = '\0';
+  *(maskStrPtr++) = '\0';
 
   if (inet_pton(af, addrStrPtr, ret) == 0) {
     WARN("NET/IB: Ip address '%s' is invalid for family %s, ignoring address", addrStrPtr, (af == AF_INET) ? "AF_INET" : "AF_INET6");
@@ -242,12 +318,14 @@ static ncclResult_t ncclIbRoceGetVersionNum(const char* deviceName, int portNum,
 
   int fd = open(roceTypePath, O_RDONLY);
   if (fd == -1) {
+    WARN("NET/IB: open failed in ncclIbRoceGetVersionNum: %s", strerror(errno));
     return ncclSystemError;
   }
   int ret = read(fd, gidRoceVerStr, 15);
   close(fd);
 
   if (ret == -1) {
+    WARN("NET/IB: read failed in ncclIbRoceGetVersionNum: %s", strerror(errno));
     return ncclSystemError;
   }
 
@@ -420,7 +498,7 @@ int ncclIbFindMatchingDev(int dev) {
 }
 
 ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
-  ncclResult_t ret;
+  ncclResult_t ret = ncclSuccess;
   if (ncclParamIbDisable()) return ncclInternalError;
   static int shownIbHcaEnv = 0;
   if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
@@ -496,11 +574,12 @@ build_ib_list:
           ncclIbDevs[ncclNIbDevs].pdRefs = 0;
           ncclIbDevs[ncclNIbDevs].pd = NULL;
           strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
-          NCCLCHECK(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort));
+          NCCLCHECKGOTO(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort), ret, fail);
           ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp;
           ncclIbDevs[ncclNIbDevs].mrCache.capacity = 0;
           ncclIbDevs[ncclNIbDevs].mrCache.population = 0;
           ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL;
+          NCCLCHECK(ncclIbStatsInit(&ncclIbDevs[ncclNIbDevs].stats));
 
           // Enable ADAPTIVE_ROUTING by default on IB networks
           // But allow it to be overloaded by an env parameter
@@ -510,9 +589,9 @@ build_ib_list:
           TRACE(NCCL_NET,"NET/IB: [%d] %s:%s:%d/%s speed=%d context=%p pciPath=%s ar=%d", d, devices[d]->name, devices[d]->dev_name, ncclIbDevs[ncclNIbDevs].portNum,
               portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE", ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar);
 
-          pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs);
+          PTHREADCHECKGOTO(pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs), "pthread_create", ret, fail);
           ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs);
-          pthread_detach(ncclIbAsyncThread); // will not be pthread_join()'d
+          PTHREADCHECKGOTO(pthread_detach(ncclIbAsyncThread), "pthread_detach", ret, fail); // will not be pthread_join()'d
 
           int mergedDev = ncclNMergedIbDevs;
           if (mergeNics) {
@@ -592,10 +671,11 @@ build_ib_list:
     }
     pthread_mutex_unlock(&ncclIbLock);
   }
-  return ncclSuccess;
+exit:
+  return ret;
 fail:
   pthread_mutex_unlock(&ncclIbLock);
-  return ret;
+  goto exit;
 }
 
 ncclResult_t ncclIbDevices(int* ndev) {
@@ -607,46 +687,63 @@ ncclResult_t ncclIbDevices(int* ndev) {
 // Returns :
 // ncclSuccess : GDR works
 // ncclSystemError : no module or module loaded but not supported by GPU
+#define KNL_MODULE_LOADED(a) ((access(a, F_OK) == -1) ? 0 : 1)
+static int ncclIbGdrModuleLoaded = 0; // 1 = true, 0 = false
+static void ibGdrSupportInitOnce() {
+  // Check whether the nv_peer_mem (nv_mem / nv_mem_nc) or nvidia_peermem module is loaded
+  ncclIbGdrModuleLoaded = KNL_MODULE_LOADED("/sys/kernel/mm/memory_peers/nv_mem/version") ||
+                          KNL_MODULE_LOADED("/sys/kernel/mm/memory_peers/nv_mem_nc/version") ||
+                          KNL_MODULE_LOADED("/sys/module/nvidia_peermem/version");
+}
 ncclResult_t ncclIbGdrSupport() {
-  static int moduleLoaded = -1;
-  if (moduleLoaded == -1) {
-    // Check for the nv_peer_mem module being loaded
-    moduleLoaded = ((access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) &&
-                    // Also support the new nv_mem_nc module
-                    (access("/sys/kernel/mm/memory_peers/nv_mem_nc/version", F_OK) == -1)) ? 0 : 1;
-  }
-  if (moduleLoaded == 0) return ncclSystemError;
+  static pthread_once_t once = PTHREAD_ONCE_INIT;
+  pthread_once(&once, ibGdrSupportInitOnce);
+  if (!ncclIbGdrModuleLoaded)
+    return ncclSystemError;
   return ncclSuccess;
 }
 
+static __thread int ibDmaSupportInitDev; // which device to init, must be thread local
+static void ibDmaBufSupportInitOnce(){
+  ncclResult_t res;
+  // select the merged device to probe; its index is passed in through the thread-local ibDmaSupportInitDev
+  struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs + ibDmaSupportInitDev;
+  // Test each real device of the merged device
+  int dev_fail = 0;
+  for (int i = 0; i < mergedDev->ndevs; i++) {
+    int ibDev = mergedDev->devs[i];
+    struct ibv_pd* pd;
+    struct ibv_context* ctx = ncclIbDevs[ibDev].context;
+    NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure);
+    // Test kernel DMA-BUF support with a dummy call (fd=-1)
+    (void)wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL /*offset*/, 0ULL /*len*/, 0ULL /*iova*/, -1 /*fd*/, 0 /*flags*/);
+    // ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise)
+    dev_fail |= (errno == EOPNOTSUPP) || (errno == EPROTONOSUPPORT);
+    NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure);
+    // stop the search and goto failure
+    if (dev_fail) goto failure;
+  }
+  mergedDev->dmaBufSupported = 1;
+  return;
+failure:
+  mergedDev->dmaBufSupported = -1;
+  return;
+}
 // Detect whether DMA-BUF support is present in the kernel
 // Returns :
 // ncclSuccess : DMA-BUF support is available
 // ncclSystemError : DMA-BUF is not supported by the kernel
 ncclResult_t ncclIbDmaBufSupport(int dev) {
-  static int dmaBufSupported = -1;
-  if (dmaBufSupported == -1) {
-    ncclResult_t res;
-    struct ibv_pd* pd;
-    struct ibv_context* ctx;
-    struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs + dev;
+  struct oncewrap {
+    pthread_once_t once = PTHREAD_ONCE_INIT;
+  };
+  static oncewrap onces[MAX_IB_DEVS];
+  // init the device only once
+  ibDmaSupportInitDev = dev;
+  pthread_once(&onces[dev].once, ibDmaBufSupportInitOnce);
 
-    // Test each dev
-    for (int i = 0; i < mergedDev->ndevs; i++) {
-      int ibDev = mergedDev->devs[i];
-      ctx = ncclIbDevs[ibDev].context;
-      NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure);
-      // Test kernel DMA-BUF support with a dummy call (fd=-1)
-      (void) wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL/*offset*/, 0ULL/*len*/, 0ULL/*iova*/, -1/*fd*/, 0/*flags*/);
-      // ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise)
-      dmaBufSupported = (errno != EOPNOTSUPP && errno != EPROTONOSUPPORT) ? 1 : 0;
-      NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure);
-    }
-  }
-  if (dmaBufSupported == 0) return ncclSystemError;
-  return ncclSuccess;
-failure:
-  dmaBufSupported = 0;
+  int dmaBufSupported = ncclIbMergedDevs[dev].dmaBufSupported;
+  if (dmaBufSupported == 1) return ncclSuccess;
   return ncclSystemError;
 }
 
@@ -842,16 +939,19 @@ struct alignas(32) ncclIbNetCommBase {
   // Track necessary remDevInfo here
   int nRemDevs;
   struct ncclIbDevInfo remDevs[NCCL_IB_MAX_DEVS_PER_NIC];
+  // statistics about the comm
+  struct ncclIbStats stats;
 };
 
 struct ncclIbSendComm {
   struct ncclIbNetCommBase base;
+  // Start with fifo and ibv structs as they have alignment restrictions
   struct ncclIbSendFifo fifo[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
+  struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS];
+  struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS + 1];
   // Each dev correlates to a mergedIbDev
   struct ncclIbSendCommDev devs[NCCL_IB_MAX_DEVS_PER_NIC];
   struct ncclIbRequest* fifoReqs[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
-  struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS];
-  struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS+1];
   struct ncclIbRemSizesFifo remSizesFifo;
   uint64_t fifoHead;
   int ar; // Use adaptive routing when all merged devices have it enabled
@@ -903,8 +1003,7 @@ static void ncclIbAddEvent(struct ncclIbRequest* req, int devIndex, struct ncclI
   req->events[devIndex]++;
   req->devBases[devIndex] = base;
 }
-
-ncclResult_t ncclIbInitCommDevBase(int ibDevN, struct ncclIbNetCommDevBase* base) {
+ncclResult_t ncclIbInitCommDevBase(int ibDevN, struct ncclIbNetCommDevBase* base, void* cq_context) {
   base->ibDevN = ibDevN;
   ncclIbDev* ibDev = ncclIbDevs + ibDevN;
   pthread_mutex_lock(&ibDev->lock);
@@ -921,7 +1020,7 @@ ncclResult_t ncclIbInitCommDevBase(int ibDevN, struct ncclIbNetCommDevBase* base
   pthread_mutex_unlock(&ibDev->lock);
 
   // Recv requests can generate 2 completions (one for the post FIFO, one for the Recv).
-  NCCLCHECK(wrap_ibv_create_cq(&base->cq, ibDev->context, 2*MAX_REQUESTS*ncclParamIbQpsPerConn(), NULL, NULL, 0));
+  NCCLCHECK(wrap_ibv_create_cq(&base->cq, ibDev->context, 2*MAX_REQUESTS*ncclParamIbQpsPerConn(), cq_context, NULL, 0));
 
   return ncclSuccess;
 }
@@ -940,9 +1039,10 @@ returning:
   return res;
 }
 
-ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, int access_flags, struct ncclIbQp* qp) {
+ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, int access_flags, void* qp_context, struct ncclIbQp* qp) {
   struct ibv_qp_init_attr qpInitAttr;
   memset(&qpInitAttr, 0, sizeof(struct ibv_qp_init_attr));
+  qpInitAttr.qp_context = qp_context;
   qpInitAttr.send_cq = base->cq;
   qpInitAttr.recv_cq = base->cq;
   qpInitAttr.qp_type = IBV_QPT_RC;
@@ -1026,6 +1126,7 @@ ncclResult_t ncclIbRtsQp(struct ibv_qp* qp) {
 }
 
 ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) {
+  ncclResult_t ret = ncclSuccess;
   struct ncclIbListenComm* comm;
   NCCLCHECK(ncclCalloc(&comm, 1));
   struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
@@ -1033,14 +1134,20 @@ ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) {
   memset(handle, 0, sizeof(struct ncclIbHandle));
   comm->dev = dev;
   handle->magic = NCCL_SOCKET_MAGIC;
-  NCCLCHECK(ncclSocketInit(&comm->sock, &ncclIbIfAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1));
-  NCCLCHECK(ncclSocketListen(&comm->sock));
-  NCCLCHECK(ncclSocketGetAddr(&comm->sock, &handle->connectAddr));
+  NCCLCHECKGOTO(ncclSocketInit(&comm->sock, &ncclIbIfAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1), ret, fail);
+  NCCLCHECKGOTO(ncclSocketListen(&comm->sock), ret, fail);
+  NCCLCHECKGOTO(ncclSocketGetAddr(&comm->sock, &handle->connectAddr), ret, fail);
   *listenComm = comm;
-  return ncclSuccess;
+exit:
+  return ret;
+fail:
+  (void)ncclSocketClose(&comm->sock);
+  free(comm);
+  goto exit;
 }
 
 ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
+  ncclResult_t ret = ncclSuccess;
   struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
   struct ncclIbCommStage* stage = &handle->stage;
   struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm;
@@ -1055,16 +1162,18 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet
     WARN("Error: trying to connect already connected sendComm");
     return ncclInternalError;
   }
+  stage->buffer = NULL;
 
   NCCLCHECK(ncclIbMalloc((void**)&comm, sizeof(struct ncclIbSendComm)));
-  NCCLCHECK(ncclSocketInit(&comm->base.sock, &handle->connectAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1));
+  NCCLCHECKGOTO(ncclIbStatsInit(&comm->base.stats), ret, fail);
+  NCCLCHECKGOTO(ncclSocketInit(&comm->base.sock, &handle->connectAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1), ret, fail);
   stage->comm = comm;
   stage->state = ncclIbCommStateConnect;
-  NCCLCHECK(ncclSocketConnect(&comm->base.sock));
+  NCCLCHECKGOTO(ncclSocketConnect(&comm->base.sock), ret, fail);
 
 ib_connect_check:
   /* since ncclSocketConnect is async, we must check if connection is complete */
-  NCCLCHECK(ncclSocketReady(&comm->base.sock, &ready));
+  NCCLCHECKGOTO(ncclSocketReady(&comm->base.sock, &ready), ret, fail);
   if (!ready) return ncclSuccess;
 
   // IB Setup
@@ -1078,7 +1187,7 @@ ib_connect_check:
   comm->ar = 1; // Set to 1 for logic
   for (int i = 0; i < mergedDev->ndevs; i++) {
     int ibDevN = mergedDev->devs[i];
-    NCCLCHECK(ncclIbInitCommDevBase(ibDevN, &comm->devs[i].base));
+    NCCLCHECKGOTO(ncclIbInitCommDevBase(ibDevN, &comm->devs[i].base, &comm->base.stats), ret, fail);
     comm->ar = comm->ar && ncclIbDevs[dev].ar; // ADAPTIVE_ROUTING - if all merged devs have it enabled
   }
 
@@ -1091,13 +1200,17 @@ ib_connect_check:
   for (int q = 0; q < comm->base.nqps; q++) {
     ncclIbSendCommDev* commDev = comm->devs + devIndex;
     ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN;
-    NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &commDev->base, IBV_ACCESS_REMOTE_WRITE, comm->base.qps+q));
+    NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &commDev->base, IBV_ACCESS_REMOTE_WRITE, &comm->base.stats, comm->base.qps + q), ret, fail);
     comm->base.qps[q].devIndex = devIndex;
     meta.qpInfo[q].qpn      = comm->base.qps[q].qp->qp_num;
     meta.qpInfo[q].devIndex = comm->base.qps[q].devIndex;
 
-    // Query ece capabilities (enhanced connection establishment)
-    NCCLCHECK(wrap_ibv_query_ece(comm->base.qps[q].qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported));
+    if (ncclParamIbEceEnable()) {
+      // Query ece capabilities (enhanced connection establishment)
+      NCCLCHECKGOTO(wrap_ibv_query_ece(comm->base.qps[q].qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported), ret, fail);
+    } else {
+      meta.qpInfo[q].ece_supported = 0;
+    }
     devIndex = (devIndex + 1) % comm->base.ndevs;
   }
 
@@ -1112,13 +1225,13 @@ ib_connect_check:
     devInfo->lid           = ibDev->portAttr.lid;
 
     // Prepare my fifo
-    NCCLCHECK(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
+    NCCLCHECKGOTO(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
     devInfo->fifoRkey = commDev->fifoMr->rkey;
 
     // Pack local GID info
     devInfo->link_layer = commDev->base.gidInfo.link_layer = ibDev->portAttr.link_layer;
-    NCCLCHECK(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &commDev->base.gidInfo.localGidIndex));
-    NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, commDev->base.gidInfo.localGidIndex, &commDev->base.gidInfo.localGid));
+    NCCLCHECKGOTO(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &commDev->base.gidInfo.localGidIndex), ret, fail);
+    NCCLCHECKGOTO(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, commDev->base.gidInfo.localGidIndex, &commDev->base.gidInfo.localGid), ret, fail);
     devInfo->gid.global.subnet_prefix = commDev->base.gidInfo.localGid.global.subnet_prefix;
     devInfo->gid.global.interface_id = commDev->base.gidInfo.localGid.global.interface_id;
 
@@ -1148,12 +1261,12 @@ ib_connect_check:
 
   stage->state = ncclIbCommStateSend;
   stage->offset = 0;
-  NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(meta)));
+  NCCLCHECKGOTO(ncclIbMalloc((void**)&stage->buffer, sizeof(meta)), ret, fail);
 
   memcpy(stage->buffer, &meta, sizeof(meta));
 
 ib_send:
-  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, stage->buffer, sizeof(meta), &stage->offset));
+  NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, stage->buffer, sizeof(meta), &stage->offset), ret, fail);
   if (stage->offset != sizeof(meta)) return ncclSuccess;
 
   stage->state = ncclIbCommStateConnecting;
@@ -1163,7 +1276,7 @@ ib_send:
 
 ib_connect:
   struct ncclIbConnectionMetadata remMeta;
-  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->base.sock, stage->buffer, sizeof(ncclIbConnectionMetadata), &stage->offset));
+  NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->base.sock, stage->buffer, sizeof(ncclIbConnectionMetadata), &stage->offset), ret, fail);
   if (stage->offset != sizeof(remMeta)) return ncclSuccess;
 
   memcpy(&remMeta, stage->buffer, sizeof(ncclIbConnectionMetadata));
@@ -1197,7 +1310,7 @@ ib_connect:
   }
 
   for (int i=0; i < comm->base.ndevs; i++) {
-    NCCLCHECK(wrap_ibv_reg_mr(comm->remSizesFifo.mrs+i, comm->devs[i].base.pd, &comm->remSizesFifo.elems, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ));
+    NCCLCHECKGOTO(wrap_ibv_reg_mr(comm->remSizesFifo.mrs+i, comm->devs[i].base.pd, &comm->remSizesFifo.elems, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
   }
   comm->base.nRemDevs = remMeta.ndevs;
 
@@ -1212,10 +1325,10 @@ ib_connect:
 
     struct ibv_qp* qp = comm->base.qps[q].qp;
     if (remQpInfo->ece_supported)
-      NCCLCHECK(wrap_ibv_set_ece(qp, &remQpInfo->ece, &remQpInfo->ece_supported));
+      NCCLCHECKGOTO(wrap_ibv_set_ece(qp, &remQpInfo->ece, &remQpInfo->ece_supported), ret, fail);
 
-    NCCLCHECK(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false));
-    NCCLCHECK(ncclIbRtsQp(qp));
+    NCCLCHECKGOTO(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false), ret, fail);
+    NCCLCHECKGOTO(ncclIbRtsQp(qp), ret, fail);
   }
 
   if (link_layer == IBV_LINK_LAYER_ETHERNET ) { // RoCE
@@ -1233,19 +1346,23 @@ ib_connect:
   stage->offset = 0;
 
 ib_send_ready:
-  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, &comm->base.ready, sizeof(int), &stage->offset));
+  NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, &comm->base.ready, sizeof(int), &stage->offset), ret, fail);
   if (stage->offset != sizeof(int)) return ncclSuccess;
 
-  free(stage->buffer);
-  stage->state = ncclIbCommStateStart;
-
   *sendComm = comm;
-  return ncclSuccess;
+exit:
+  if (stage->buffer) free(stage->buffer);
+  stage->state = ncclIbCommStateStart;
+  return ret;
+fail:
+  free(comm);
+  goto exit;
 }
 
 NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0);
 
 ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
+  ncclResult_t ret = ncclSuccess;
   struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm;
   struct ncclIbCommStage* stage = &lComm->stage;
   struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm;
@@ -1262,22 +1379,23 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle
   }
 
   NCCLCHECK(ncclIbMalloc((void**)&rComm, sizeof(struct ncclIbRecvComm)));
+  NCCLCHECKGOTO(ncclIbStatsInit(&rComm->base.stats), ret, fail);
   stage->comm = rComm;
   stage->state = ncclIbCommStateAccept;
-  NCCLCHECK(ncclSocketInit(&rComm->base.sock));
-  NCCLCHECK(ncclSocketAccept(&rComm->base.sock, &lComm->sock));
+  NCCLCHECKGOTO(ncclSocketInit(&rComm->base.sock), ret, fail);
+  NCCLCHECKGOTO(ncclSocketAccept(&rComm->base.sock, &lComm->sock), ret, fail);
 
 ib_accept_check:
-  NCCLCHECK(ncclSocketReady(&rComm->base.sock, &ready));
+  NCCLCHECKGOTO(ncclSocketReady(&rComm->base.sock, &ready), ret, fail);
   if (!ready) return ncclSuccess;
 
   struct ncclIbConnectionMetadata remMeta;
   stage->state = ncclIbCommStateRecv;
   stage->offset = 0;
-  NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(remMeta)));
+  NCCLCHECKGOTO(ncclIbMalloc((void**)&stage->buffer, sizeof(remMeta)), ret, fail);
 
 ib_recv:
-  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, stage->buffer, sizeof(remMeta), &stage->offset));
+  NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, stage->buffer, sizeof(remMeta), &stage->offset), ret, fail);
   if (stage->offset != sizeof(remMeta)) return ncclSuccess;
 
   /* copy back the received info */
@@ -1308,10 +1426,10 @@ ib_recv:
   for (int i = 0; i < rComm->base.ndevs; i++) {
     rCommDev = rComm->devs + i;
     ibDevN = mergedDev->devs[i];
-    NCCLCHECK(ncclIbInitCommDevBase(ibDevN, &rCommDev->base));
+    NCCLCHECKGOTO(ncclIbInitCommDevBase(ibDevN, &rCommDev->base, &rComm->base.stats), ret, fail);
     ibDev = ncclIbDevs + ibDevN;
-    NCCLCHECK(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &rCommDev->base.gidInfo.localGidIndex));
-    NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, rCommDev->base.gidInfo.localGidIndex, &rCommDev->base.gidInfo.localGid));
+    NCCLCHECKGOTO(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &rCommDev->base.gidInfo.localGidIndex), ret, fail);
+    NCCLCHECKGOTO(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, rCommDev->base.gidInfo.localGidIndex, &rCommDev->base.gidInfo.localGid), ret, fail);
   }
 
   // Copy remDevInfo for things like remGidInfo, remFifoAddr, etc.
@@ -1336,23 +1454,26 @@ ib_recv:
     // Local ibDevN
     ibDevN = rComm->devs[devIndex].base.ibDevN;
     ibDev = ncclIbDevs + ibDevN;
-    NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, qp));
+    NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, qp), ret, fail);
     qp->devIndex = devIndex;
     devIndex = (devIndex + 1) % rComm->base.ndevs;
 
     // Set the ece (enhanced connection establishment) on this QP before RTR
     if (remMeta.qpInfo[q].ece_supported) {
-      NCCLCHECK(wrap_ibv_set_ece(qp->qp, &remMeta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported));
+      // Coverity suspects a copy-paste error below due to the use of remMeta in one argument and meta in another.
+      // However, this has been confirmed to be intentional.
+      // coverity[copy_paste_error]
+      NCCLCHECKGOTO(wrap_ibv_set_ece(qp->qp, &remMeta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported), ret, fail);
 
       // Query the reduced ece for this QP (matching enhancements between the requestor and the responder)
       // Store this in our own qpInfo for returning to the requestor
       if (meta.qpInfo[q].ece_supported)
-        NCCLCHECK(wrap_ibv_query_ece(qp->qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported));
+        NCCLCHECKGOTO(wrap_ibv_query_ece(qp->qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported), ret, fail);
     }
 
     bool override_tc = (q == 0) ? true : false;
-    NCCLCHECK(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, override_tc));
-    NCCLCHECK(ncclIbRtsQp(qp->qp));
+    NCCLCHECKGOTO(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, override_tc), ret, fail);
+    NCCLCHECKGOTO(ncclIbRtsQp(qp->qp), ret, fail);
   }
 
   rComm->flushEnabled = ((ncclIbGdrSupport() == ncclSuccess || ncclIbDmaBufSupport(lComm->dev) == ncclSuccess)
@@ -1366,17 +1487,17 @@ ib_recv:
     // Retain remote fifo info and prepare my RDMA ops
     rCommDev->fifoRkey = remMeta.devs[i].fifoRkey;
     rComm->remFifo.addr = remMeta.fifoAddr;
-    NCCLCHECK(wrap_ibv_reg_mr(&rCommDev->fifoMr, rCommDev->base.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ));
+    NCCLCHECKGOTO(wrap_ibv_reg_mr(&rCommDev->fifoMr, rCommDev->base.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
     rCommDev->fifoSge.lkey = rCommDev->fifoMr->lkey;
     if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE;
 
     // Allocate Flush dummy buffer for GPU Direct RDMA
     if (rComm->flushEnabled) {
-      NCCLCHECK(wrap_ibv_reg_mr(&rCommDev->gpuFlush.hostMr, rCommDev->base.pd, &rComm->gpuFlushHostMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE));
+      NCCLCHECKGOTO(wrap_ibv_reg_mr(&rCommDev->gpuFlush.hostMr, rCommDev->base.pd, &rComm->gpuFlushHostMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE), ret, fail);
       rCommDev->gpuFlush.sge.addr = (uint64_t)&rComm->gpuFlushHostMem;
       rCommDev->gpuFlush.sge.length = 1;
       rCommDev->gpuFlush.sge.lkey = rCommDev->gpuFlush.hostMr->lkey;
-      NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ, &rCommDev->gpuFlush.qp));
+      NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ, &rComm->base.stats, &rCommDev->gpuFlush.qp), ret, fail);
       struct ncclIbDevInfo devInfo;
       devInfo.lid         = ibDev->portAttr.lid;
       devInfo.link_layer  = ibDev->portAttr.link_layer;
@@ -1384,8 +1505,8 @@ ib_recv:
       devInfo.gid.global.subnet_prefix        = rCommDev->base.gidInfo.localGid.global.subnet_prefix;
       devInfo.gid.global.interface_id         = rCommDev->base.gidInfo.localGid.global.interface_id;
       devInfo.mtu         = ibDev->portAttr.active_mtu;
-      NCCLCHECK(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, &rCommDev->base.gidInfo, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo, false));
-      NCCLCHECK(ncclIbRtsQp(rCommDev->gpuFlush.qp.qp));
+      NCCLCHECKGOTO(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, &rCommDev->base.gidInfo, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo, false), ret, fail);
+      NCCLCHECKGOTO(ncclIbRtsQp(rCommDev->gpuFlush.qp.qp), ret, fail);
     }
 
     // Fill Handle
@@ -1400,7 +1521,7 @@ ib_recv:
     meta.devs[i].mtu      = remMeta.devs[i].mtu;
 
     // Prepare sizes fifo
-    NCCLCHECK(wrap_ibv_reg_mr(&rComm->devs[i].sizesFifoMr, rComm->devs[i].base.pd, rComm->sizesFifo, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
+    NCCLCHECKGOTO(wrap_ibv_reg_mr(&rComm->devs[i].sizesFifoMr, rComm->devs[i].base.pd, rComm->sizesFifo, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
     meta.devs[i].fifoRkey = rComm->devs[i].sizesFifoMr->rkey;
   }
   meta.fifoAddr = (uint64_t)rComm->sizesFifo;
@@ -1415,30 +1536,36 @@ ib_recv:
 
   stage->state = ncclIbCommStateSend;
   stage->offset = 0;
-  if (stage->buffer) free(stage->buffer);
-  NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbConnectionMetadata)));
+  if (stage->buffer) {
+    free(stage->buffer);
+    stage->buffer = NULL;
+  }
+  NCCLCHECKGOTO(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbConnectionMetadata)), ret, fail);
   memcpy(stage->buffer, &meta, sizeof(struct ncclIbConnectionMetadata));
 
 ib_send:
-  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->base.sock, stage->buffer, sizeof(struct ncclIbConnectionMetadata), &stage->offset));
+  NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->base.sock, stage->buffer, sizeof(struct ncclIbConnectionMetadata), &stage->offset), ret, fail);
   if (stage->offset < sizeof(struct ncclIbConnectionMetadata)) return ncclSuccess;
 
   stage->offset = 0;
   stage->state = ncclIbCommStatePendingReady;
 
 ib_recv_ready:
-  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV,  &rComm->base.sock, &rComm->base.ready, sizeof(int), &stage->offset));
+  NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_RECV,  &rComm->base.sock, &rComm->base.ready, sizeof(int), &stage->offset), ret, fail);
   if (stage->offset != sizeof(int)) return ncclSuccess;
 
-  free(stage->buffer);
   *recvComm = rComm;
-
+exit:
   /* reset lComm stage */
+  if (stage->buffer) free(stage->buffer);
   stage->state = ncclIbCommStateStart;
   stage->offset = 0;
   stage->comm = NULL;
   stage->buffer = NULL;
-  return ncclSuccess;
+  return ret;
+fail:
+  free(rComm);
+  goto exit;
 }
 
 ncclResult_t ncclIbGetRequest(struct ncclIbNetCommBase* base, struct ncclIbRequest** req) {
@@ -1531,16 +1658,21 @@ struct ncclIbNetCommDevBase* ncclIbGetNetCommDevBase(ncclIbNetCommBase* base, in
 
 /* DMA-BUF support */
 ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) {
+  ncclResult_t ret = ncclSuccess;
   assert(size > 0);
   struct ncclIbNetCommBase* base = (struct ncclIbNetCommBase*) comm;
   struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) malloc(sizeof(struct ncclIbMrHandle));
   for (int i = 0; i < base->ndevs; i++) {
     // Each ncclIbNetCommDevBase is at different offset in send and recv netComms
     struct ncclIbNetCommDevBase* devComm = ncclIbGetNetCommDevBase(base, i);
-    NCCLCHECK(ncclIbRegMrDmaBufInternal(devComm, data, size, type, offset, fd, mhandleWrapper->mrs + i));
+    NCCLCHECKGOTO(ncclIbRegMrDmaBufInternal(devComm, data, size, type, offset, fd, mhandleWrapper->mrs + i), ret, fail);
   }
   *mhandle = (void*) mhandleWrapper;
-  return ncclSuccess;
+exit:
+  return ret;
+fail:
+  free(mhandleWrapper);
+  goto exit;
 }
 
 ncclResult_t ncclIbRegMr(void* comm, void* data, size_t size, int type, void** mhandle) {
@@ -1694,6 +1826,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh
   struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
   if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->base.ready == 0"); return ncclInternalError; }
   if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; }
+  NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__));
 
   struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandle;
 
@@ -1858,6 +1991,7 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* ta
   if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0"); return ncclInternalError; }
   if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; }
   if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
+  NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__));
 
   struct ncclIbRequest* req;
   NCCLCHECK(ncclIbGetRequest(&comm->base, &req));
@@ -1937,10 +2071,13 @@ ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void**
   return ncclSuccess;
 }
 
+#define HCA_NAME(req, index) ((req)->devBases[(index)]->pd->context->device->name)
+
 ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
   struct ncclIbRequest *r = (struct ncclIbRequest*)request;
   *done = 0;
   while (1) {
+    NCCLCHECK(ncclIbStatsCheckFatalCount(&r->base->stats,__func__));
     if (r->events[0] == 0 && r->events[1] == 0) {
       TRACE(NCCL_NET, "r=%p done", r);
       *done = 1;
@@ -1996,7 +2133,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
           TRACE(NCCL_NET, "Got completion from peer %s with status=%d opcode=%d len=%d wr_id=%ld r=%p type=%d events={%d,%d}, i=%d",
               ncclSocketToString(&addr, line), wc->status, wc->opcode,wc->byte_len, wc->wr_id, req, req->type, req->events[0], req->events[1], i);
           #endif
-          if (req->type == NCCL_NET_IB_REQ_SEND) {
+          if (req && req->type == NCCL_NET_IB_REQ_SEND) {
             for (int j = 0; j < req->nreqs; j++) {
               struct ncclIbRequest* sendReq = r->base->reqs+((wc->wr_id >> (j*8)) & 0xff);
               if ((sendReq->events[i] <= 0)) {
@@ -2018,6 +2155,9 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
             req->events[i]--;
           }
         }
+        // Once the IB fatal event is reported in the async thread, we want to propagate this error
+        // to communicator and prevent further polling to reduce error pollution.
+        NCCLCHECK(ncclIbStatsCheckFatalCount(&ncclIbDevs[r->devBases[i]->ibDevN].stats,__func__));
       }
     }
 
diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc
index e9e0357141..73a5d55b00 100644
--- a/src/transport/net_socket.cc
+++ b/src/transport/net_socket.cc
@@ -73,22 +73,27 @@ ncclResult_t ncclNetSocketDevices(int* ndev) {
 }
 
 static ncclResult_t ncclNetSocketGetSpeed(char* devName, int* speed) {
+  ncclResult_t ret = ncclSuccess;
   *speed = 0;
   char speedPath[PATH_MAX];
   sprintf(speedPath, "/sys/class/net/%s/speed", devName);
-  int fd = open(speedPath, O_RDONLY);
+  int fd = -1;
+  SYSCHECKSYNC(open(speedPath, O_RDONLY), "open", fd);
   if (fd != -1) {
     char speedStr[] = "        ";
-    if (read(fd, speedStr, sizeof(speedStr)-1) > 0) {
+    int n;
+    // Allow this to silently fail
+    n = read(fd, speedStr, sizeof(speedStr)-1);
+    if (n > 0) {
       *speed = strtol(speedStr, NULL, 0);
     }
-    close(fd);
   }
   if (*speed <= 0) {
     INFO(NCCL_NET, "Could not get speed from %s. Defaulting to 10 Gbps.", speedPath);
     *speed = 10000;
   }
-  return ncclSuccess;
+  if (fd != -1) SYSCHECK(close(fd), "close");
+  return ret;
 }
 
 ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) {
@@ -235,19 +240,24 @@ void* persistentSocketThread(void *args_) {
 }
 
 ncclResult_t ncclNetSocketGetNsockNthread(int dev, int* ns, int* nt) {
+  ncclResult_t ret = ncclSuccess;
   int nSocksPerThread = ncclParamSocketNsocksPerThread();
   int nThreads = ncclParamSocketNthreads();
   if (nThreads > MAX_THREADS) {
     WARN("NET/Socket : NCCL_SOCKET_NTHREADS is greater than the maximum allowed, setting to %d", MAX_THREADS);
     nThreads = MAX_THREADS;
   }
+  int fd = -1;
+  int nSocks;
   if (nThreads == -2 || nSocksPerThread == -2) {
     // Auto-detection
     int autoNt=0, autoNs=1; // By default, we only use the main thread and do not spawn extra threads
     char vendorPath[PATH_MAX];
     snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclNetSocketDevs[dev].devName);
+    // Coverity is wrong.  NULL second argument to realpath() is OK by POSIX.1-2008.
+    // coverity[alias_transfer:FALSE]
     char* rPath = realpath(vendorPath, NULL);
-    int fd = open(rPath, O_RDONLY);
+    fd = open(rPath, O_RDONLY);
     free(rPath);
     if (fd == -1) {
       // Could not find device vendor. This is handled silently so
@@ -257,9 +267,7 @@ ncclResult_t ncclNetSocketGetNsockNthread(int dev, int* ns, int* nt) {
     }
     char vendor[7];
     strncpy(vendor, "0x0000", 7);
-    int len;
-    SYSCHECKVAL(read(fd, vendor, 6), "read", len);
-    SYSCHECK(close(fd), "close");
+    SYSCHECKGOTO(read(fd, vendor, 6), "read", ret, fail);
     if (strcmp(vendor, "0x1d0f") == 0) { // AWS
       autoNt = 2;
       autoNs = 8;
@@ -271,7 +279,7 @@ end:
     if (nThreads == -2) nThreads = autoNt;
     if (nSocksPerThread == -2) nSocksPerThread = autoNs;
   }
-  int nSocks = nSocksPerThread * nThreads;
+  nSocks = nSocksPerThread * nThreads;
   if (nSocks > MAX_SOCKETS) {
     nSocksPerThread = MAX_SOCKETS/nThreads;
     WARN("NET/Socket : the total number of sockets is greater than the maximum allowed, setting NCCL_NSOCKS_PERTHREAD to %d", nSocksPerThread);
@@ -280,28 +288,38 @@ end:
   *ns = nSocks;
   *nt = nThreads;
   if (nSocks > 0) INFO(NCCL_INIT, "NET/Socket: Using %d threads and %d sockets per thread", nThreads, nSocksPerThread);
-  return ncclSuccess;
+exit:
+  if (fd != -1) close(fd);
+  return ret;
+fail:
+  goto exit;
 }
 
 ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm) {
   if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev
     return ncclInternalError;
   }
+  ncclResult_t ret = ncclSuccess;
   struct ncclNetSocketHandle* handle = (struct ncclNetSocketHandle*) opaqueHandle;
   memset(handle, 0, sizeof(struct ncclNetSocketHandle));
   static_assert(sizeof(struct ncclNetSocketHandle) <= NCCL_NET_HANDLE_MAXSIZE, "ncclNetSocketHandle size too large");
   struct ncclNetSocketListenComm* comm;
   NCCLCHECK(ncclCalloc(&comm, 1));
   handle->magic = NCCL_SOCKET_MAGIC;
-  NCCLCHECK(ncclSocketInit(&comm->sock, &ncclNetSocketDevs[dev].addr, handle->magic, ncclSocketTypeNetSocket, NULL, 1));
-  NCCLCHECK(ncclSocketListen(&comm->sock));
-  NCCLCHECK(ncclSocketGetAddr(&comm->sock, &handle->connectAddr));
-  NCCLCHECK(ncclNetSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads));
+  NCCLCHECKGOTO(ncclSocketInit(&comm->sock, &ncclNetSocketDevs[dev].addr, handle->magic, ncclSocketTypeNetSocket, NULL, 1), ret, fail);
+  NCCLCHECKGOTO(ncclSocketListen(&comm->sock), ret, fail);
+  NCCLCHECKGOTO(ncclSocketGetAddr(&comm->sock, &handle->connectAddr), ret, fail);
+  NCCLCHECKGOTO(ncclNetSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads), ret, fail);
   handle->nSocks = comm->nSocks;
   handle->nThreads = comm->nThreads;
   comm->dev = dev;
   *listenComm = comm;
-  return ncclSuccess;
+exit:
+  return ret;
+fail:
+  (void)ncclSocketClose(&comm->sock);
+  free(comm);
+  goto exit;
 }
 
 ncclResult_t ncclNetSocketConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
@@ -437,7 +455,7 @@ ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, int op, void*
     res->comm = comm;
     pthread_mutex_init(&res->threadLock, NULL);
     pthread_cond_init(&res->threadCond, NULL);
-    pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res);
+    PTHREADCHECK(pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res), "pthread_create");
     ncclSetThreadName(comm->helperThread[tid], "NCCL Sock%c%1u%2u%2u", op == NCCL_SOCKET_SEND ? 'S' : 'R', comm->dev, tid, comm->cudaDev);
   }
   struct ncclNetSocketTask* r = queue->tasks+queue->next;
@@ -482,7 +500,7 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) {
     if (r->op == NCCL_SOCKET_RECV && data > r->size) {
       char line[SOCKET_NAME_MAXLEN+1];
       union ncclSocketAddress addr;
-      ncclSocketGetAddr(r->ctrlSock, &addr);
+      NCCLCHECK(ncclSocketGetAddr(r->ctrlSock, &addr));
       WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, \
           there may be a mismatch in collective sizes or environment settings (e.g. NCCL_PROTO, NCCL_ALGO) between ranks",
           ncclSocketToString(&addr, line), data, r->size);
@@ -579,7 +597,7 @@ ncclResult_t ncclNetSocketClose(void* opaqueComm) {
         res->stop = 1;
         pthread_cond_signal(&res->threadCond);
         pthread_mutex_unlock(&res->threadLock);
-        pthread_join(comm->helperThread[i], NULL);
+        PTHREADCHECK(pthread_join(comm->helperThread[i], NULL), "pthread_join");
       }
       free(res->threadTaskQueue.tasks);
     }
diff --git a/src/transport/nvls.cc b/src/transport/nvls.cc
index 61d5946c4c..aa9c486b14 100644
--- a/src/transport/nvls.cc
+++ b/src/transport/nvls.cc
@@ -26,7 +26,7 @@ struct localRegData {
   intptr_t offset;
 };
 
-ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+ncclResult_t nvlsCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
   // This transport cannot be used for p2p
   *ret = 0;
   return ncclSuccess;
@@ -71,28 +71,31 @@ ncclResult_t nvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop,
 
 ncclResult_t nvlsGroupConnect(struct ncclComm *comm, char *shareableHandle, int rank, CUmemGenericAllocationHandle *mcHandle) {
   CUmemAllocationHandleType type = ncclCuMemHandleType;
-
+  int fd = -1;
+  ncclResult_t ret = ncclSuccess;
   INFO(NCCL_NVLS, "NVLS importing shareableHandle %p from rank %d", shareableHandle, rank);
 
   // Import and map the remote memory descriptor to the local GPU
   if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
     // cuMem UDS support
-    int fd = -1;
     TRACE(NCCL_NVLS, "NVLS rank %d Importing shareable handle %p from rank %d", comm->localRank, shareableHandle, rank);
-    int tpProxyRank = comm->topParentRanks[rank];
     TRACE(NCCL_NVLS, "NVLS rank %d request conversion of handle 0x%lx from rank %d", comm->localRank, *(uint64_t*)shareableHandle, rank);
-    NCCLCHECK(ncclProxyClientGetFdBlocking(comm, tpProxyRank, shareableHandle, &fd));
+    NCCLCHECKGOTO(ncclProxyClientGetFdBlocking(comm, rank, shareableHandle, &fd), ret, fail);
     TRACE(NCCL_NVLS, "NVLS rank %d received converted fd %d from rank %d", comm->localRank, fd, rank);
-    CUCHECK(cuMemImportFromShareableHandle(mcHandle, (void *)(uintptr_t)fd, type));
-    (void) close(fd);
+    CUCHECKGOTO(cuMemImportFromShareableHandle(mcHandle, (void *)(uintptr_t)fd, type), ret, fail);
+    SYSCHECK(close(fd), "close");
   } else {
     if (type == CU_MEM_HANDLE_TYPE_FABRIC) {
-      CUCHECK(cuMemImportFromShareableHandle(mcHandle, (void *)shareableHandle, type));
+      CUCHECKGOTO(cuMemImportFromShareableHandle(mcHandle, (void *)shareableHandle, type), ret, fail);
     } else {
       memcpy(mcHandle, shareableHandle, sizeof(CUmemGenericAllocationHandle));
     }
   }
-  return ncclSuccess;
+exit:
+  return ret;
+fail:
+  if (fd != -1) close(fd);
+  goto exit;
 }
 
 ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, size_t size, CUmemGenericAllocationHandle* mcHandle) {
@@ -100,7 +103,7 @@ ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, size_t size, CUmemGenericAll
   INFO(NCCL_NVLS, "NVLS Unbind MC handle %llx size %zu dev %d", *mcHandle, size, dev);
 
   // Unbind physical memory from group for the given device
-  CUCHECK(cuMulticastUnbind(*mcHandle, dev, 0/*mcOffset*/, size));
+  if (size) CUCHECK(cuMulticastUnbind(*mcHandle, dev, 0/*mcOffset*/, size));
 
   return ncclSuccess;
 }
@@ -117,14 +120,18 @@ ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t size, void* ucptr,
   INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) MC handle 0x%llx(%p)", *ucHandle, ucptr, *mcHandle, mcptr);
 
   // Release the UC memory and mapping
-  CUCHECK(cuMemUnmap((CUdeviceptr)ucptr, size));
-  CUCHECK(cuMemAddressFree((CUdeviceptr)ucptr, size));
-  CUCHECK(cuMemRelease(*ucHandle));
+  if (ucptr) {
+    CUCHECK(cuMemUnmap((CUdeviceptr)ucptr, size));
+    CUCHECK(cuMemAddressFree((CUdeviceptr)ucptr, size));
+    CUCHECK(cuMemRelease(*ucHandle));
+  }
 
   // Release the MC memory and mapping
-  CUCHECK(cuMemUnmap((CUdeviceptr)mcptr, size));
-  CUCHECK(cuMemAddressFree((CUdeviceptr)mcptr, size));
-  CUCHECK(cuMemRelease(*mcHandle));
+  if (mcptr) {
+    CUCHECK(cuMemUnmap((CUdeviceptr)mcptr, size));
+    CUCHECK(cuMemAddressFree((CUdeviceptr)mcptr, size));
+    CUCHECK(cuMemRelease(*mcHandle));
+  }
 
   return ncclSuccess;
 }
@@ -191,7 +198,9 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularit
   size_t size = *sizePtr;
   size_t originSize = size;
   size_t ucgran, mcgran;
+  int allocMcHandle = 0;
 
+  *ucptr = *mcptr = NULL;
   memset(&mcprop, 0, sizeof(CUmulticastObjectProp));
   mcprop.numDevices = comm->localRanks;
   mcprop.handleTypes = ncclCuMemHandleType;
@@ -203,10 +212,12 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularit
 
   if (comm->localRank == 0) {
     NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, mcHandle, shareableHandle), ret, fail);
+    allocMcHandle = 1;
     NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
   } else {
     NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
     NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], mcHandle), ret, fail);
+    allocMcHandle = 1;
   }
 
   CUCHECKGOTO(cuMulticastAddDevice(*mcHandle, comm->cudaDev), ret, fail);
@@ -226,6 +237,8 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularit
   CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*ucptr, size, desc, 1), ret, fail);
   CUDACHECKGOTO(cudaMemset(*ucptr, 0, size), ret, fail);
 
+  // intra-node barrier to mitigate the possible hang in cuMulticastBindMem during abort
+  NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
   // Bind physical memory to the Multicast group
   // NB: It will block until all ranks have been added to the Group
   CUCHECKGOTO(cuMulticastBindMem(*mcHandle, 0/*mcOffset*/, *ucHandle, 0/*memOffset*/, size, 0/*flags*/), ret, fail);
@@ -239,6 +252,7 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularit
 exit:
   return ret;
 fail:
+  if (allocMcHandle && *mcptr == NULL && *ucptr == NULL) CUCHECK(cuMemRelease(*mcHandle));
   goto exit;
 }
 
@@ -350,10 +364,10 @@ setup:
     struct ncclNvlsSharedRes* resources = NULL;
     int nHeads = comm->channels[0].nvls.nHeads;
     int nChannels = comm->nChannels;
-    size_t memSize = 16;
+    size_t memSize = 64;
     size_t creditSize = nChannels * 2 * memSize * nHeads;
     int nvlsStepSize = comm->nvlsChunkSize;
-  
+
     NCCLCHECKGOTO(ncclCalloc(&comm->nvlsResources, 1), res, fail);
     comm->nvlsResources->inited = false;
     comm->nvlsResources->refCount = 1;
@@ -466,7 +480,7 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
     if (!comm->MNNVL && resources->nvlsShmemHandle)
       NCCLCHECK(ncclShmClose(resources->nvlsShmemHandle));
 
-    if (resources->ucCredit && resources->mcCredit) {
+    if (resources->ucCredit || resources->mcCredit) {
       NCCLCHECK(nvlsGroupUnbind(comm, resources->creditSize, &resources->mcCreditHandle));
       NCCLCHECK(nvlsGroupUnmapMem(comm, resources->creditSize, resources->ucCredit, &resources->ucCreditHandle, resources->mcCredit, &resources->mcCreditHandle));
     }
@@ -490,7 +504,6 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
   char shareableHandle[NVLS_HANDLE_SIZE];
   CUmemGenericAllocationHandle mcHandle;
   size_t minSize = SIZE_MAX;
-  bool localRegBufUsed = false;
   struct localRegData* regData = NULL;
   cudaPointerAttributes attr;
   size_t ucgran, mcgran;
@@ -500,7 +513,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
   if (userBuff) {
     NCCLCHECKGOTO(ncclRegFind(comm, (void*)userBuff, buffSize, &regRecord), ret, fail);
     if (regRecord) {
-      CUDACHECK(cudaPointerGetAttributes(&attr, (void*)regRecord->addr));
+      CUDACHECKGOTO(cudaPointerGetAttributes(&attr, (void*)regRecord->addr), ret, fail);
       if (attr.type == cudaMemoryTypeDevice) {
         size_t regSize = regRecord->pages * comm->regCache.pageSize;
         memset(&mcprop, 0, sizeof(CUmulticastObjectProp));
@@ -508,7 +521,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
         mcprop.handleTypes = ncclCuMemHandleType;
         mcprop.flags = 0;
         mcprop.size = regSize;
-        CUCHECK(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
+        CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
 
         memset(&ucprop, 0, sizeof(CUmemAllocationProp));
         ucprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
@@ -517,7 +530,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
         ucprop.requestedHandleTypes = ncclCuMemHandleType;
         CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail);
 
-        CUCHECK(cuMemGetAddressRange((CUdeviceptr*)&regRecord->baseAddr, &regRecord->baseSize, (CUdeviceptr)regRecord->addr));
+        CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&regRecord->baseAddr, &regRecord->baseSize, (CUdeviceptr)regRecord->addr), ret, fail);
         if (regSize % mcgran == 0) {
           regRecord->regSize = regSize;
         } else {
@@ -560,6 +573,9 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
   }
 
   CUCHECKGOTO(cuMulticastAddDevice(mcHandle, comm->nvlsResources->dev), ret, fail);
+  // Coverity complains that regRecord could be NULL.  That won't in practice be the case because we've already checked
+  // (regData[i].reg.state & NVLS_REG_POSSIBLE) of all local ranks, which would catch it and bail out.
+  // coverity[var_deref_op]
   CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->addr, minSize, 0), ret, fail);
 
   // Create a VA for the NVLS
@@ -584,15 +600,13 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
     }
   }
 
-  localRegBufUsed = true;
-
+  *regAddr = (uintptr_t)regPtr + regData[comm->localRank].offset;
+  *regUsed = true;
 exit:
-  if (localRegBufUsed) *regAddr = (uintptr_t)regPtr + regData[comm->localRank].offset;
-  *regUsed = localRegBufUsed;
   free(regData);
   return ret;
 fail:
-  localRegBufUsed = false;
+  *regUsed = false;
   goto exit;
 }
 
@@ -862,19 +876,21 @@ exit:
     }
 
     if (recvRecord) {
+      // Yes, it's a dead code.  That's fine...
+      // coverity[dead_error_begin]
       ncclNvlsDeregBuffer(&recvRecord->mcHandle, recvRecord->ptr, recvRecord->dev, recvRecord->size);
       free(recvRecord);
     }
   } else {
     if (sendRecord) {
       *outRegBufSend = (void*)((uintptr_t)regSendPtr + (uintptr_t)sendbuff - (uintptr_t)baseSend);
-      ncclIntruQueueEnqueue(cleanupQueue, &sendRecord->base);
+      ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)sendRecord);
       *nCleanupQueueEltsAdded += 1;
     }
 
     if (recvRecord) {
       *outRegBufRecv = (void*)((uintptr_t)regRecvPtr + (uintptr_t)recvbuff - (uintptr_t)baseRecv);
-      ncclIntruQueueEnqueue(cleanupQueue, &recvRecord->base);
+      ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)recvRecord);
       *nCleanupQueueEltsAdded += 1;
     }
 
diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
index 90a714b409..6569ae175e 100644
--- a/src/transport/p2p.cc
+++ b/src/transport/p2p.cc
@@ -7,9 +7,11 @@
 #include "comm.h"
 #include "graph.h"
 #include "utils.h"
-#include "shm.h"
+#include "shmutils.h"
 #include "p2p.h"
 #include "transport.h"
+#include <assert.h>
+#include "shm.h"
 
 enum p2pType { P2P_DIRECT, P2P_INTERMEDIATE, P2P_IPC, P2P_CUMEM };
 
@@ -19,16 +21,28 @@ struct ncclP2pBuff {
   ncclIpcDesc ipcDesc;
 };
 
+struct ncclP2pRequest {
+  size_t size;
+  int refcount;
+};
+
 struct p2pConnectInfo {
   int rank;
   int read;
   struct ncclP2pBuff p2pBuff;
   // Used by CE memcpy
-  char shmName[7];
-  int shmSize;
+  ncclShmIpcDesc_t desc;
 };
 static_assert(sizeof(struct p2pConnectInfo) <= CONNECT_SIZE, "p2pConnectInfo is too large");
 
+struct p2pIpcExpInfo {
+  ncclIpcDesc ipcDesc;
+  bool legacyIpcCap;
+  int impFd;
+  size_t size;
+  uintptr_t offset;
+};
+
 struct p2pShm {
   struct ncclSendMem sendMem;
   struct ncclRecvMem recvMem;
@@ -37,9 +51,7 @@ struct p2pShmProxyInfo {
   // Shared memory between proxy and receiving GPU
   struct p2pShm* shm;
   struct p2pShm* devShm;
-  char shmName[7];
-  int shmSize;
-  ncclShmHandle_t handle;
+  ncclShmIpcDesc_t desc;
 
   // Intermediate step for sender
   struct ncclRecvMem* ceRecvMem;
@@ -62,13 +74,14 @@ struct p2pResources {
     struct ncclRecvMem* recvDevMem;
   };
   void* sendMemIpc;
+  int sendMemSameProc;
   void* recvMemIpc;
+  int recvMemSameProc;
   // CE memcpy support
   struct p2pShmProxyInfo proxyInfo;
   struct p2pShm* shm;
   struct p2pShm* devShm;
-  int shmSize;
-  ncclShmHandle_t handle;
+  ncclShmIpcDesc_t desc;
 };
 
 // cuMem API support
@@ -104,12 +117,12 @@ static void initCeOperation();
 extern int64_t ncclParamMNNVLEnable();
 
 /* Determine if two peers can communicate through p2p */
-ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+ncclResult_t p2pCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
   initCeOperation();
 
   // MNNVL support
-  if (ncclParamMNNVLEnable() != 0 && info1->hostHash != info2->hostHash) {
-    NCCLCHECK(ncclTopoCheckMNNVL(topo, info1, info2, ret));
+  if (comm->MNNVL && info1->hostHash != info2->hostHash) {
+    NCCLCHECK(ncclTopoCheckMNNVL(comm->topo, info1, info2, ret));
     if (*ret) return ncclSuccess;
   }
 
@@ -121,7 +134,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
 
   // Check topology / p2p level.
   int intermediateRank;
-  NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, NULL, &intermediateRank));
+  NCCLCHECK(ncclTopoCheckP2p(comm->topo, info1->rank, info2->rank, ret, NULL, &intermediateRank));
   if (*ret == 0) return ncclSuccess;
   if (intermediateRank != -1) {
     if (useMemcpy) *ret = 0;
@@ -130,7 +143,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
 
   // Check if NET would work better
   int useNet = 0;
-  NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet));
+  NCCLCHECK(ncclTopoCheckNet(comm->topo, info1->rank, info2->rank, &useNet));
   if (useNet) {
     *ret = 0;
     return ncclSuccess;
@@ -197,7 +210,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
   } while (0)
 
 // cuMem API support
-ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr) {
+ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, int refcount, ncclIpcDesc *ipcDesc, void **ptr) {
   if (ncclCuMemEnable()) {
 #if CUDART_VERSION >= 11030
     CUmemAllocationHandleType type = ncclCuMemHandleType;
@@ -211,6 +224,10 @@ ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, v
     } else {
       CUCHECK(cuMemExportToShareableHandle(&ipcDesc->cuDesc, handle, type, 0));
     }
+    if (refcount) {
+      memcpy(&ipcDesc->memHandle, &handle, sizeof(handle));
+      for (int r = 0; r < refcount; ++r) CUCHECK(cuMemRetainAllocationHandle(&handle, *ptr));
+    }
 #else
     return ncclInternalError;
 #endif
@@ -233,7 +250,7 @@ ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc) {
   return ncclSuccess;
 }
 
-ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr) {
+ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int peer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr) {
   if (ncclCuMemEnable()) {
 #if CUDART_VERSION >= 11030
     // cuMem API support
@@ -241,16 +258,25 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz
     CUmemAllocationHandleType type = ncclCuMemHandleType;
     CUmemGenericAllocationHandle handle;
     ncclCuDesc *cuDesc = &ipcDesc->cuDesc;
+    CUmemAllocationProp prop = {};
+    size_t granularity = 0;
+
+    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    prop.requestedHandleTypes = type;
+    prop.location.id = comm->cudaDev;
+    CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+    ALIGN_SIZE(size, granularity);
 
     // Import and map the remote memory descriptor to the local GPU
     if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
       // UDS fd support
       int fd = -1;
       // Send cuMem handle to remote for conversion to an fd
-      NCCLCHECK(ncclProxyClientGetFdBlocking(comm, tpPeer, &cuDesc->data, &fd));
-      INFO(NCCL_P2P, "UDS converted handle 0x%lx to fd %d on remote peer %d", *(uint64_t*)&cuDesc->data, fd, tpPeer);
+      NCCLCHECK(ncclProxyClientGetFdBlocking(comm, peer, &cuDesc->data, &fd));
+      INFO(NCCL_P2P, "UDS converted handle 0x%lx to fd %d on remote peer %d", *(uint64_t*)&cuDesc->data, fd, peer);
       CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)fd, type));
-      (void) close(fd);
+      SYSCHECK(close(fd), "close");
     } else {
       CUCHECK(cuMemImportFromShareableHandle(&handle, cuDesc, type));
     }
@@ -291,7 +317,7 @@ static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo*
   int p2p;
   // Queries the topology to see if the GPUs are Ampere and
   // connected via NVLink, if so we enable P2P Read by default
-  NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, &p2p, read, intermediateRank));
+  NCCLCHECK(ncclTopoCheckP2p(topo, info1->rank, info2->rank, &p2p, read, intermediateRank));
 
   int readEnable = ncclParamP2pReadEnable();
   if (readEnable != -2) *read = readEnable;
@@ -311,24 +337,23 @@ static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* pro
             peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err));
         return ncclInternalError;
       }
-#if CUDART_VERSION >= 11030
-      // cuMem API support
       if (ncclCuMemEnable()) {
-        // Allow direct access to the remote buffer from the local GPU
-        CUmemAccessDesc accessDesc = {};
-        accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-        accessDesc.location.id = myInfo->cudaDev;
-        accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-        INFO(NCCL_P2P, "Set Access for buffer %p size %zu on dev %d", p2pBuff->directPtr, p2pBuff->size, peerInfo->cudaDev);
-        CUCHECK(cuMemSetAccess((CUdeviceptr) p2pBuff->directPtr, p2pBuff->size, &accessDesc, 1));
+        // for intra-process ranks, we should map memHandle of the peers to increase refcount.
+        // Otherwise, if peers abort and free the buffer, the rank can suffer invalid access.
+        NCCLCHECK(ncclCuMemAllocAddr(devMem, &p2pBuff->ipcDesc.memHandle, p2pBuff->size));
+        CUCHECK(cuMemRelease(p2pBuff->ipcDesc.memHandle));
+        *ipcPtr = *devMem;
+      } else {
+        *devMem = p2pBuff->directPtr;
+        *ipcPtr = NULL;
       }
-#endif
+    } else {
+      *devMem = p2pBuff->directPtr;
+      *ipcPtr = NULL;
     }
-    *devMem = p2pBuff->directPtr;
-    *ipcPtr = NULL;
   } else {
     // Different PID
-    NCCLCHECK(ncclP2pImportShareableBuffer(comm, comm->topParentRanks[peerInfo->rank], p2pBuff->size, &p2pBuff->ipcDesc, devMem));
+    NCCLCHECK(ncclP2pImportShareableBuffer(comm, peerInfo->rank, p2pBuff->size, &p2pBuff->ipcDesc, devMem));
     *ipcPtr = *devMem;
   }
   return ncclSuccess;
@@ -338,7 +363,7 @@ static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* pro
 ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
     struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
   struct p2pResources* resources;
-  int tpProxyRank;
+  struct ncclP2pRequest req;
   NCCLCHECK(ncclCalloc(&resources, 1));
   send->transportResources = resources;
   int useRead, intermediateRank;
@@ -387,15 +412,18 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
 	  comm->peerInfo[intermediateRank].nvmlDev, useReadStr);
   }
 
-  tpProxyRank = comm->topParentRanks[info->rank];
-  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpProxyRank, &send->proxyConn));
+  req.size = sendSize;
+  req.refcount = 0;
+  if (P2P_SAME_PID((comm->peerInfo + info->rank), peerInfo) && (comm->peerInfo[info->rank].cudaDev != peerInfo->cudaDev)) req.refcount++;
+  if (P2P_SAME_PID((comm->peerInfo + info->rank), myInfo) && (comm->peerInfo[info->rank].cudaDev != myInfo->cudaDev)) req.refcount++;
+  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn));
   if (useMemcpy) {
     NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pShmProxyInfo)));
-    info->shmSize = resources->proxyInfo.shmSize;
-    memcpy(info->shmName, resources->proxyInfo.shmName, sizeof(info->shmName));
+    memcpy(&info->desc, &resources->proxyInfo.desc, sizeof(ncclShmIpcDesc_t));
   } else {
-    NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
+    NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(struct ncclP2pRequest), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
     NCCLCHECK(p2pMap(comm, &send->proxyConn, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->sendDevMem, &resources->sendMemIpc));
+    resources->sendMemSameProc = P2P_SAME_PID(myInfo, (comm->peerInfo + info->rank));
   }
 
   return ncclSuccess;
@@ -405,7 +433,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
 ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
     struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId, int connIndex) {
   struct p2pResources* resources;
-  int tpProxyRank;
+  struct ncclP2pRequest req;
   NCCLCHECK(ncclCalloc(&resources, 1));
   recv->transportResources = resources;
   int useRead, intermediateRank;
@@ -444,11 +472,15 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
     info->rank = intermediateRank;
   }
 
-  tpProxyRank = comm->topParentRanks[info->rank];
-  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, tpProxyRank, &recv->proxyConn));
-  NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
+  req.size = recvSize;
+  req.refcount = 0;
+  if (P2P_SAME_PID((comm->peerInfo + info->rank), peerInfo) && (comm->peerInfo[info->rank].cudaDev != peerInfo->cudaDev)) req.refcount++;
+  if (P2P_SAME_PID((comm->peerInfo + info->rank), myInfo) && (comm->peerInfo[info->rank].cudaDev != myInfo->cudaDev)) req.refcount++;
+  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, info->rank, &recv->proxyConn));
+  NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(struct ncclP2pRequest), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
 
   NCCLCHECK(p2pMap(comm, &recv->proxyConn, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->recvDevMem, &resources->recvMemIpc));
+  resources->recvMemSameProc = P2P_SAME_PID(myInfo, (comm->peerInfo + info->rank));
   return ncclSuccess;
 }
 
@@ -459,6 +491,7 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co
   struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
 
   NCCLCHECK(p2pMap(comm, &send->proxyConn, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->recvMemIpc));
+  resources->recvMemSameProc = P2P_SAME_PID((comm->peerInfo + rank), (comm->peerInfo + info->rank));
 
   char* buff = (char*)(remDevMem+1);
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
@@ -499,17 +532,14 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
   struct ncclSendMem* remDevMem = NULL;
 
   if (useMemcpy) {
-    char shmPath[PATH_MAX];
-    sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
-    TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
-    resources->shmSize = info->shmSize;
     // Attach to peer's SHM segment
-    NCCLCHECK(ncclShmOpen(shmPath, info->shmSize, (void**)&resources->shm, (void**)&resources->devShm, -1, &resources->handle));
+    NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->shm, (void**)&resources->devShm, &resources->desc));
 
     recv->conn.tail = &resources->devShm->recvMem.tail;
     recv->conn.head = &resources->devShm->sendMem.head;
   } else {
     NCCLCHECK(p2pMap(comm, &recv->proxyConn, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc));
+    resources->sendMemSameProc = P2P_SAME_PID((comm->peerInfo + rank), (comm->peerInfo + info->rank));
 
     struct ncclRecvMem* devMem = resources->recvDevMem;
     recv->conn.tail = &devMem->tail;
@@ -538,8 +568,21 @@ ncclResult_t p2pSendFree(struct ncclConnector* send) {
   if (resources) {
     if (ncclCuMemEnable()) {
       // cuMem API support
-      if (resources->sendMemIpc) NCCLCHECK(ncclCudaFree(resources->sendMemIpc));
-      if (resources->recvMemIpc) NCCLCHECK(ncclCudaFree(resources->recvMemIpc));
+      if (resources->sendMemIpc) {
+        if (resources->sendMemSameProc) {
+          NCCLCHECK(ncclCuMemFreeAddr(resources->sendMemIpc));
+        } else {
+          NCCLCHECK(ncclCudaFree(resources->sendMemIpc));
+        }
+      }
+
+      if (resources->recvMemIpc) {
+        if (resources->recvMemSameProc) {
+          NCCLCHECK(ncclCuMemFreeAddr(resources->recvMemIpc));
+        } else {
+          NCCLCHECK(ncclCudaFree(resources->recvMemIpc));
+        }
+      }
     }
     else {
       if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
@@ -555,14 +598,27 @@ ncclResult_t p2pRecvFree(struct ncclConnector* recv) {
   if (resources) {
     if (ncclCuMemEnable()) {
       // cuMem API support
-      if (resources->sendMemIpc) NCCLCHECK(ncclCudaFree(resources->sendMemIpc));
-      if (resources->recvMemIpc) NCCLCHECK(ncclCudaFree(resources->recvMemIpc));
+      if (resources->sendMemIpc) {
+        if (resources->sendMemSameProc) {
+          NCCLCHECK(ncclCuMemFreeAddr(resources->sendMemIpc));
+        } else {
+          NCCLCHECK(ncclCudaFree(resources->sendMemIpc));
+        }
+      }
+
+      if (resources->recvMemIpc) {
+        if (resources->recvMemSameProc) {
+          NCCLCHECK(ncclCuMemFreeAddr(resources->recvMemIpc));
+        } else {
+          NCCLCHECK(ncclCudaFree(resources->recvMemIpc));
+        }
+      }
     }
     else {
       if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
       if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc));
       if (useMemcpy) {
-        NCCLCHECK(ncclShmClose(resources->handle));
+        NCCLCHECK(ncclShmIpcClose(&resources->desc));
       }
     }
     free(resources);
@@ -574,29 +630,27 @@ static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, st
   if (useMemcpy) {
     // CE memcpy support
     struct p2pShmProxyInfo* proxyInfo;
+    size_t shmSize;
+
+    if (respSize != sizeof(struct p2pShmProxyInfo)) return ncclInternalError;
     NCCLCHECK(ncclCalloc(&proxyInfo, 1));
     connection->transportResources = proxyInfo;
 
     NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE]));
 
-    char shmPath[PATH_MAX];
-    shmPath[0] = '\0';
-    proxyInfo->shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem);
     // Create a SHM segment for the peer to attach to
-    NCCLCHECK(ncclShmOpen(shmPath, proxyInfo->shmSize, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm, 1, &proxyInfo->handle));
-    TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, proxyInfo->shmSize);
-    memcpy(proxyInfo->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(proxyInfo->shmName));
+    shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem);
+    NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, shmSize, false, &proxyInfo->desc, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm));
 
     NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
-
-    if (respSize != sizeof(struct p2pShmProxyInfo)) return ncclInternalError;
     memcpy(respBuff, proxyInfo, sizeof(struct p2pShmProxyInfo));
   } else {
-    if (reqSize != sizeof(int)) return ncclInternalError;
-    int size = *((int*)reqBuff);
+    struct ncclP2pRequest* req = (struct ncclP2pRequest*)reqBuff;
+    if (reqSize != sizeof(struct ncclP2pRequest)) return ncclInternalError;
+    int size = req->size;
     if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
     struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff;
-    NCCLCHECK(ncclP2pAllocateShareableBuffer(size, &p2pBuff->ipcDesc, &p2pBuff->directPtr));
+    NCCLCHECK(ncclP2pAllocateShareableBuffer(size, req->refcount, &p2pBuff->ipcDesc, &p2pBuff->directPtr));
     p2pBuff->size = size;
     if (ncclCuMemEnable()) {
       // cuMem API support
@@ -613,11 +667,12 @@ static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, st
 }
 
 static ncclResult_t p2pRecvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
-  if (reqSize != sizeof(int)) return ncclInternalError;
-  int size = *((int*)reqBuff);
+  struct ncclP2pRequest* req = (struct ncclP2pRequest*)reqBuff;
+  if (reqSize != sizeof(struct ncclP2pRequest)) return ncclInternalError;
+  int size = req->size;
   if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
   struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff;
-  NCCLCHECK(ncclP2pAllocateShareableBuffer(size, &p2pBuff->ipcDesc, &p2pBuff->directPtr));
+  NCCLCHECK(ncclP2pAllocateShareableBuffer(size, req->refcount, &p2pBuff->ipcDesc, &p2pBuff->directPtr));
   p2pBuff->size = size;
   if (ncclCuMemEnable()) {
     // cuMem API support
@@ -651,7 +706,7 @@ static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, str
   if (useMemcpy) {
     struct p2pShmProxyInfo* proxyInfo = (struct p2pShmProxyInfo*)connection->transportResources;
     if (proxyInfo) {
-      NCCLCHECK(ncclShmClose(proxyInfo->handle));
+      NCCLCHECK(ncclShmIpcClose(&proxyInfo->desc));
       NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem));
       NCCLCHECK(ncclCudaFree(proxyInfo->ceDevBuff));
       CUDACHECK(cudaStreamDestroy(proxyInfo->stream));
@@ -752,11 +807,382 @@ static ncclResult_t p2pSendProxyProgress(struct ncclProxyState* proxyState, stru
   return ncclSuccess;
 }
 
+// Register a user-registered buffer with every rank in peerRanks over IPC, reusing any registration already cached in its
+// ncclReg record. When *regBufFlag is set to 1, *offsetOut is userbuff's offset inside the registered region and *peerRmtAddrsOut
+// is the device array of remote addresses (collective) or the peer's remote address (p2p). Soft failures return ncclSuccess with *regBufFlag == 0.
+ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut) {
+  ncclResult_t ret = ncclSuccess;
+  struct ncclReg *regRecord = NULL;
+  struct ncclIpcRegInfo* newInfo = NULL;
+  uintptr_t* peerRmtAddrs = NULL;
+  bool legacyIpcCap = false;
+  size_t baseSize = 0;
+  void* baseAddr = NULL;
+  bool needUpdate = false;
+
+  *regBufFlag = 0;
+  *offsetOut = 0;
+  *peerRmtAddrsOut = NULL;
+  if (comm && userbuff && buffSize > 0 && nPeers > 0) {
+    NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, &regRecord), ret, fail);
+    if (regRecord) {
+      // buffer was registered by users, we need to start to register or reuse it
+      int peerLocalRank;
+      for (int p = 0; p < nPeers; p++) {
+        int peerRank = peerRanks[p];
+        peerLocalRank = comm->rankToLocalRank[peerRank];
+        if (regRecord->ipcInfos[peerLocalRank]) {
+          // We already have IPC info for peerLocalRank, no need to register it, we can reuse it
+          *regBufFlag = 1;
+          INFO(NCCL_REG, "rank %d - IPC local reuse buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p", comm->rank, userbuff, buffSize, (void*)regRecord->addr, regRecord->pages * comm->regCache.pageSize, peerRank, regRecord->ipcInfos[peerLocalRank]->impInfo.rmtRegAddr);
+        } else {
+          // Register buffer with peerLocalRank
+          struct ncclProxyConnector* proxyConn = NULL;
+          struct p2pIpcExpInfo ipcInfo;
+          if (baseAddr == NULL) {
+            CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail);
+            CUCHECKGOTO(cuPointerGetAttribute((void*)&legacyIpcCap, CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, (CUdeviceptr)baseAddr), ret, fail);
+          }
+          if (comm->gproxyConn[peerRank].initialized == false)
+            NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, peerRank, &comm->gproxyConn[peerRank]), ret, fail);
+          proxyConn = &comm->gproxyConn[peerRank];
+          ipcInfo.legacyIpcCap = legacyIpcCap;
+          // Get the mem handle for that buffer. It may have been allocated through cudaMalloc in which case we'll get the CUDA legacy mem handle, or through cuMem*.
+          if (ipcInfo.legacyIpcCap) {
+            // legacy export
+            if (comm->directMode) goto fail;
+            CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
+          } else if (ncclCuMemEnable()) {
+            CUmemGenericAllocationHandle handle;
+            if (CUPFN(cuMemRetainAllocationHandle(&handle, baseAddr)) != CUDA_SUCCESS) {
+              // if cuMem* export fails, retry legacy export
+              if (comm->directMode) goto fail;
+              CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
+              ipcInfo.legacyIpcCap = true;
+            } else {
+              // cuMem* export to file descriptor or fabric handle
+              if (proxyConn->sameProcess) {
+                memcpy(&ipcInfo.ipcDesc.memHandle, &handle, sizeof(CUmemGenericAllocationHandle));
+              } else {
+                if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
+                  int expFd = -1;
+                  CUCHECKGOTO(cuMemExportToShareableHandle(&expFd, handle, ncclCuMemHandleType, 0), ret, fail);
+                  NCCLCHECKGOTO(ncclProxyClientQueryFdBlocking(comm, proxyConn, expFd, &ipcInfo.impFd), ret, fail);
+                  SYSCHECKGOTO(close(expFd), "close", ret, fail);
+                } else {
+                  // Allow this to silently fail for cases where the user buff cannot be registered
+                  if (CUPFN(cuMemExportToShareableHandle(&ipcInfo.ipcDesc.cuDesc.handle, handle, ncclCuMemHandleType, 0)) != CUDA_SUCCESS) {
+                    CUCHECKGOTO(cuMemRelease(handle), ret, fail);
+                    goto fail;
+                  }
+                }
+              }
+              CUCHECKGOTO(cuMemRelease(handle), ret, fail);
+            }
+          } else {
+            // nothing works, just return
+            goto fail;
+          }
+
+          void* rmtRegAddr = NULL;
+          ipcInfo.size = baseSize;
+          ipcInfo.offset = regRecord->addr - (uintptr_t)baseAddr;
+          // Now ipcInfo contains all necessary registration info. Start to register buffer on proxy side and get the remote register address back.
+          NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail);
+          if (rmtRegAddr) {
+            // Do every fallible allocation before publishing newInfo, so the fail path can never free a record regRecord still references.
+            if (regRecord->regIpcAddrs.hostPeerRmtAddrs == NULL) {
+              NCCLCHECKGOTO(ncclCalloc(&regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks), ret, fail);
+            }
+            NCCLCHECKGOTO(ncclCalloc(&newInfo, 1), ret, fail);
+            assert(regRecord->ipcInfos[peerLocalRank] == NULL);
+            regRecord->state |= IPC_REG_COMPLETE;
+            newInfo->peerRank = peerRank;
+            newInfo->baseAddr = baseAddr;
+            newInfo->impInfo.rmtRegAddr = rmtRegAddr;
+            newInfo->impInfo.offset = ipcInfo.offset;
+            newInfo->impInfo.legacyIpcCap = ipcInfo.legacyIpcCap;
+            newInfo->ipcProxyconn = proxyConn;
+            regRecord->ipcInfos[peerLocalRank] = newInfo;
+            newInfo = NULL;  // ownership moved to regRecord; a later failure must not free it
+            regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank] = (uintptr_t)rmtRegAddr;
+            needUpdate = true;
+            *regBufFlag = 1;
+            INFO(NCCL_REG, "rank %d - IPC local register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr);
+          }
+        }
+      }
+
+      if (*regBufFlag) {
+        if (type == NCCL_IPC_COLLECTIVE) {
+          // for collective, store registered remote buffers into dev memory for future reference
+          if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL || needUpdate) {
+            NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
+            if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL)
+              NCCLCHECKGOTO(ncclCudaCallocAsync(&regRecord->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
+            if (needUpdate)
+              NCCLCHECKGOTO(ncclCudaMemcpyAsync(regRecord->regIpcAddrs.devPeerRmtAddrs, regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
+            NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail);
+            NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail);
+          }
+          peerRmtAddrs = regRecord->regIpcAddrs.devPeerRmtAddrs;
+        } else {
+          assert(nPeers == 1);
+          // p2p always returns remote addr here since remote buffer addr is passed in ncclDevWorkP2p struct
+          peerRmtAddrs = (uintptr_t*)regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank];
+        }
+        *offsetOut = (uintptr_t)userbuff - regRecord->addr;
+        *peerRmtAddrsOut = peerRmtAddrs;
+      }
+    }
+  }
+
+exit:
+  return ret;
+fail:
+  *regBufFlag = 0;
+  *offsetOut = 0;
+  *peerRmtAddrsOut = NULL;
+  free(newInfo);  // only non-NULL while the record is still unpublished
+  goto exit;
+}
+
+struct ncclIpcCleanupCallback {  // Deferred-cleanup record queued by ncclIpcGraphRegisterBuffer and consumed by cleanupIpc.
+  struct ncclCommCallback base;  // must remain the first member: cleanupIpc casts the ncclCommCallback* it receives back to this struct
+  bool isAddrs;  // selects the live union member: true -> regIpcAddrs, false -> regInfo
+  union {
+    struct ncclIpcRegInfo regInfo;  // one peer registration, undone with ncclIpcDeregBuffer
+    struct ncclPeerRegIpcAddr regIpcAddrs;  // host/device remote-address arrays, freed by cleanupIpc
+  };
+};
+
+// Cleanup-queue callback: deregisters one peer registration or frees the remote-address arrays, then frees the record itself.
+static ncclResult_t cleanupIpc(struct ncclComm* comm, struct ncclCommCallback* cb) {
+  struct ncclIpcCleanupCallback* obj = (struct ncclIpcCleanupCallback*)cb;
+  ncclResult_t ret = ncclSuccess;
+  if (!obj->isAddrs) NCCLCHECKGOTO(ncclIpcDeregBuffer(comm, &obj->regInfo), ret, done);
+  if (obj->isAddrs) {
+    free(obj->regIpcAddrs.hostPeerRmtAddrs);  // free(NULL) is a no-op, no guard needed
+    if (obj->regIpcAddrs.devPeerRmtAddrs) NCCLCHECKGOTO(ncclCudaFree(obj->regIpcAddrs.devPeerRmtAddrs), ret, done);
+  }
+done:
+  free(obj);  // always release the record, even when the step above failed; the error is still returned
+  return ret;
+}
+
+// Register userbuff with every rank in peerRanks over IPC for use inside a CUDA graph. Each successful peer registration
+// (and, for collectives, the record owning the host/device remote-address arrays) is appended to a cleanup queue as a cleanupIpc callback.
+// On return *offsetOut is userbuff's offset from its allocation base; *peerRmtAddrsOut is the device address array (collective) or the peer address (p2p).
+ncclResult_t ncclIpcGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut, void* cleanupQueuePtr, int* nCleanupQueueElts) {
+  ncclResult_t ret = ncclSuccess;
+  struct ncclProxyConnector* proxyConn = NULL;
+  struct p2pIpcExpInfo ipcInfo;
+  void* baseAddr;
+  size_t baseSize;
+  struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue = reinterpret_cast<struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>*>(cleanupQueuePtr);
+  uintptr_t* peerRmtAddrs = NULL;
+  struct ncclIpcCleanupCallback* addrsRecord = NULL;
+
+  *regBufFlag = 0;
+  CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail);
+  CUCHECKGOTO(cuPointerGetAttribute((void*)&ipcInfo.legacyIpcCap, CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, (CUdeviceptr)baseAddr), ret, fail);
+  if (type == NCCL_IPC_COLLECTIVE) {
+    // collective needs host memory array to hold all remote buffer addrs. We need to put this into graph release queue
+    NCCLCHECKGOTO(ncclCalloc(&addrsRecord, 1), ret, fail);
+    addrsRecord->base.fn = cleanupIpc;
+    addrsRecord->isAddrs = true;
+    NCCLCHECKGOTO(ncclCalloc(&addrsRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks), ret, fail);
+  } else {
+    assert(nPeers == 1);
+    // p2p does not need anything, just returning the remote buffer is enough, but for now, we register peer one by one so nPeers must be 1
+  }
+
+  for (int p = 0; p < nPeers; ++p) {
+    int peerRank = peerRanks[p];
+    if (comm->gproxyConn[peerRank].initialized == false)
+      NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, peerRank, &comm->gproxyConn[peerRank]), ret, fail);
+    proxyConn = &comm->gproxyConn[peerRank];
+    // Same as local registration. Get the mem handle for that buffer. It may have been allocated through cudaMalloc in which case we'll get the CUDA legacy mem handle, or through cuMem*.
+    if (ipcInfo.legacyIpcCap) {
+      if (comm->directMode) goto fail;
+      CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
+    } else if (ncclCuMemEnable()) {
+      // cuMem* export
+      CUmemGenericAllocationHandle handle;
+      if (pfn_cuMemRetainAllocationHandle(&handle, baseAddr) != CUDA_SUCCESS) {
+        if (comm->directMode) goto fail;
+        CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
+        ipcInfo.legacyIpcCap = true;
+      } else {
+        if (proxyConn->sameProcess) {
+          memcpy(&ipcInfo.ipcDesc.memHandle, &handle, sizeof(CUmemGenericAllocationHandle));
+        } else {
+          if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
+            int expFd = -1;
+            CUCHECKGOTO(cuMemExportToShareableHandle(&expFd, handle, ncclCuMemHandleType, 0), ret, fail);
+            NCCLCHECKGOTO(ncclProxyClientQueryFdBlocking(comm, proxyConn, expFd, &ipcInfo.impFd), ret, fail);
+            SYSCHECKGOTO(close(expFd), "close", ret, fail);
+          } else {
+            CUCHECKGOTO(cuMemExportToShareableHandle(&ipcInfo.ipcDesc.cuDesc.handle, handle, ncclCuMemHandleType, 0), ret, fail);
+          }
+        }
+        CUCHECKGOTO(cuMemRelease(handle), ret, fail);
+      }
+    } else {
+      goto fail;
+    }
+
+    void* rmtRegAddr = NULL;
+    ipcInfo.size = baseSize;
+    ipcInfo.offset = 0;
+    NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(struct p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail);
+    if (rmtRegAddr) {
+      struct ncclIpcCleanupCallback* record;
+      NCCLCHECKGOTO(ncclCalloc(&record, 1), ret, fail);
+      record->base.fn = cleanupIpc;
+      record->isAddrs = false;
+      record->regInfo.peerRank = peerRank;
+      record->regInfo.baseAddr = baseAddr;
+      record->regInfo.impInfo.rmtRegAddr = rmtRegAddr;
+      record->regInfo.impInfo.offset = 0;
+      record->regInfo.impInfo.legacyIpcCap = ipcInfo.legacyIpcCap;
+      record->regInfo.ipcProxyconn = proxyConn;
+      // store the remote address into host addr array
+      if (type == NCCL_IPC_COLLECTIVE)
+        addrsRecord->regIpcAddrs.hostPeerRmtAddrs[comm->rankToLocalRank[peerRank]] = (uintptr_t)rmtRegAddr;
+      else
+        peerRmtAddrs = (uintptr_t*)rmtRegAddr;
+      *regBufFlag = 1;
+      if (ipcInfo.legacyIpcCap)
+        ncclIntruQueueEnqueue(&comm->legacyRegCleanupQueue, &record->base);
+      else
+        ncclIntruQueueEnqueue(cleanupQueue, &record->base);
+      if (nCleanupQueueElts) *nCleanupQueueElts += 1;
+      INFO(NCCL_REG, "rank %d - IPC graph register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, baseAddr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - (uintptr_t)baseAddr);
+    }
+  }
+
+  if (type == NCCL_IPC_COLLECTIVE) {
+    // allocate the dev addr array (comm->localRanks entries, same as the host array) and copy all previously stored addrs into it.
+    NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
+    NCCLCHECKGOTO(ncclCudaCallocAsync(&addrsRecord->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
+    NCCLCHECKGOTO(ncclCudaMemcpyAsync(addrsRecord->regIpcAddrs.devPeerRmtAddrs, addrsRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
+    NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail);
+    NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail);
+    peerRmtAddrs = addrsRecord->regIpcAddrs.devPeerRmtAddrs;
+    if (ipcInfo.legacyIpcCap)
+      ncclIntruQueueEnqueue(&comm->legacyRegCleanupQueue, &addrsRecord->base);
+    else
+      ncclIntruQueueEnqueue(cleanupQueue, &addrsRecord->base);
+  }
+  *offsetOut = (uintptr_t)userbuff - (uintptr_t)baseAddr;
+  *peerRmtAddrsOut = peerRmtAddrs;
+
+exit:
+  return ret;
+fail:
+  *regBufFlag = 0;
+  *offsetOut = 0;
+  *peerRmtAddrsOut = NULL;
+  if (addrsRecord) {  // never enqueued when we get here, so no cleanup callback will free it for us
+    if (addrsRecord->regIpcAddrs.devPeerRmtAddrs) (void)ncclCudaFree(addrsRecord->regIpcAddrs.devPeerRmtAddrs);
+    free(addrsRecord->regIpcAddrs.hostPeerRmtAddrs);
+    free(addrsRecord);
+  }
+  goto exit;
+}
+
+ncclResult_t ncclIpcDeregBuffer(struct ncclComm* comm, struct ncclIpcRegInfo* regInfo) {  // Undo one peer IPC registration described by regInfo; regInfo itself is not freed here.
+  NCCLCHECK(ncclProxyCallBlocking(comm, regInfo->ipcProxyconn, ncclProxyMsgDeregister, &regInfo->impInfo, sizeof(struct ncclIpcImpInfo), NULL, 0));  // blocking call on the proxy connection saved at registration; no response payload (NULL, 0)
+  INFO(NCCL_REG, "rank %d - IPC deregistered buffer %p peer %d ipc remote buffer %p", comm->rank, regInfo->baseAddr, regInfo->peerRank, regInfo->impInfo.rmtRegAddr);
+  return ncclSuccess;
+}
+
+// Proxy-side handler for a register request: the requesting peer passes a p2pIpcExpInfo describing its buffer; the proxy imports and maps it
+// locally and returns the mapped address (plus the requested offset) in respBuff. On failure a NULL address is returned along with the error.
+static ncclResult_t p2pProxyRegister(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+  struct p2pIpcExpInfo* ipcExpInfo = (struct p2pIpcExpInfo*)reqBuff;
+  void* regAddr = NULL;
+  ncclResult_t ret = ncclSuccess;
+  bool mapped = false;
+  bool imported = false;
+  CUmemGenericAllocationHandle handle;
+
+  assert(sizeof(struct p2pIpcExpInfo) == reqSize);
+  assert(sizeof(void*) == respSize);
+
+  if (ipcExpInfo->legacyIpcCap) {
+    // legacy import
+    CUDACHECKGOTO(cudaIpcOpenMemHandle(&regAddr, ipcExpInfo->ipcDesc.devIpc, cudaIpcMemLazyEnablePeerAccess), ret, fail);
+    regAddr = (void*)((uintptr_t)regAddr + ipcExpInfo->offset);
+  } else {
+    // cuMem import
+    if (connection->sameProcess) {
+      // if proxy is same process as request peer, we just need to map the handle. The handle is borrowed (the requester already released its retained reference), so it is never released here.
+      memcpy(&handle, &ipcExpInfo->ipcDesc.memHandle, sizeof(CUmemGenericAllocationHandle));
+    } else {
+      if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
+        CUCHECKGOTO(cuMemImportFromShareableHandle(&handle, (void*)(uintptr_t)ipcExpInfo->impFd, ncclCuMemHandleType), ret, fail);
+        SYSCHECKGOTO(close(ipcExpInfo->impFd), "close", ret, fail);
+      } else {
+        CUCHECKGOTO(cuMemImportFromShareableHandle(&handle, (void*)&ipcExpInfo->ipcDesc.cuDesc, ncclCuMemHandleType), ret, fail);
+      }
+      imported = true;
+    }
+    CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)&regAddr, ipcExpInfo->size, /* alignment */ 0, /* addr */ 0, /* flags */ 0), ret, fail);
+    CUCHECKGOTO(cuMemMap((CUdeviceptr)regAddr, ipcExpInfo->size, /* offset */ 0, handle, /* flags */ 0), ret, fail);
+    mapped = true;
+    // Allow access by the local GPU
+    CUmemAccessDesc accessDesc = {};
+    accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    accessDesc.location.id = proxyState->cudaDev;
+    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+    CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)regAddr, ipcExpInfo->size, &accessDesc, 1), ret, fail);
+    regAddr = (void*)((uintptr_t)regAddr + ipcExpInfo->offset);
+  }
+  INFO(NCCL_REG, "Proxy rank %d register succeeds, regAddr %p size %ld offset %ld legacyIpcCap %d sameProcess %d", proxyState->tpRank, regAddr, ipcExpInfo->size, ipcExpInfo->offset, ipcExpInfo->legacyIpcCap, connection->sameProcess);
+
+exit:
+  memcpy(respBuff, (void*)&regAddr, sizeof(void*));
+  *done = 1;
+  return ret;
+fail:  // best-effort unwind: results are ignored so respBuff and *done are always written and ret keeps the original error
+  if (!ipcExpInfo->legacyIpcCap) {
+    if (mapped) (void)CUPFN(cuMemUnmap((CUdeviceptr)regAddr, ipcExpInfo->size));
+    if (regAddr) (void)CUPFN(cuMemAddressFree((CUdeviceptr)regAddr, ipcExpInfo->size));
+    if (imported) (void)CUPFN(cuMemRelease(handle));
+  }
+  regAddr = NULL;
+  goto exit;
+}
+
+// Proxy-side handler for a deregister request: releases the local mapping described by the ncclIpcImpInfo request,
+// using the release call that matches how it was imported (legacy CUDA IPC, same-process cuMem mapping, or cross-process cuMem import).
+static ncclResult_t p2pProxyDeregister(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done) {
+  struct ncclIpcImpInfo* impInfo = (struct ncclIpcImpInfo*)reqBuff;
+  ncclResult_t ret = ncclSuccess;
+  assert(sizeof(struct ncclIpcImpInfo) == reqSize);
+  // rmtRegAddr points `offset` bytes past the address this proxy mapped; step back to that address.
+  void* mapBase = (void*)((uintptr_t)impInfo->rmtRegAddr - impInfo->offset);
+
+  if (impInfo->legacyIpcCap) {
+    CUDACHECKGOTO(cudaIpcCloseMemHandle(mapBase), ret, exit);
+  } else if (connection->sameProcess) {
+    NCCLCHECKGOTO(ncclCuMemFreeAddr(mapBase), ret, exit);
+  } else {
+    NCCLCHECKGOTO(ncclCudaFree(mapBase), ret, exit);
+  }
+
+exit:
+  *done = 1;
+  return ret;
+}
+
 struct ncclTransport p2pTransport = {
   "P2P",
   p2pCanConnect,
-  { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pSendProxySetup, NULL, p2pSendProxyFree, NULL, NULL },
-  { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pRecvProxySetup, NULL, p2pRecvProxyFree, NULL, NULL }
+  { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pSendProxySetup, NULL, p2pSendProxyFree, NULL, p2pProxyRegister, p2pProxyDeregister },
+  { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pRecvProxySetup, NULL, p2pRecvProxyFree, NULL, p2pProxyRegister, p2pProxyDeregister }
 };
 
 static void initCeOperation() {
diff --git a/src/transport/shm.cc b/src/transport/shm.cc
index 7fc6251b63..9be95fd803 100644
--- a/src/transport/shm.cc
+++ b/src/transport/shm.cc
@@ -5,35 +5,58 @@
  ************************************************************************/
 
 #include "comm.h"
+#include "shmutils.h"
 #include "shm.h"
 #include "transport.h"
 
-struct shmConnectInfo {
-  char shmName[7];
-  int shmSize;
+#define SHM_PATH_MAX 128
+#define SHM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
+
+struct shmBuffInfo {
+  void *hptr;
+  void *dptr;
+};
+
+struct shmConnectInfo {
+  ncclShmIpcDesc_t desc;
+  struct shmBuffInfo buf;
 };
-static_assert(sizeof(shmConnectInfo) <= CONNECT_SIZE, "SHM Connect info is too large");
 
 struct shmSendResources {
-  int remShmSize;
   struct ncclRecvMem* remHostMem;
   struct ncclRecvMem* devRemHostMem;
-  ncclShmHandle_t remHandle;
-  int shmSize;
+  ncclShmIpcDesc_t remDesc;
   struct ncclSendMem* hostMem;
   struct ncclSendMem* devHostMem;
-  ncclShmHandle_t hostHandle;
 };
 
 struct shmRecvResources {
-  int remShmSize;
   struct ncclSendMem* remHostMem;
   struct ncclSendMem* devRemHostMem;
-  ncclShmHandle_t remHandle;
-  int shmSize;
+  ncclShmIpcDesc_t remDesc;
   struct ncclRecvMem* hostMem;
   struct ncclRecvMem* devHostMem;
-  ncclShmHandle_t hostHandle;
+};
+
+struct shmProxyInfo {
+  struct ncclRecvMem* ceRecvMem;
+  char* devFifo;
+  char* shmFifo;
+  struct ncclSendMem* sendMem;
+  struct ncclRecvMem* recvMem;
+
+  // used by progress only
+  uint64_t step;
+  cudaStream_t stream;
+  cudaEvent_t events[NCCL_STEPS];
+
+  // ipc desc
+  ncclShmIpcDesc_t desc;
+};
+
+struct shmRequest {
+  size_t size;
+  bool legacy;
 };
 
 #define SHM_SEND_SIDE 1
@@ -48,14 +71,14 @@ static int shmLocality = 0;
 static void initCeOperation();
 
 /* Determine two peers can communicate with SHM */
-static ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+static ncclResult_t shmCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
   *ret = 0;
   initCeOperation();
 
   if (ncclParamShmDisable() == 1) return ncclSuccess;
 
   int useNet = 0;
-  NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet));
+  NCCLCHECK(ncclTopoCheckNet(comm->topo, info1->rank, info2->rank, &useNet));
   if (useNet) return ncclSuccess;
 
   // Same host?
@@ -76,22 +99,29 @@ static ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct
 /* Create and return connect structures for this peer to connect to me */
 static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
   struct shmSendResources* resources;
+  struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
+  size_t shmSize = sizeof(struct ncclSendMem);
+  struct shmRequest req;
+
+  static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big");
+
   NCCLCHECK(ncclCalloc(&resources, 1));
   send->transportResources = resources;
 
-  static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big");
-  struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
-
-  char shmPath[PATH_MAX];
-  shmPath[0] = '\0';
-  int shmSize = sizeof(struct ncclSendMem);
   if (shmLocality == SHM_SEND_SIDE) {
     for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += comm->buffSizes[p];
   }
-  info->shmSize = resources->shmSize = shmSize;
-  NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle));
-  TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
-  memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));
+  req.size = shmSize;
+  if (myInfo->hostHash == peerInfo->hostHash && myInfo->pidHash == peerInfo->pidHash)
+    req.legacy = true;
+  else
+    req.legacy = false;
+
+  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, myInfo->rank, &send->proxyConn));
+  NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, (void*)&req, sizeof(struct shmRequest), (void*)info, sizeof(struct shmConnectInfo)));
+
+  resources->hostMem = (struct ncclSendMem*)info->buf.hptr;
+  resources->devHostMem = (struct ncclSendMem*)info->buf.dptr;
 
   INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%d] -> %d[%d] via SHM/%s/%s", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, useMemcpySend?"CE":"direct", useMemcpyRecv?"CE":"direct");
   return ncclSuccess;
@@ -99,52 +129,43 @@ static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* gr
 
 static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
   struct shmRecvResources* resources;
+  struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
+  size_t shmSize = sizeof(struct ncclRecvMem);
+  struct shmRequest req;
+
   NCCLCHECK(ncclCalloc(&resources, 1));
   recv->transportResources = resources;
 
   static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big");
-  struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
 
-  char shmPath[PATH_MAX];
-  shmPath[0] = '\0';
-  int shmSize = sizeof(struct ncclRecvMem);
   if (shmLocality == SHM_RECV_SIDE) {
     for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += comm->buffSizes[p];
   }
-  info->shmSize = resources->shmSize = shmSize;
-  NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle));
-  TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
-  memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));
+  req.size = shmSize;
+  if (myInfo->hostHash == peerInfo->hostHash && myInfo->pidHash == peerInfo->pidHash)
+    req.legacy = true;
+  else
+    req.legacy = false;
+
+  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, myInfo->rank, &recv->proxyConn));
+  NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, (void*)&req, sizeof(struct shmRequest), (void*)info, sizeof(struct shmConnectInfo)));
+
+  resources->hostMem = (struct ncclRecvMem*)info->buf.hptr;
+  resources->devHostMem = (struct ncclRecvMem*)info->buf.dptr;
 
   return ncclSuccess;
 }
 
-struct shmProxyInfo {
-  struct ncclRecvMem* ceRecvMem;
-  char* devFifo;
-  char* shmFifo;
-  struct ncclSendMem* sendMem;
-  struct ncclRecvMem* recvMem;
-
-  // used by progress only
-  uint64_t step;
-  cudaStream_t stream;
-  cudaEvent_t events[NCCL_STEPS];
-};
-
 /* Connect to this peer */
 static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
   // Setup device pointers
   struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
   struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
+  char* buff;
 
-  char shmPath[PATH_MAX];
-  sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
-  resources->remShmSize = info->shmSize;
-  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
-  NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, -1, &resources->remHandle));
+  NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc));
 
-  char* buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
+  buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem + 1) : (char*)(resources->devRemHostMem + 1);
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
     send->conn.buffs[p] = buff;
     buff += comm->buffSizes[p];
@@ -157,9 +178,6 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
     send->conn.connFifo = resources->devRemHostMem->connFifo;
   }
   if (useMemcpySend) {
-    int tpProxyRank;
-    tpProxyRank = comm->topParentRanks[comm->rank];
-    NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, tpProxyRank, &send->proxyConn));
     struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem };
     NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
     send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
@@ -177,14 +195,11 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
   // Setup device pointers
   struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
   struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
+  char* buff;
 
-  char shmPath[PATH_MAX];
-  sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
-  resources->remShmSize = info->shmSize;
-  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
-  NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, -1, &resources->remHandle));
+  NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc));
 
-  char* buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
+  buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem + 1) : (char*)(resources->devRemHostMem + 1);
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
     recv->conn.buffs[p] = buff;
     buff += comm->buffSizes[p];
@@ -194,7 +209,6 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
   recv->conn.stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
 
   if (useMemcpyRecv) {
-    NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->rank, &recv->proxyConn));
     struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem };
     NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
     recv->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
@@ -210,8 +224,7 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
 static ncclResult_t shmSendFree(struct ncclConnector* send) {
   struct shmRecvResources* resources = (struct shmRecvResources*)send->transportResources;
   if (resources) {
-    NCCLCHECK(ncclShmClose(resources->hostHandle));
-    NCCLCHECK(ncclShmClose(resources->remHandle));
+    NCCLCHECK(ncclShmIpcClose(&resources->remDesc));
     free(resources);
     send->transportResources = NULL;
   }
@@ -221,8 +234,7 @@ static ncclResult_t shmSendFree(struct ncclConnector* send) {
 static ncclResult_t shmRecvFree(struct ncclConnector* recv) {
   struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
   if (resources) {
-    NCCLCHECK(ncclShmClose(resources->hostHandle));
-    NCCLCHECK(ncclShmClose(resources->remHandle));
+    NCCLCHECK(ncclShmIpcClose(&resources->remDesc));
     free(resources);
     recv->transportResources = NULL;
   }
@@ -230,51 +242,76 @@ static ncclResult_t shmRecvFree(struct ncclConnector* recv) {
 }
 
 static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+  ncclResult_t ret = ncclSuccess;
+  if (reqSize != sizeof(struct shmProxyInfo) || respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
   struct shmProxyInfo* proxyInfo;
-  NCCLCHECK(ncclCalloc(&proxyInfo, 1));
-  if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
-  memcpy(proxyInfo, reqBuff, reqSize);
-  NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE]));
-  NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
-  CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking));
+  struct shmProxyInfo* reqInfo = (struct shmProxyInfo*)reqBuff;
+
+  proxyInfo = (struct shmProxyInfo*)connection->transportResources;
+  proxyInfo->shmFifo = reqInfo->shmFifo;
+  proxyInfo->sendMem = reqInfo->sendMem;
+  proxyInfo->recvMem = reqInfo->recvMem;
+  NCCLCHECKGOTO(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE]), ret, fail);
+  NCCLCHECKGOTO(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1), ret, fail);
+  CUDACHECKGOTO(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking), ret, fail);
   for (int i=0; i<NCCL_STEPS; i++) {
-    CUDACHECK(cudaEventCreate(proxyInfo->events+i));
+    CUDACHECKGOTO(cudaEventCreate(proxyInfo->events+i), ret, fail);
   }
   connection->proxyAppendPtr = &connection->proxyAppend;
   connection->transportResources = proxyInfo;
   if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
   memcpy(respBuff, proxyInfo, respSize);
-  return ncclSuccess;
+  *done = 1;
+exit:
+  return ret;
+fail:
+  if (proxyInfo->ceRecvMem) ncclCudaHostFree(proxyInfo->ceRecvMem);
+  if (proxyInfo->devFifo) (void)ncclCudaFree(proxyInfo->devFifo);
+  free(proxyInfo);
+  goto exit;
 }
 
 static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+  ncclResult_t ret = ncclSuccess;
+  if (reqSize != sizeof(struct shmProxyInfo) || respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
   struct shmProxyInfo* proxyInfo;
-  NCCLCHECK(ncclCalloc(&proxyInfo, 1));
-  if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
-  memcpy(proxyInfo, reqBuff, reqSize);
-  NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE]));
-  NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
-  CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking));
+  struct shmProxyInfo* reqInfo = (struct shmProxyInfo*)reqBuff;
+
+  proxyInfo = (struct shmProxyInfo*)connection->transportResources;
+  proxyInfo->shmFifo = reqInfo->shmFifo;
+  proxyInfo->sendMem = reqInfo->sendMem;
+  proxyInfo->recvMem = reqInfo->recvMem;
+  NCCLCHECKGOTO(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE]), ret, fail);
+  NCCLCHECKGOTO(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1), ret, fail);
+  CUDACHECKGOTO(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking), ret, fail);
   for (int i=0; i<NCCL_STEPS; i++) {
-    CUDACHECK(cudaEventCreate(proxyInfo->events+i));
+    CUDACHECKGOTO(cudaEventCreate(proxyInfo->events+i), ret, fail);
   }
   connection->proxyAppendPtr = &connection->proxyAppend;
-  connection->transportResources = proxyInfo;
-  if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
   memcpy(respBuff, proxyInfo, respSize);
-  return ncclSuccess;
+  *done = 1;
+exit:
+  return ret;
+fail:
+  if (proxyInfo->ceRecvMem) ncclCudaHostFree(proxyInfo->ceRecvMem);
+  if (proxyInfo->devFifo) (void)ncclCudaFree(proxyInfo->devFifo);
+  free(proxyInfo);
+  goto exit;
 }
 
 static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
   struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
 
   if (resources) {
-    CUDACHECK(cudaStreamDestroy(resources->stream));
-    NCCLCHECK(ncclCudaFree(resources->devFifo));
-    NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
-    for (int i=0; i<NCCL_STEPS; i++) {
-      CUDACHECK(cudaEventDestroy(resources->events[i]));
+    if (useMemcpySend) {
+      CUDACHECK(cudaStreamDestroy(resources->stream));
+      NCCLCHECK(ncclCudaFree(resources->devFifo));
+      NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
+      for (int i=0; i<NCCL_STEPS; i++) {
+        CUDACHECK(cudaEventDestroy(resources->events[i]));
+      }
     }
+    NCCLCHECK(ncclShmIpcClose(&resources->desc));
     free(connection->transportResources);
     connection->transportResources = NULL;
   }
@@ -285,12 +322,15 @@ static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, str
   struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
 
   if (resources) {
-    CUDACHECK(cudaStreamDestroy(resources->stream));
-    NCCLCHECK(ncclCudaFree(resources->devFifo));
-    NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
-    for (int i=0; i<NCCL_STEPS; i++) {
-      CUDACHECK(cudaEventDestroy(resources->events[i]));
+    if (useMemcpyRecv) {
+      CUDACHECK(cudaStreamDestroy(resources->stream));
+      NCCLCHECK(ncclCudaFree(resources->devFifo));
+      NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
+      for (int i=0; i<NCCL_STEPS; i++) {
+        CUDACHECK(cudaEventDestroy(resources->events[i]));
+      }
     }
+    NCCLCHECK(ncclShmIpcClose(&resources->desc));
     free(connection->transportResources);
     connection->transportResources = NULL;
   }
@@ -413,12 +453,37 @@ static ncclResult_t shmRecvProxyProgress(struct ncclProxyState* proxyState, stru
   return ncclSuccess;
 }
 
-struct ncclTransport shmTransport = {
-  "SHM",
-  shmCanConnect,
-  { shmSendSetup, shmSendConnect, shmSendFree, NULL, NULL, NULL, NULL, NULL, NULL },
-  { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, NULL, NULL, NULL, NULL, NULL }
-};
+static ncclResult_t shmSendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+  struct shmRequest* req = (struct shmRequest*)reqBuff;
+  /* check message size */
+  if (reqSize != sizeof(struct shmRequest)) return ncclInternalError;
+  if (respSize != sizeof(struct shmConnectInfo)) return ncclInternalError;
+
+  struct shmConnectInfo* info = (struct shmConnectInfo*)respBuff;
+  struct shmProxyInfo* proxyInfo;
+
+  NCCLCHECK(ncclCalloc(&proxyInfo, 1));
+  NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr));
+  memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t));
+  connection->transportResources = proxyInfo;
+  return ncclSuccess;
+}
+
+static ncclResult_t shmRecvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+  struct shmRequest* req = (struct shmRequest*)reqBuff;
+  /* check message size */
+  if (reqSize != sizeof(struct shmRequest)) return ncclInternalError;
+  if (respSize != sizeof(struct shmConnectInfo)) return ncclInternalError;
+
+  struct shmConnectInfo* info = (struct shmConnectInfo*)respBuff;
+  struct shmProxyInfo* proxyInfo;
+
+  NCCLCHECK(ncclCalloc(&proxyInfo, 1));
+  NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr));
+  memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t));
+  connection->transportResources = proxyInfo;
+  return ncclSuccess;
+}
 
 static void initCeOperation() {
   static int init = 0;
@@ -427,12 +492,10 @@ static void initCeOperation() {
     useMemcpyRecv = ncclParamShmUseCudaMemcpy() && (ncclParamShmMemcpyMode() & 2);
     if (useMemcpySend) {
       shmTransport.send.proxyConnect = shmSendProxyConnect;
-      shmTransport.send.proxyFree = shmSendProxyFree;
       shmTransport.send.proxyProgress = shmSendProxyProgress;
     }
     if (useMemcpyRecv) {
       shmTransport.recv.proxyConnect = shmRecvProxyConnect;
-      shmTransport.recv.proxyFree = shmRecvProxyFree;
       shmTransport.recv.proxyProgress = shmRecvProxyProgress;
     }
     shmLocality = ncclParamShmLocality();
@@ -443,3 +506,152 @@ static void initCeOperation() {
     init = 1;
   }
 }
+
+ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *desc, void **hptr, void **dptr) {
+  if (desc == NULL || hptr == NULL || tpProxyRank < -1) {
+    WARN("Invalid argument desc %p, hptr %p, tpProxyRank %d", desc, hptr, tpProxyRank);
+    return ncclInvalidArgument;
+  }
+#if CUDART_VERSION >= 12020
+  if (ncclCuMemEnable() && ncclCuMemHostEnable() && !legacy) {
+    // cuMem API support
+    CUmemAllocationHandleType type = SHM_HANDLE_TYPE;
+    CUmemGenericAllocationHandle handle;
+
+    NCCLCHECK(ncclCuMemHostAlloc(hptr, &handle, size));
+    if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
+      // Return the native cuMem handle for later Export/Import via UDS
+      memcpy(&desc->shmci.data, &handle, sizeof(handle));
+      desc->shmci.tpProxyRank = tpProxyRank;
+    } else {
+      CUCHECK(cuMemExportToShareableHandle(&desc->shmci.handle, handle, type, 0));
+    }
+    desc->shmci.size = size;
+    desc->shmci.ptr = *hptr;
+    if (dptr) *dptr = *hptr;
+    desc->legacy = false;
+    INFO(NCCL_SHM, "CUMEM allocated shareable buffer %p size %zi", desc->shmci.ptr, desc->shmci.size);
+  } else {
+    char shmPath[SHM_PATH_MAX] = { '\0' };
+    desc->shmli.shmSize = size;
+    NCCLCHECK(ncclShmOpen(shmPath, size, hptr, dptr, 1, &desc->shmli.handle));
+    memcpy(desc->shmli.shmSuffix, shmPath + sizeof("/dev/shm/nccl-") - 1, sizeof(desc->shmli.shmSuffix));
+    desc->legacy = true;
+    INFO(NCCL_SHM, "MMAP allocated shareable host buffer %s size %zi ptr %p", shmPath, desc->shmli.shmSize, *hptr);
+  }
+#else /* CUDART_VERSION >= 12020 */
+  char shmPath[SHM_PATH_MAX] = { '\0' };
+  desc->shmli.shmSize = size;
+  NCCLCHECK(ncclShmOpen(shmPath, size, hptr, dptr, 1, &desc->shmli.handle));
+  memcpy(desc->shmli.shmSuffix, shmPath + sizeof("/dev/shm/nccl-") - 1, sizeof(desc->shmli.shmSuffix));
+  desc->legacy = true;
+  INFO(NCCL_SHM, "MMAP allocated shareable host buffer %s size %zi ptr %p", shmPath, size, *hptr);
+#endif /* CUDART_VERSION >= 12020 */
+  return ncclSuccess;
+}
+
+ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut) {
+  if (comm == NULL || desc == NULL || hptr == NULL || descOut == NULL) {
+    WARN("Invalid argument comm %p, desc %p, hptr %p, descOut %p", comm, desc, hptr, descOut);
+    return ncclInvalidArgument;
+  }
+#if CUDART_VERSION >= 12020
+  if (ncclCuMemEnable() && ncclCuMemHostEnable() && !desc->legacy) {
+    // cuMem API support
+    CUdeviceptr hostptr = 0;
+    CUmemAllocationHandleType type = SHM_HANDLE_TYPE;
+    CUmemGenericAllocationHandle handle;
+    int cudaDev;
+    CUdevice currentDev;
+    CUmemAccessDesc accessDesc = {};
+    int cpuNumaNodeId;
+    size_t granularity;
+    size_t size = desc->shmci.size;
+    CUmemAllocationProp prop = {};
+
+    // Import and map the remote memory descriptor to the local GPU
+    if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
+      // UDS fd support
+      int fd = -1;
+      // Send cuMem handle to remote for conversion to an fd
+      NCCLCHECK(ncclProxyClientGetFdBlocking(comm, desc->shmci.tpProxyRank, &desc->shmci.data, &fd));
+      CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)fd, type));
+      (void) close(fd);
+    } else {
+      CUCHECK(cuMemImportFromShareableHandle(&handle, &desc->shmci.handle, type));
+    }
+
+    // Get the host (CPU) NUMA node id of the current CUDA device
+    CUDACHECK(cudaGetDevice(&cudaDev));
+    CUCHECK(cuDeviceGet(&currentDev, cudaDev));
+    CUCHECK(cuDeviceGetAttribute(&cpuNumaNodeId, CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID, currentDev));
+    if (cpuNumaNodeId < 0) cpuNumaNodeId = 0;
+
+    // Get granularity
+    prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
+    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+    prop.requestedHandleTypes = type;
+    prop.location.id = cpuNumaNodeId;
+    CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+
+    ALIGN_SIZE(size, granularity);
+
+    // Reserve and map address
+    CUCHECK(cuMemAddressReserve(&hostptr, size, /* alignment */ 0, /* addr */ 0, /* flags */ 0));
+    CUCHECK(cuMemMap(hostptr, size, /* offset */ 0, handle, /* flags */ 0));
+
+    // Allow access by the local GPU
+    accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    accessDesc.location.id = cudaDev;
+    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+    CUCHECK(cuMemSetAccess(hostptr, size, &accessDesc, 1));
+
+    // Allow access by the local host NUMA node
+    accessDesc.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
+    accessDesc.location.id = cpuNumaNodeId;
+    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+    CUCHECK(cuMemSetAccess(hostptr, size, &accessDesc, 1));
+
+    descOut->shmci.ptr = *hptr = (void *)hostptr;
+    descOut->legacy = false;
+    if (dptr) *dptr = (void *)hostptr;
+    INFO(NCCL_SHM, "CUMEM imported shareable host buffer from tpProxyRank %d size %zi ptr %p, granularity %ld", desc->shmci.tpProxyRank, desc->shmci.size, descOut->shmci.ptr, granularity);
+  } else {
+    char shmPath[SHM_PATH_MAX];
+    sprintf(shmPath, "/dev/shm/nccl-%s", desc->shmli.shmSuffix);
+    NCCLCHECK(ncclShmOpen(shmPath, desc->shmli.shmSize, hptr, dptr, -1, &descOut->shmli.handle));
+    descOut->legacy = true;
+    INFO(NCCL_SHM, "MMAP imported shareable host buffer %s size %zi ptr %p", shmPath, desc->shmli.shmSize, *hptr);
+  }
+#else /* CUDART_VERSION >= 12020 */
+  char shmPath[SHM_PATH_MAX];
+  sprintf(shmPath, "/dev/shm/nccl-%s", desc->shmli.shmSuffix);
+  NCCLCHECK(ncclShmOpen(shmPath, desc->shmli.shmSize, hptr, dptr, -1, &descOut->shmli.handle));
+  descOut->legacy = true;
+  INFO(NCCL_SHM, "MMAP imported shareable host buffer %s size %zi ptr %p", shmPath, desc->shmli.shmSize, *hptr);
+#endif
+  return ncclSuccess;
+}
+
+ncclResult_t ncclShmIpcClose(ncclShmIpcDesc_t *desc) {
+  if (desc) {
+#if CUDART_VERSION >= 12020
+    if (ncclCuMemEnable() && ncclCuMemHostEnable() && !desc->legacy) {
+      NCCLCHECK(ncclCuMemHostFree(desc->shmci.ptr));
+    } else {
+      NCCLCHECK(ncclShmClose(desc->shmli.handle));
+    }
+#else
+    NCCLCHECK(ncclShmClose(desc->shmli.handle));
+#endif
+  }
+
+  return ncclSuccess;
+}
+
+struct ncclTransport shmTransport = {
+  "SHM",
+  shmCanConnect,
+  { shmSendSetup, shmSendConnect, shmSendFree, NULL, shmSendProxySetup, NULL, shmSendProxyFree, NULL },
+  { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, shmRecvProxySetup, NULL, shmRecvProxyFree, NULL }
+};

From 1a16f427507cb985123e7565874f57b77a1ba2df Mon Sep 17 00:00:00 2001
From: Kamil Iskra <kiskra@nvidia.com>
Date: Mon, 16 Sep 2024 14:47:37 -0700
Subject: [PATCH 2/4] Add missing header files

---
 src/include/nccl_profiler.h | 150 ++++++++++++++++++++++++++++++++++++
 src/include/shmutils.h      |  26 +++++++
 2 files changed, 176 insertions(+)
 create mode 100644 src/include/nccl_profiler.h
 create mode 100644 src/include/shmutils.h

diff --git a/src/include/nccl_profiler.h b/src/include/nccl_profiler.h
new file mode 100644
index 0000000000..556a0f6e45
--- /dev/null
+++ b/src/include/nccl_profiler.h
@@ -0,0 +1,150 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_PROFILER_H_
+#define NCCL_PROFILER_H_
+
+#include <cstdint>
+
+enum {
+  ncclProfileGroup     = (1 << 0),  // group event type
+  ncclProfileColl      = (1 << 1),  // host collective call event type
+  ncclProfileP2p       = (1 << 2),  // host point-to-point call event type
+  ncclProfileProxyOp   = (1 << 3),  // proxy operation event type
+  ncclProfileProxyStep = (1 << 4),  // proxy step event type
+  ncclProfileProxyCtrl = (1 << 5),  // proxy control event type
+  ncclProfileNumEvents = (     6),
+};
+
+typedef struct {
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for a coll event, this is the group)
+  int rank;                     // originating rank
+  union {
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      uint8_t func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      uint8_t datatype;
+      uint32_t op;
+      size_t trafficBytes;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      uint8_t algo;
+      uint8_t proto;
+      int isCollnet;
+      int isNvls;
+    } coll;
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint8_t func;
+      void* buff;
+      uint8_t datatype;
+      size_t count;
+      int peer;
+    } p2p;
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+  };
+} ncclProfilerEventDescr_v1_t;
+
+typedef enum {
+  ncclProfilerProxyOpSendPosted,
+  ncclProfilerProxyOpSendRemFifoWait,
+  ncclProfilerProxyOpSendTransmitted,
+  ncclProfilerProxyOpSendDone,
+  ncclProfilerProxyOpRecvPosted,
+  ncclProfilerProxyOpRecvReceived,
+  ncclProfilerProxyOpRecvTransmitted,
+  ncclProfilerProxyOpRecvDone,
+
+  /* Legacy proxy profiler states */
+  ncclProfilerProxyStepSendGPUWait,
+  ncclProfilerProxyStepSendWait,
+  ncclProfilerProxyStepRecvWait,
+  ncclProfilerProxyStepRecvFlushWait,
+  ncclProfilerProxyStepRecvGPUWait,
+
+  /* Legacy proxy control states */
+  ncclProfilerProxyCtrlIdle,
+  ncclProfilerProxyCtrlActive,
+  ncclProfilerProxyCtrlSleep,
+  ncclProfilerProxyCtrlWakeup,
+  ncclProfilerProxyCtrlAppend,
+  ncclProfilerProxyCtrlAppendEnd,
+} ncclProfilerEventState_v1_t;
+
+typedef union {
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v1_t;
+
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the event set
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v1_t;
+
+typedef ncclProfilerEventDescr_v1_t ncclProfilerEventDescr_t;
+typedef ncclProfilerEventState_v1_t ncclProfilerEventState_t;
+typedef ncclProfilerEventStateArgs_v1_t ncclProfilerEventStateArgs_t;
+typedef ncclProfiler_v1_t ncclProfiler_t;
+
+#endif
diff --git a/src/include/shmutils.h b/src/include/shmutils.h
new file mode 100644
index 0000000000..43e8afb79a
--- /dev/null
+++ b/src/include/shmutils.h
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_SHMUTILS_H_
+#define NCCL_SHMUTILS_H_
+
+#include "nccl.h"
+
+typedef void* ncclShmHandle_t;
+ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle);
+ncclResult_t ncclShmClose(ncclShmHandle_t handle);
+ncclResult_t ncclShmUnlink(ncclShmHandle_t handle);
+
+struct ncclShmemCollBuff {
+  volatile size_t *cnt[2];
+  volatile void *ptr[2];
+  int round;
+  size_t maxTypeSize;
+};
+
+ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize);
+
+#endif

From 187511909594737eb45eef37ad6010edad010534 Mon Sep 17 00:00:00 2001
From: WoosungMyung <dntjd517@naver.com>
Date: Sat, 4 Jan 2025 08:47:46 +0900
Subject: [PATCH 3/4] Explanation of Function change

---
 src/bootstrap.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/bootstrap.cc b/src/bootstrap.cc
index c1d085e4ce..e49514b90b 100644
--- a/src/bootstrap.cc
+++ b/src/bootstrap.cc
@@ -70,7 +70,7 @@ static int localIdFromRoot(int rank, int root, int nRanks, int nRoots) {
   int ir = BOOTSTRAP_PID(root, nRoots);
   return rank - firstRankFromRoot(ir, nRanks, nRoots);
 }
-// return the number of child for a root, root will be periodized
+// Check if the given rank is the first rank from the root
 static int isFirstFromRoot(int rank, int root, int nRanks, int nRoots) {
   return (rank == firstRankFromRoot(root, nRanks, nRoots));
 }

From 6aae37927840b2bd7b7d42d2f0050e75f88ee97f Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey <sjeaugey@nvidia.com>
Date: Wed, 18 Dec 2024 08:26:06 -0800
Subject: [PATCH 4/4] 2.24.3-1

Network user buffer support for collectives
 * Leverage user buffer registration to achieve zero-copy
   inter-node communications for Ring, NVLS and Collnet

Add RAS subsystem
 * Create a RAS thread keeping track of all NCCL communicators.
 * Add a ncclras tool contacting the RAS thread and getting a
   report.

Add fp8 support
 * Add support for e5m2 and e4m3 8-bit floating point operations.
 * Use Tree/PAT algorithms when possible for better numerical
   stability.

Add NIC fusion
 * Add a NET API to ask the network plugin to fuse a set of
   interfaces together.
 * Fuse multiple NICs under the same PCI switch as a single,
   larger NIC.

Socket connection failure retry
 * Retry in case of socket connection failure (unreachable host)
 * Avoid "Software caused connection abort" errors on retries

QP connection failure retry
 * Retry in case of IB QP connection failure during ibv_modify_qp.

NET API improvements
 * Allow plugins to force a flush in case data and completion
   ordering is not guaranteed.
 * Indicate when completion is not needed (e.g. for the LL128
   protocol), allowing plugins to skip generating a completion.
 * Allow for full offload of allgather operations when using one
   GPU per node.

NCCL_ALGO/NCCL_PROTO strict enforcement
 * Extend NCCL_ALGO/NCCL_PROTO syntax to be able to specify
   ALGO/PROTO filters for each collective operation.
 * Strictly enforce the ALGO/PROTO filters, no longer fall back
   on the ring algorithm when the filtering leaves no option and
   error out instead.

Enable CUMEM host allocations
 * Use cumem functions for host memory allocation by default.

Improved profiler plugin API
 * Avoid dependencies with NCCL includes.
 * Add information on whether the buffer is registered or not

Adjust PAT tuning
 * Improve transition between PAT and ring at scale.

Fix hangs when running with different CPU architectures
 * Detect when we use a mix of GPU architectures
 * Ensure Algo/Proto decisions are made based on that unified
   state.

Fix FD leak in UDS
 * Fix a leak when mapping buffers intra-node with cumem IPCs.

Fix crash when mixing buffer registration and graph buffer registration.
 * Separate local and graph registration to avoid crashes when we free
   buffers.

Fix user buffer registration with dmabuf
 * Make ncclSend/ncclRecv communication with buffer registration functional
   on network plugins relying on dmabuf for buffer registration.

Fix crash in IB code caused by uninitialized fields.

Fix non-blocking ncclSend/ncclRecv
 * Fix case where ncclSend/ncclRecv would return ncclSuccess in non-blocking
   mode even though the operation was not enqueued onto the stream.
 * Issue #1495

Various compiler tweaks and fixes
 * PR #758

Fix typo in ncclTopoPrintGraph
 * Issue #1468
---
 ext-net/README.md                       |   61 +-
 ext-net/example/nccl/net.h              |    3 +
 ext-net/example/nccl/net_device.h       |    3 +-
 ext-net/example/nccl/net_v8.h           |    2 -
 ext-net/example/nccl/net_v9.h           |   99 ++
 ext-net/example/plugin.c                |  102 +-
 ext-profiler/example/event.h            |   21 +-
 ext-profiler/example/nccl/profiler.h    |    1 +
 ext-profiler/example/nccl/profiler_v1.h |   53 +-
 ext-profiler/example/nccl/profiler_v2.h |  146 ++
 ext-profiler/example/plugin.c           |  114 +-
 ext-profiler/example/print_event.c      |   72 +-
 ext-tuner/example/nccl/tuner.h          |    9 +-
 ext-tuner/example/plugin.c              |    9 +-
 makefiles/common.mk                     |    5 +
 makefiles/version.mk                    |    4 +-
 src/Makefile                            |   30 +-
 src/bootstrap.cc                        |  103 +-
 src/collectives.cc                      |   36 +-
 src/debug.cc                            |   16 +
 src/device/all_gather.h                 |  147 +-
 src/device/all_reduce.h                 |  167 +--
 src/device/broadcast.h                  |   52 +-
 src/device/common.h                     |    3 +
 src/device/common_kernel.h              |   19 +-
 src/device/generate.py                  |   35 +-
 src/device/network/unpack/unpack.h      |    4 +
 src/device/onerank.cu                   |    4 +
 src/device/primitives.h                 |    7 +-
 src/device/prims_ll.h                   |    6 +-
 src/device/prims_ll128.h                |    6 +-
 src/device/prims_simple.h               |  238 +--
 src/device/reduce_kernel.h              |  171 ++-
 src/device/reduce_scatter.h             |   55 +-
 src/device/sendrecv.h                   |   18 +-
 src/enqueue.cc                          |  818 +++++------
 src/graph/paths.cc                      |   71 +-
 src/graph/search.cc                     |    2 +-
 src/graph/topo.cc                       |  600 +++++++-
 src/graph/topo.h                        |    4 +
 src/graph/tuning.cc                     |  246 ++--
 src/graph/xml.cc                        |   16 +-
 src/graph/xml.h                         |   27 +-
 src/group.cc                            |   22 +-
 src/include/collectives.h               |  321 ++++-
 src/include/comm.h                      |   22 +-
 src/include/debug.h                     |    2 +
 src/include/device.h                    |   30 +-
 src/include/enqueue.h                   |   11 +
 src/include/graph.h                     |   11 +-
 src/include/ibvwrap.h                   |   12 +
 src/include/nccl_common.h               |    1 +
 src/include/nccl_net.h                  |  168 ++-
 src/include/nccl_profiler.h             |  169 ++-
 src/include/nccl_tuner.h                |   53 +-
 src/include/net_device.h                |    3 +-
 src/include/nvmlwrap.h                  |    2 +-
 src/include/profiler.h                  |    8 +-
 src/include/proxy.h                     |   32 +-
 src/include/ras.h                       |   24 +
 src/include/register.h                  |   21 +-
 src/include/shmutils.h                  |    2 +-
 src/include/socket.h                    |   26 +-
 src/include/transport.h                 |   18 +-
 src/include/utils.h                     |    3 +-
 src/init.cc                             |  142 +-
 src/misc/cudawrap.cc                    |   37 +-
 src/misc/ibvwrap.cc                     |   94 +-
 src/misc/ipcsocket.cc                   |   16 +-
 src/misc/nvmlwrap.cc                    |    8 +-
 src/misc/profiler.cc                    |  220 ++-
 src/misc/shmutils.cc                    |    6 +-
 src/misc/socket.cc                      |  297 ++--
 src/misc/tuner.cc                       |   57 +-
 src/nccl.h.in                           |   15 +-
 src/net.cc                              |  681 ++++++---
 src/proxy.cc                            |   95 +-
 src/ras/client.cc                       |  318 ++++
 src/ras/client_support.cc               | 1755 +++++++++++++++++++++++
 src/ras/collectives.cc                  |  762 ++++++++++
 src/ras/peers.cc                        |  960 +++++++++++++
 src/ras/ras.cc                          |  668 +++++++++
 src/ras/ras_internal.h                  |  512 +++++++
 src/ras/rasnet.cc                       | 1189 +++++++++++++++
 src/register.cc                         |  204 ---
 src/register/coll_reg.cc                |  446 ++++++
 src/register/register.cc                |  179 +++
 src/register/sendrecv_reg.cc            |   35 +
 src/transport.cc                        |   28 +-
 src/transport/coll_net.cc               |  633 +++++---
 src/transport/generic.cc                |   22 +-
 src/transport/net.cc                    |  468 ++++--
 src/transport/net_ib.cc                 |  554 ++++---
 src/transport/net_socket.cc             |   17 +-
 src/transport/nvls.cc                   |  291 ++--
 src/transport/p2p.cc                    |  410 +++---
 src/transport/shm.cc                    |   30 +-
 97 files changed, 12588 insertions(+), 3127 deletions(-)
 create mode 100644 ext-net/example/nccl/net_v9.h
 create mode 100644 ext-profiler/example/nccl/profiler_v2.h
 create mode 100644 src/include/ras.h
 create mode 100644 src/ras/client.cc
 create mode 100644 src/ras/client_support.cc
 create mode 100644 src/ras/collectives.cc
 create mode 100644 src/ras/peers.cc
 create mode 100644 src/ras/ras.cc
 create mode 100644 src/ras/ras_internal.h
 create mode 100644 src/ras/rasnet.cc
 delete mode 100644 src/register.cc
 create mode 100644 src/register/coll_reg.cc
 create mode 100644 src/register/register.cc
 create mode 100644 src/register/sendrecv_reg.cc

diff --git a/ext-net/README.md b/ext-net/README.md
index 781fd904a4..aa1a3945e6 100644
--- a/ext-net/README.md
+++ b/ext-net/README.md
@@ -60,9 +60,9 @@ of newer ones.
 The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions
 from old API versions. It also provides error codes in `err.h`.
 
-# API (v6)
+# API (v9)
 
-Below is the main `ncclNet_v6` struct. Each function is explained in later sections.
+Below is the main `ncclNet_v9` struct. Each function is explained in later sections.
 
 ```
 typedef struct {
@@ -73,7 +73,7 @@ typedef struct {
   // Return the number of adapters.
   ncclResult_t (*devices)(int* ndev);
   // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
   // Create a receiving object and provide a handle to connect to it. The
   // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
   // between ranks to create a connection.
@@ -82,24 +82,26 @@ typedef struct {
   // This call must not block for the connection to be established, and instead
   // should return successfully with sendComm == NULL with the expectation that
   // it will be called again until sendComm != NULL.
-  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v9_t** sendDevComm);
   // Finalize connection establishment after remote peer has called connect.
   // This call must not block for the connection to be established, and instead
   // should return successfully with recvComm == NULL with the expectation that
   // it will be called again until recvComm != NULL.
-  ncclResult_t (*accept)(void* listenComm, void** recvComm);
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v9_t** recvDevComm);
   // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
   // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
-  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
   /* DMA-BUF support */
   ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
   ncclResult_t (*deregMr)(void* comm, void* mhandle);
   // Asynchronous send to a peer.
   // May return request == NULL if the call cannot be performed (or would block)
-  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request);
   // Asynchronous recv from a peer.
   // May return request == NULL if the call cannot be performed (or would block)
-  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request);
   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
   // visible to the GPU
   ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
@@ -110,7 +112,17 @@ typedef struct {
   ncclResult_t (*closeSend)(void* sendComm);
   ncclResult_t (*closeRecv)(void* recvComm);
   ncclResult_t (*closeListen)(void* listenComm);
-} ncclNet_v6_t;
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+
+  // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
+  // what index this new vNIC exists at
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props);
+} ncclNet_v9_t;
 ```
 
 ## Error codes
@@ -136,11 +148,19 @@ not need to rely on CUDA, this should not be common.
 NCCL will call the `init` function first, then query the number of network devices with the
 `devices` function, getting each network device properties with `getProperties`.
 
+If NCCL wishes to initialize virtual devices (currently used for NIC fusion), it can call `makeVDevice`,
+specifying a list of physical devices (the original devices listed from `devices`) it wishes to
+merge together. If the plugin does not support NIC fusion, it can set `makeVDevice` to `NULL`.
+
 To establish a connection between two network devices, NCCL will first call `listen` on the
 receiving side, pass the returned handle to the sender side of the connection, and call `connect`
 with that handle. Finally, `accept` will be called on the receiving side to finalize the connection
 establishment.
 
+`connect` and `accept` can receive an optional `sendDevComm`/`recvDevComm` pointer from the caller, if
+the caller wishes to make use of device networking. This parameter may be ignored by the plugin if it
+does not support device-side networking.
+
 Once the connection is established, communication will be done using the functions `isend`,
 `irecv` and `test`. Prior to calling `isend` or `irecv`, NCCL will call the `regMr` function on
 all buffers to allow RDMA NICs to prepare buffers. `deregMr` will be used to unregister buffers.
@@ -219,6 +239,12 @@ different offset within the original buffer, with a smaller size, etc), then der
 The call to ncclCommDeregister should call the final deregMr() and effectively remove the mapping
 on the network adapter.
 
+The `forceFlush` field can request the NCCL core to call flush for all transfers. By default,
+flushes are only called when the GPU architecture or PCI topology would not guarantee correct
+PCI ordering. Plugins can set it to one if the NIC operates in a mode where e.g. the data and the
+completion paths use different PCI links and therefore need a call to flush() to guarantee
+ordering.
+
 The `speed` field indicates the speed of the network port in Mbps (10^6 bits per second). This is
 important to ensure proper optimization of flows within the node.
 
@@ -234,6 +260,17 @@ The `maxComms` field indicates the maximum number of connections we can create.
 The `maxRecvs` field indicates the maximum number for grouped receive operations (see grouped
 receive).
 
+The `netDeviceType` field indicates which type of device networking this plugin supports. The currently supported
+options are `NCCL_NET_DEVICE_HOST` and `NCCL_NET_DEVICE_UNPACK`.
+
+The `netDeviceVersion` field indicates the version of device networking this plugin supports. Currently, this must match the associated netDeviceVersion of this netDeviceType compiled into NCCL core. Net device functionality is built as a part of NCCL core's device code.
+
+The `maxP2pBytes` and `maxCollBytes` fields indicate the maximum size the plugin can handle for
+point-to-point and collective calls. This will tell the NCCL core to cut large operations into
+multiple smaller chunks if needed.
+
+`vProps` is the list of devices that have been fused into the current device. Each entry is an index pointing to the child device.
+
 ### Connection establishment
 
 Connections are used in an unidirectional manner. There is therefore a sender side and a receiver
@@ -332,6 +369,12 @@ handled by a single request handle.
 The sizes provided to `irecv` can (and will) be larger than the size of the `isend` operation.
 The contrary (receive size being lower than the send size) is an error, however.
 
+NCCL sets the request pointer in `irecv` to `NCCL_NET_OPTIONAL_RECV_COMPLETION` when it is using
+the LL or LL128 protocols. In these cases, NCCL polls on a flag embedded in the data to detect completion
+of the irecv and is resilient to redundant network writes. This allows the plugin to optimize request
+completions on such irecvs (for example, complete the request immediately). The plugin is still
+expected to set a valid request pointer on return which NCCL can poll to check for completion.
+
 Note: for a given connection, send/receive operations should always match in the order they were
 posted. Tags provided for receive operations are only used to assign a given send operation to one
 of the buffers of the first (multi-)receive in the queue, not to allow for out-of-order tag
diff --git a/ext-net/example/nccl/net.h b/ext-net/example/nccl/net.h
index 2aea8c439b..112967ab86 100644
--- a/ext-net/example/nccl/net.h
+++ b/ext-net/example/nccl/net.h
@@ -12,6 +12,8 @@
 #include "err.h"
 
 #define NCCL_NET_HANDLE_MAXSIZE 128
+#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) //1TB
+#define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1
 
 #define NCCL_PTR_HOST 0x1
 #define NCCL_PTR_CUDA 0x2
@@ -20,6 +22,7 @@
 // Maximum number of requests per comm object
 #define NCCL_NET_MAX_REQUESTS 32
 
+#include "net_v9.h"
 #include "net_v8.h"
 #include "net_v7.h"
 #include "net_v6.h"
diff --git a/ext-net/example/nccl/net_device.h b/ext-net/example/nccl/net_device.h
index b430d90646..874fb5999a 100644
--- a/ext-net/example/nccl/net_device.h
+++ b/ext-net/example/nccl/net_device.h
@@ -25,6 +25,7 @@ typedef struct {
 } ncclNetDeviceHandle_v7_t;
 
 typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
-typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t;
+typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
+typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t;
 
 #endif
diff --git a/ext-net/example/nccl/net_v8.h b/ext-net/example/nccl/net_v8.h
index 3161558205..54a61f61b4 100644
--- a/ext-net/example/nccl/net_v8.h
+++ b/ext-net/example/nccl/net_v8.h
@@ -23,8 +23,6 @@ typedef struct {
   int netDeviceVersion;            // Version number for network offload
 } ncclNetProperties_v8_t;
 
-typedef ncclNetProperties_v8_t ncclNetProperties_t;
-
 typedef struct {
   // Name of the network (mainly for logs)
   const char* name;
diff --git a/ext-net/example/nccl/net_v9.h b/ext-net/example/nccl/net_v9.h
new file mode 100644
index 0000000000..61035ecc93
--- /dev/null
+++ b/ext-net/example/nccl/net_v9.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NCCL_NET_V9_H_
+#define NCCL_NET_V9_H_
+
+#include "net_device.h"
+
+#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4
+#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V9
+typedef struct {
+  int ndevs;
+  int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9];
+} ncclNetVDeviceProps_v9_t;
+typedef ncclNetVDeviceProps_v9_t ncclNetVDeviceProps_t;
+
+typedef struct {
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int regIsGlobal;                 // regMr is not tied to a particular comm
+  int forceFlush;                  // Force a flush on receives
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+  ncclNetVDeviceProps_v9_t vProps;
+  size_t maxP2pBytes;              // Max transfer size for point-to-point operations
+  size_t maxCollBytes;             // Max transfer size for collective operations
+} ncclNetProperties_v9_t;
+
+typedef ncclNetProperties_v9_t ncclNetProperties_t;
+
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v9_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v9_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+
+  // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
+  // what index this new vNIC exists at
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props);
+} ncclNet_v9_t;
+
+#endif // end include guard
diff --git a/ext-net/example/plugin.c b/ext-net/example/plugin.c
index 128dde9b47..2852242617 100644
--- a/ext-net/example/plugin.c
+++ b/ext-net/example/plugin.c
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2024, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -7,15 +7,15 @@
 #include "net.h"
 
 #define __hidden __attribute__ ((visibility("hidden")))
+#define NCCL_PLUGIN_MAX_RECVS 1
 
 int max_requests = NCCL_NET_MAX_REQUESTS;
 
 __hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction) { return ncclSuccess; }
 __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; }
-
 __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; }
 __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; }
-__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v8_t* props) {
+__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) {
   // Below are default values, if unsure don't change.
 
   props->name = "Example";
@@ -27,6 +27,8 @@ __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v8_t* props
   props->ptrSupport = NCCL_PTR_HOST;
   // If you regMr has a fast registration cache, set to 1. If set to 0, user buffer registration may be disabled.
   props->regIsGlobal = 0;
+  // Force flush after receive. Needed if the control path and data path use a different path to the GPU
+  props->forceFlush = 0;
   // Speed in *Mbps*. 100000 means 100G
   props->speed = 100000;
   // Port number, used in conjunction with guid
@@ -36,20 +38,27 @@ __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v8_t* props
   // Maximum number of comm objects we can create.
   props->maxComms = 1024*1024;
   // Maximum number of receive operations taken by irecv().
-  props->maxRecvs = 1;
+  props->maxRecvs = NCCL_PLUGIN_MAX_RECVS;
   // Coupling with NCCL network device-side code.
-  props->netDeviceType = 0;
+  props->netDeviceType = NCCL_NET_DEVICE_HOST;
   props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
-  return ncclInternalError;
+  // Used to tell NCCL core whether this is a virtual device fusing multiple physical devices.
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  // maximum transfer sizes the plugin can handle
+  props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES;
+  props->maxCollBytes = NCCL_MAX_NET_SIZE_BYTES;
+  return ncclSuccess;
 }
+
 __hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; }
-__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm) { return ncclInternalError; }
-__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** recvDevComm) { return ncclInternalError; }
 __hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) { return ncclInternalError; }
 __hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; }
 __hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;}
-__hidden ncclResult_t pluginIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { return ncclInternalError; }
-__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { return ncclInternalError; }
+__hidden ncclResult_t pluginIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { return ncclInternalError; }
+__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { return ncclInternalError; }
 __hidden ncclResult_t pluginIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { return ncclInternalError; }
 __hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return ncclInternalError; }
 __hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; }
@@ -57,10 +66,11 @@ __hidden ncclResult_t pluginCloseRecv(void* recvComm) { return ncclInternalError
 __hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalError; }
 __hidden ncclResult_t pluginIrecvConsumed(void* recvComm, int n, void* request) { return ncclInternalError; }
 __hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_mhandle) { return ncclInternalError; }
+__hidden ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { return ncclInternalError; }
 
 #define PLUGIN_NAME "Plugin"
 
-const ncclNet_v8_t ncclNetPlugin_v8 = {
+ncclNet_v9_t ncclNetPlugin_v9 = {
   .name = PLUGIN_NAME,
   .init = pluginInit,
   .devices = pluginDevices,
@@ -80,8 +90,60 @@ const ncclNet_v8_t ncclNetPlugin_v8 = {
   .closeListen = pluginCloseListen,
   .getDeviceMr = pluginGetDeviceMr,
   .irecvConsumed = pluginIrecvConsumed,
+  .makeVDevice   = pluginMakeVDevice,
 };
 
+__hidden ncclResult_t pluginGetProperties_v8(int dev, ncclNetProperties_v8_t* props_v8) {
+  ncclNetProperties_t props;
+  ncclResult_t ret = pluginGetProperties(dev, &props);
+  if (ret != ncclSuccess) return ret;
+  props_v8->name = props.name;
+  props_v8->pciPath = props.pciPath;
+  props_v8->guid = props.guid;
+  props_v8->ptrSupport = props.ptrSupport;
+  props_v8->regIsGlobal = props.regIsGlobal;
+  props_v8->speed = props.speed;
+  props_v8->latency = props.latency;
+  props_v8->port = props.port;
+  props_v8->maxComms = props.maxComms;
+  props_v8->maxRecvs = props.maxRecvs;
+  props_v8->netDeviceType = props.netDeviceType;
+  props_v8->netDeviceVersion = props.netDeviceVersion;
+  return ncclSuccess;
+}
+
+__hidden ncclResult_t pluginIsend_v8(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
+  return pluginIsend(sendComm, data, (int)size, tag, mhandle, request);
+}
+
+__hidden ncclResult_t pluginIrecv_v8(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
+  size_t sizesOut[NCCL_PLUGIN_MAX_RECVS];
+  for (int i=0; i<n; i++) sizesOut[i] = sizes[i];
+  return pluginIrecv(recvComm, 1, data, sizesOut, tags, mhandles, request);
+}
+
+const ncclNet_v8_t ncclNetPlugin_v8 = {
+  .name = PLUGIN_NAME,
+  .init = pluginInit,
+  .devices = pluginDevices,
+  .getProperties = pluginGetProperties_v8,
+  .listen = pluginListen,
+  .connect = pluginConnect,
+  .accept = pluginAccept,
+  .regMr = pluginRegMr,
+  .regMrDmaBuf = pluginRegMrDmaBuf,
+  .deregMr = pluginDeregMr,
+  .isend = pluginIsend_v8,
+  .irecv = pluginIrecv_v8,
+  .iflush = pluginIflush,
+  .test = pluginTest,
+  .closeSend = pluginCloseSend,
+  .closeRecv = pluginCloseRecv,
+  .closeListen = pluginCloseListen,
+  .getDeviceMr = pluginGetDeviceMr,
+  .irecvConsumed = pluginIrecvConsumed,
+ };
+
 __hidden ncclResult_t pluginGetProperties_v7(int dev, ncclNetProperties_v7_t* props_v7) {
   ncclNetProperties_t props;
   ncclResult_t ret = pluginGetProperties(dev, &props);
@@ -91,6 +153,7 @@ __hidden ncclResult_t pluginGetProperties_v7(int dev, ncclNetProperties_v7_t* pr
   props_v7->guid = props.guid;
   props_v7->ptrSupport = props.ptrSupport;
   props_v7->speed = props.speed;
+  props_v7->latency = props.latency;
   props_v7->port = props.port;
   props_v7->maxComms = props.maxComms;
   props_v7->maxRecvs = props.maxRecvs;
@@ -114,8 +177,8 @@ const ncclNet_v7_t ncclNetPlugin_v7 = {
   .regMr = pluginRegMr_v7,
   .regMrDmaBuf = pluginRegMrDmaBuf,
   .deregMr = pluginDeregMr,
-  .isend = pluginIsend,
-  .irecv = pluginIrecv,
+  .isend = pluginIsend_v8,
+  .irecv = pluginIrecv_v8,
   .iflush = pluginIflush,
   .test = pluginTest,
   .closeSend = pluginCloseSend,
@@ -134,6 +197,7 @@ __hidden ncclResult_t pluginGetProperties_v6(int dev, ncclNetProperties_v6_t* pr
   props_v6->guid = props.guid;
   props_v6->ptrSupport = props.ptrSupport;
   props_v6->speed = props.speed;
+  props_v6->latency = props.latency;
   props_v6->port = props.port;
   props_v6->maxComms = props.maxComms;
   props_v6->maxRecvs = props.maxRecvs;
@@ -154,8 +218,8 @@ const ncclNet_v6_t ncclNetPlugin_v6 = {
   .regMr = pluginRegMr_v7,
   .regMrDmaBuf = pluginRegMrDmaBuf,
   .deregMr = pluginDeregMr,
-  .isend = pluginIsend,
-  .irecv = pluginIrecv,
+  .isend = pluginIsend_v8,
+  .irecv = pluginIrecv_v8,
   .iflush = pluginIflush,
   .test = pluginTest,
   .closeSend = pluginCloseSend,
@@ -174,8 +238,8 @@ const ncclNet_v5_t ncclNetPlugin_v5 = {
   .accept = pluginAccept_v6,
   .regMr = pluginRegMr_v7,
   .deregMr = pluginDeregMr,
-  .isend = pluginIsend,
-  .irecv = pluginIrecv,
+  .isend = pluginIsend_v8,
+  .irecv = pluginIrecv_v8,
   .iflush = pluginIflush,
   .test = pluginTest,
   .closeSend = pluginCloseSend,
@@ -198,11 +262,11 @@ static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* prop
   return ncclSuccess;
 }
 static ncclResult_t pluginIsend_v4(void *sendComm, void* data, int size, void *mhandle, void** request) {
-  return pluginIsend(sendComm, data, size, 0, mhandle, request);
+  return pluginIsend_v8(sendComm, data, size, 0, mhandle, request);
 }
 static ncclResult_t pluginIrecv_v4(void* recvComm, void* data, int size, void* mhandle, void** request) {
   int tag = 0;
-  return pluginIrecv(recvComm, 1, &data, &size, &tag, &mhandle, request);
+  return pluginIrecv_v8(recvComm, 1, &data, &size, &tag, &mhandle, request);
 }
 static ncclResult_t pluginIflush_v4(void* recvComm, void* data, int size, void* mhandle, void** request) {
   return pluginIflush(recvComm, 1, &data, &size, &mhandle, request);
diff --git a/ext-profiler/example/event.h b/ext-profiler/example/event.h
index 7432808133..1486a22482 100644
--- a/ext-profiler/example/event.h
+++ b/ext-profiler/example/event.h
@@ -14,6 +14,7 @@
 
 #define MAX_CHANNELS                     32
 #define MAX_STEPS                        16
+#define MAX_OPS                          16 // Up to 64K ranks for PAT
 
 #define PROXY_OP_SEND_STATE_OFFSET       (ncclProfilerProxyOpSendPosted)
 #define PROXY_OP_RECV_STATE_OFFSET       (ncclProfilerProxyOpRecvPosted)
@@ -86,7 +87,7 @@ struct taskEventBase {
   int rank;                         // rank of the operation in NCCL communicator
   const char* name;                 // FIXME: unused
   uint64_t commHash;                // communicator identifier
-  uint8_t func;                     // ncclFunc*
+  const char* func;                 // ncclFunc*
   int refCount;                     // number of references for this operation
   struct group* parent;             // parent event group
   struct taskEventBase* next;       // next top level event in group
@@ -102,16 +103,14 @@ struct collective {
   size_t count;
   size_t trafficBytes;
   int root;
-  uint8_t datatype;
+  const char* datatype;
   uint8_t nMaxChannels;
-  uint8_t algo;
-  uint8_t proto;
-  int op;
+  const char* algo;
+  const char* proto;
   int nWarps;
-  int isCollnet;
-  int isNvls;
-  struct proxyOp send[MAX_CHANNELS];// array of send proxy operation events
-  struct proxyOp recv[MAX_CHANNELS];// array of recv proxy operation events
+  struct proxyOp send[MAX_CHANNELS][MAX_OPS];// array of send proxy operation events
+  struct proxyOp recv[MAX_CHANNELS][MAX_OPS];// array of recv proxy operation events
+  int nProxyOps[MAX_CHANNELS];
 };
 
 struct p2p {
@@ -119,9 +118,9 @@ struct p2p {
   uint8_t func;
   void const* buff;
   size_t count;
-  uint8_t datatype;
+  const char* datatype;
   int peer;
-  struct proxyOp op;
+  struct proxyOp op[MAX_CHANNELS];
 };
 
 struct group {
diff --git a/ext-profiler/example/nccl/profiler.h b/ext-profiler/example/nccl/profiler.h
index db7bc3feae..6680cfecef 100644
--- a/ext-profiler/example/nccl/profiler.h
+++ b/ext-profiler/example/nccl/profiler.h
@@ -13,6 +13,7 @@
 #include "common.h"
 #include "err.h"
 
+#include "profiler_v2.h"
 #include "profiler_v1.h"
 
 #endif // end include guard
diff --git a/ext-profiler/example/nccl/profiler_v1.h b/ext-profiler/example/nccl/profiler_v1.h
index 8724a1c662..7d34bed57f 100644
--- a/ext-profiler/example/nccl/profiler_v1.h
+++ b/ext-profiler/example/nccl/profiler_v1.h
@@ -9,16 +9,6 @@
 
 #include <stdint.h>
 
-enum {
-  ncclProfileGroup     = (1 << 0),  // group event type
-  ncclProfileColl      = (1 << 1),  // host collective call event type
-  ncclProfileP2p       = (1 << 2),  // host point-to-point call event type
-  ncclProfileProxyOp   = (1 << 3),  // proxy operation event type
-  ncclProfileProxyStep = (1 << 4),  // proxy step event type
-  ncclProfileProxyCtrl = (1 << 5),  // proxy control event type
-  ncclProfileNumEvents = (     6),
-};
-
 typedef struct {
   uint8_t type;                 // event type descriptor: ncclProfileColl, ...
   void* parentObj;              // pointer to the profiler parent object (for coll is the group)
@@ -69,42 +59,8 @@ typedef struct {
   };
 } ncclProfilerEventDescr_v1_t;
 
-typedef enum {
-  ncclProfilerProxyOpSendPosted,
-  ncclProfilerProxyOpSendRemFifoWait,
-  ncclProfilerProxyOpSendTransmitted,
-  ncclProfilerProxyOpSendDone,
-  ncclProfilerProxyOpRecvPosted,
-  ncclProfilerProxyOpRecvReceived,
-  ncclProfilerProxyOpRecvTransmitted,
-  ncclProfilerProxyOpRecvDone,
-
-  /* Legacy proxy profiler states */
-  ncclProfilerProxyStepSendGPUWait,
-  ncclProfilerProxyStepSendWait,
-  ncclProfilerProxyStepRecvWait,
-  ncclProfilerProxyStepRecvFlushWait,
-  ncclProfilerProxyStepRecvGPUWait,
-
-  /* Legacy proxy control states */
-  ncclProfilerProxyCtrlIdle,
-  ncclProfilerProxyCtrlActive,
-  ncclProfilerProxyCtrlSleep,
-  ncclProfilerProxyCtrlWakeup,
-  ncclProfilerProxyCtrlAppend,
-  ncclProfilerProxyCtrlAppendEnd,
-} ncclProfilerEventState_v1_t;
-
-typedef union {
-  struct {
-    size_t transSize;
-    int steps;
-  } proxyOp;
-
-  struct {
-    int appendedProxyOps;
-  } proxyCtrl;
-} ncclProfilerEventStateArgs_v1_t;
+typedef ncclProfilerEventState_v2_t ncclProfilerEventState_v1_t;
+typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_v1_t;
 
 typedef struct {
   const char* name;
@@ -142,9 +98,4 @@ typedef struct {
   ncclResult_t (*finalize)(void* context);
 } ncclProfiler_v1_t;
 
-typedef ncclProfilerEventDescr_v1_t ncclProfilerEventDescr_t;
-typedef ncclProfilerEventState_v1_t ncclProfilerEventState_t;
-typedef ncclProfilerEventStateArgs_v1_t ncclProfilerEventStateArgs_t;
-typedef ncclProfiler_v1_t ncclProfiler_t;
-
 #endif
diff --git a/ext-profiler/example/nccl/profiler_v2.h b/ext-profiler/example/nccl/profiler_v2.h
new file mode 100644
index 0000000000..aab4ccf868
--- /dev/null
+++ b/ext-profiler/example/nccl/profiler_v2.h
@@ -0,0 +1,146 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_PROFILER_V2_H_
+#define NCCL_PROFILER_V2_H_
+
+#include <stdint.h>
+
+enum {
+  ncclProfileGroup     = (1 << 0),  // group event type
+  ncclProfileColl      = (1 << 1),  // host collective call event type
+  ncclProfileP2p       = (1 << 2),  // host point-to-point call event type
+  ncclProfileProxyOp   = (1 << 3),  // proxy operation event type
+  ncclProfileProxyStep = (1 << 4),  // proxy step event type
+  ncclProfileProxyCtrl = (1 << 5),  // proxy control event type
+};
+
+typedef struct {
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
+  int rank;                     // originating rank
+  union {
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      size_t trafficBytes;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+    } coll;
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+    } p2p;
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+  };
+} ncclProfilerEventDescr_v2_t;
+
+typedef enum {
+  ncclProfilerProxyOpSendPosted,
+  ncclProfilerProxyOpSendRemFifoWait,
+  ncclProfilerProxyOpSendTransmitted,
+  ncclProfilerProxyOpSendDone,
+  ncclProfilerProxyOpRecvPosted,
+  ncclProfilerProxyOpRecvReceived,
+  ncclProfilerProxyOpRecvTransmitted,
+  ncclProfilerProxyOpRecvDone,
+
+  /* Legacy proxy profiler states */
+  ncclProfilerProxyStepSendGPUWait,
+  ncclProfilerProxyStepSendWait,
+  ncclProfilerProxyStepRecvWait,
+  ncclProfilerProxyStepRecvFlushWait,
+  ncclProfilerProxyStepRecvGPUWait,
+
+  /* Legacy proxy control states */
+  ncclProfilerProxyCtrlIdle,
+  ncclProfilerProxyCtrlActive,
+  ncclProfilerProxyCtrlSleep,
+  ncclProfilerProxyCtrlWakeup,
+  ncclProfilerProxyCtrlAppend,
+  ncclProfilerProxyCtrlAppendEnd,
+} ncclProfilerEventState_v2_t;
+
+typedef union {
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v2_t;
+
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_v2_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eState    : event state transition
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v2_t;
+
+typedef ncclProfilerEventDescr_v2_t ncclProfilerEventDescr_t;
+typedef ncclProfilerEventState_v2_t ncclProfilerEventState_t;
+typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_t;
+typedef ncclProfiler_v2_t ncclProfiler_t;
+
+#endif
diff --git a/ext-profiler/example/plugin.c b/ext-profiler/example/plugin.c
index f9de60813a..64d5d8be1d 100644
--- a/ext-profiler/example/plugin.c
+++ b/ext-profiler/example/plugin.c
@@ -21,11 +21,18 @@
 static int initialized;             // initialization counter for profiler
 static double startTime;            // profiler start time
 
-static int groupPoolSize = 16;
-static int collPoolSize = 16;
-static int p2pPoolSize = 1024;
-static int proxyCtrlPoolSize = 16;
-static int detachPoolSize = 128;
+static const int defaultEActivationMask = ncclProfileColl | ncclProfileP2p;
+static const int defaultGroupPoolSize = 16;
+static const int defaultCollPoolSize = 16;
+static const int defaultP2pPoolSize = 1024;
+static const int defaultProxyCtrlPoolSize = 16;
+static const int defaultDetachPoolSize = 128;
+
+static int groupPoolSize;
+static int collPoolSize;
+static int p2pPoolSize;
+static int proxyCtrlPoolSize;
+static int detachPoolSize;
 static int detachPoolBase;
 static int detachPoolIndex;
 static int detachPoolDone;
@@ -56,25 +63,25 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask)
   pthread_mutex_lock(&lock);
   if (__atomic_fetch_add(&initialized, 1, __ATOMIC_RELAXED) == 0) {
     // first thread initializes event mask, environment and detach pool
-    __atomic_store_n(eActivationMask, ncclProfileColl | ncclProfileP2p, __ATOMIC_RELAXED);
-    if (getenv("NCCL_PROFILE_EVENT_MASK")) {
-      __atomic_store_n(eActivationMask, atoi(getenv("NCCL_PROFILE_EVENT_MASK")), __ATOMIC_RELAXED);
-    }
-    if (getenv("NCCL_PROFILE_GROUP_POOL_SIZE")) {
-      groupPoolSize = atoi(getenv("NCCL_PROFILE_GROUP_POOL_SIZE"));
-    }
-    if (getenv("NCCL_PROFILE_COLL_POOL_SIZE")) {
-      collPoolSize = atoi(getenv("NCCL_PROFILE_COLL_POOL_SIZE"));
-    }
-    if (getenv("NCCL_PROFILE_P2P_POOL_SIZE")) {
-      p2pPoolSize = atoi(getenv("NCCL_PROFILE_P2P_POOL_SIZE"));
-    }
-    if (getenv("NCCL_PROFILE_PROXY_CTRL_POOL_SIZE")) {
-      proxyCtrlPoolSize = atoi(getenv("NCCL_PROFILE_PROXY_CTRL_POOL_SIZE"));
-    }
-    if (getenv("NCCL_PROFILE_PROXY_DETACH_POOL_SIZE")) {
-      detachPoolSize = atoi(getenv("NCCL_PROFILE_PROXY_DETACH_POOL_SIZE"));
-    }
+    const char* str;
+    str = getenv("NCCL_PROFILE_EVENT_MASK");
+    __atomic_store_n(eActivationMask, str ? atoi(str) : defaultEActivationMask, __ATOMIC_RELAXED);
+
+    str = getenv("NCCL_PROFILE_GROUP_POOL_SIZE");
+    groupPoolSize = str ? atoi(str) : defaultGroupPoolSize;
+
+    str = getenv("NCCL_PROFILE_COLL_POOL_SIZE");
+    collPoolSize = str ? atoi(str) : defaultCollPoolSize;
+
+    str = getenv("NCCL_PROFILE_P2P_POOL_SIZE");
+    p2pPoolSize = str ? atoi(str) : defaultP2pPoolSize;
+
+    str = getenv("NCCL_PROFILE_PROXY_CTRL_POOL_SIZE");
+    proxyCtrlPoolSize = str ? atoi(str) : defaultProxyCtrlPoolSize;
+
+    str = getenv("NCCL_PROFILE_PROXY_DETACH_POOL_SIZE");
+    detachPoolSize = str ? atoi(str) : defaultDetachPoolSize;
+
     // detach pool is used to store PXN proxyOps and is shared among threads
     detachPool = (struct proxyOp *)calloc(detachPoolSize, sizeof(*detachPool));
     if (detachPool == NULL) {
@@ -107,6 +114,13 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask)
   ctx->proxyCtrlPool = (struct proxyCtrl *)calloc(proxyCtrlPoolSize, sizeof(*ctx->proxyCtrlPool));
   if (ctx->proxyCtrlPool == NULL) goto fail;
 
+  // Print event pool sizes for debugging
+  //fprintf(stdout, "Profiler: Group pool size (bytes): %lu\n", sizeof(struct group)*groupPoolSize);
+  //fprintf(stdout, "Profiler: Coll  pool size (bytes): %lu\n", sizeof(struct collective)*collPoolSize);
+  //fprintf(stdout, "Profiler: P2p   pool size (bytes): %lu\n", sizeof(struct p2p)*p2pPoolSize);
+  //fprintf(stdout, "Profiler: Proxy pool size (bytes): %lu\n", sizeof(struct proxyCtrl)*proxyCtrlPoolSize);
+  //fprintf(stdout, "Profiler: PXN   pool size (bytes): %lu\n", sizeof(struct proxyOp)*detachPoolSize);
+
   *context = ctx;
   return ncclSuccess;
 
@@ -154,7 +168,7 @@ __hidden ncclResult_t exampleProfilerFinalize(void* context) {
   free(ctx);
 
   // last thread cleans up shared detach pool
-  if (__atomic_fetch_sub(&initialized, 1, __ATOMIC_RELAXED) - 1 == 0) {
+  if (__atomic_sub_fetch(&initialized, 1, __ATOMIC_RELAXED) == 0) {
     start = (detachPoolIndex - detachPoolSize >= 0) ? detachPoolIndex - detachPoolSize : 0;
     end = detachPoolIndex;
     for (int i = start; i < end; i++) {
@@ -171,7 +185,7 @@ __hidden ncclResult_t exampleProfilerFinalize(void* context) {
 
 __hidden void updateEvent(void* handle);
 
-__hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr) {
+__hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) {
   *eHandle = NULL;
   struct context* ctx = (struct context *)context;
   if (eDescr->type == ncclProfileGroup) {
@@ -185,14 +199,15 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
         if (base->type == ncclProfileColl) {
           struct collective* c = (struct collective *)base;
           // reset event proxyOps & proxySteps
-          memset(c->send, 0, sizeof(struct proxyOp)*MAX_CHANNELS);
-          memset(c->recv, 0, sizeof(struct proxyOp)*MAX_CHANNELS);
+          memset(c->send, 0, sizeof(struct proxyOp)*MAX_CHANNELS*MAX_OPS);
+          memset(c->recv, 0, sizeof(struct proxyOp)*MAX_CHANNELS*MAX_OPS);
+          memset(c->nProxyOps, 0, sizeof(int)*MAX_CHANNELS);
           // release collective events in the group and return them to the collective pool
           __atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED);
         } else if (base->type == ncclProfileP2p) {
           struct p2p* p = (struct p2p *)base;
           // reset event proxyOp and proxySteps
-          memset(&p->op, 0, sizeof(struct proxyOp));
+          memset(&p->op, 0, sizeof(struct proxyOp)*MAX_CHANNELS);
           // release p2p events in the group and return them to the p2p pool
           __atomic_fetch_add(&ctx->p2pPoolBase, 1, __ATOMIC_RELAXED);
         }
@@ -203,7 +218,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
       return ncclSuccess;
     }
     event->type = ncclProfileGroup;
-    __atomic_store_n(&event->refCount, 1, __ATOMIC_RELAXED);
     event->ctx = ctx;
     event->groupId = groupId;
     event->startTs = gettime() - startTime;
@@ -238,14 +252,11 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
     event->count = eDescr->coll.count;
     event->root = eDescr->coll.root;
     event->datatype = eDescr->coll.datatype;
-    event->op = eDescr->coll.op;
     event->trafficBytes = eDescr->coll.trafficBytes;
     event->nMaxChannels = eDescr->coll.nMaxChannels;
     event->nWarps = eDescr->coll.nWarps;
     event->algo = eDescr->coll.algo;
     event->proto = eDescr->coll.proto;
-    event->isCollnet = eDescr->coll.isCollnet;
-    event->isNvls = eDescr->coll.isNvls;
     *eHandle = event;
     taskEventQueueEnqueue(parent, (struct taskEventBase *)event);
     // increment the group ref counter so the event will staty open
@@ -326,9 +337,13 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
 
     if (eventBase->type == ncclProfileColl) {
       struct collective* parent = (struct collective *)eDescr->parentObj;
-      struct proxyOp* event = (eDescr->proxyOp.isSend) ? &parent->send[eDescr->proxyOp.channelId] : &parent->recv[eDescr->proxyOp.channelId];
+      int channelId = eDescr->proxyOp.channelId;
+      struct proxyOp* event = (eDescr->proxyOp.isSend) ?
+        &parent->send[channelId][parent->nProxyOps[channelId]++] :
+        &parent->recv[channelId][parent->nProxyOps[channelId]++];
+
       event->type = ncclProfileProxyOp;
-      event->channelId = eDescr->proxyOp.channelId;
+      event->channelId = channelId;
       event->pid = eDescr->proxyOp.pid;
       event->rank = eDescr->rank;
       event->peer = eDescr->proxyOp.peer;
@@ -338,13 +353,14 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
       event->parent = eventBase;
       event->startTs = gettime() - startTime;
       *eHandle = event;
-      __atomic_store_n(&parent->base.refCount, 1, __ATOMIC_RELAXED);
+      __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
       debugEvent(event, "ProxyOpStart");
     } else { // ncclProfileP2p
       struct p2p* parent = (struct p2p *)eDescr->parentObj;
-      struct proxyOp* event = &parent->op;
+      int channelId = eDescr->proxyOp.channelId;
+      struct proxyOp* event = &parent->op[channelId];
       event->type = ncclProfileProxyOp;
-      event->channelId = eDescr->proxyOp.channelId;
+      event->channelId = channelId;
       event->pid = eDescr->proxyOp.pid;
       event->rank = eDescr->rank;
       event->peer = eDescr->proxyOp.peer;
@@ -354,7 +370,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
       event->parent = eventBase;
       event->startTs = gettime() - startTime;
       *eHandle = event;
-      __atomic_store_n(&parent->base.refCount, 1, __ATOMIC_RELAXED);
+      __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
       debugEvent(event, "ProxyOpStart");
     }
  } else if (eDescr->type == ncclProfileProxyStep) {
@@ -379,7 +395,7 @@ void updateEvent(void* handle) {
   uint8_t type = *(uint8_t *)handle;
   if (type == ncclProfileGroup) {
     struct group* event = (struct group *)handle;
-    if (__atomic_fetch_sub(&event->refCount, 1, __ATOMIC_RELAXED) == 1) {
+    if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) {
       event->stopTs = gettime() - startTime;
       // return group event to the pool
       __atomic_fetch_add(&event->ctx->groupPoolBase, 1, __ATOMIC_RELAXED);
@@ -387,7 +403,7 @@ void updateEvent(void* handle) {
     debugEvent(event, "GroupStop");
   } else if (type == ncclProfileColl) {
     struct collective* event = (struct collective *)handle;
-    if (__atomic_fetch_sub(&event->base.refCount, 1, __ATOMIC_RELAXED) == 1) {
+    if (__atomic_sub_fetch(&event->base.refCount, 1, __ATOMIC_RELAXED) == 0) {
       event->base.stopTs = gettime() - startTime;
       debugEvent(event, "CollStop");
       updateEvent(event->base.parent);
@@ -396,7 +412,7 @@ void updateEvent(void* handle) {
     debugEvent(event, "CollStop");
   } else if (type == ncclProfileP2p) {
     struct p2p* event = (struct p2p *)handle;
-    if (__atomic_fetch_sub(&event->base.refCount, 1, __ATOMIC_RELAXED) == 1) {
+    if (__atomic_sub_fetch(&event->base.refCount, 1, __ATOMIC_RELAXED) == 0) {
       event->base.stopTs = gettime() - startTime;
       debugEvent(event, "P2pStop");
       updateEvent(event->base.parent);
@@ -408,7 +424,7 @@ void updateEvent(void* handle) {
     event->stopTs = gettime() - startTime;
     if (event->pid != pid) {
       // only for proxyOps that don't have a parent collective/p2p (i.e., PXN)
-      int done = __atomic_fetch_add(&detachPoolDone, 1, __ATOMIC_RELAXED) + 1;
+      int done = __atomic_add_fetch(&detachPoolDone, 1, __ATOMIC_RELAXED);
       if (done == detachPoolSize) {
         // reset the event completed (done) counter
         __atomic_store_n(&detachPoolDone, 0, __ATOMIC_RELAXED);
@@ -451,12 +467,20 @@ __hidden ncclResult_t exampleProfilerStopEvent(void* eHandle) {
     struct collective* event = (struct collective *)eHandle;
     event->base.stopTs = gettime() - startTime;
     return ncclSuccess;
+  } else if (type == ncclProfileP2p) {
+    // stopping the p2p event in NCCL core does not
+    // mean the p2p has completed. It means the p2p
+    // was submitted/enqueued so we need to keep the event open
+    struct p2p* event = (struct p2p *)eHandle;
+    event->base.stopTs = gettime() - startTime;
+    return ncclSuccess;
   }
+
   updateEvent(eHandle);
   return ncclSuccess;
 }
 
-__hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs) {
+__hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) {
   // the event handle might be null if we run out of events
   if (eHandle == NULL) return ncclSuccess;
 
@@ -482,7 +506,7 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile
   return ncclSuccess;
 }
 
-ncclProfiler_v1_t ncclProfiler_v1 = {
+ncclProfiler_t ncclProfiler_v2 = {
   "Example-profiler",
   exampleProfilerInit,
   exampleProfilerStartEvent,
diff --git a/ext-profiler/example/print_event.c b/ext-profiler/example/print_event.c
index 490ba7ce44..f26a9eeb21 100644
--- a/ext-profiler/example/print_event.c
+++ b/ext-profiler/example/print_event.c
@@ -11,56 +11,6 @@
 
 #define __hidden __attribute__ ((visibility("hidden")))
 
-__hidden const char* ncclFuncToString(int func) {
-  switch(func) {
-    case 0:
-      return "ncclBroadcast";
-    case 1:
-      return "ncclReduce";
-    case 2:
-      return "ncclAllGather";
-    case 3:
-      return "ncclReduceScatter";
-    case 4:
-      return "ncclAllReduce";
-    case 5:
-      return "ncclSendRecv";
-    case 6:
-      return "ncclSend";
-    case 7:
-      return "ncclRecv";
-  }
-  return NULL;
-}
-
-__hidden const char* ncclAlgoToString(int algo) {
-  switch(algo) {
-    case 0:
-      return "Tree";
-    case 1:
-      return "Ring";
-    case 2:
-      return "CollnetDirect";
-    case 3:
-      return "CollnetChain";
-    case 4:
-      return "Nvls";
-    case 5:
-      return "NvlsTree";
-  }
-}
-
-__hidden const char* ncclProtoToString(int proto) {
-  switch(proto) {
-    case 0:
-      return "LL";
-    case 1:
-      return "LL128";
-    case 2:
-      return "Simple";
-  }
-}
-
 // FIXME: chrome tracing asynchronous events (following used) allow event nesting for events that have same id and category
 // It appears that nesting more than three events causes issues. Therefore, every event is given an increasing id and a
 // category that matches the type of event (GROUP, COLL, P2P, PROXY, NET)
@@ -77,24 +27,24 @@ __hidden void printGroupEventTrailer(FILE* fh, struct group* event) {
 
 static __thread int collId;
 __hidden void printCollEventHeader(FILE* fh, struct collective* event) {
-  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": %d, \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nMaxChannels\": %d}},\n",
-          ncclFuncToString(event->base.func), collId, getpid(), 1, event->base.startTs, event->seqNumber, event->base.commHash, event->base.rank, event->count, event->datatype, ncclAlgoToString(event->algo), ncclProtoToString(event->proto), event->nMaxChannels);
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nMaxChannels\": %d}},\n",
+          event->base.func, collId, getpid(), 1, event->base.startTs, event->seqNumber, event->base.commHash, event->base.rank, event->count, event->datatype, event->algo, event->proto, event->nMaxChannels);
 }
 
 __hidden void printCollEventTrailer(FILE* fh, struct collective* event) {
   fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
-          ncclFuncToString(event->base.func), collId++, getpid(), 1, event->base.stopTs);
+          event->base.func, collId++, getpid(), 1, event->base.stopTs);
 }
 
 static __thread int p2pId;
 __hidden void printP2pEventHeader(FILE* fh, struct p2p* event) {
-  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": %d}},\n",
-          ncclFuncToString(event->base.func), p2pId, getpid(), 1, event->base.startTs, event->base.commHash, event->base.rank, event->peer, event->count, event->datatype);
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": \"%s\"}},\n",
+          event->base.func, p2pId, getpid(), 1, event->base.startTs, event->base.commHash, event->base.rank, event->peer, event->count, event->datatype);
 }
 
 __hidden void printP2pEventTrailer(FILE* fh, struct p2p* event) {
   fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
-          ncclFuncToString(event->base.func), p2pId++, getpid(), 1, event->base.stopTs);
+          event->base.func, p2pId++, getpid(), 1, event->base.stopTs);
 }
 
 static __thread int proxyOpId;
@@ -250,14 +200,18 @@ void printEvent(FILE* fh, void* handle) {
     struct collective* c = (struct collective *)handle;
     printCollEventHeader(fh, c);
     for (int i = 0; i < MAX_CHANNELS; i++) {
-      printEvent(fh, &c->send[i]);
-      printEvent(fh, &c->recv[i]);
+      for (int j = 0; j < c->nProxyOps[i]; j++) {
+        printEvent(fh, &c->send[i][j]);
+        printEvent(fh, &c->recv[i][j]);
+      }
     }
     printCollEventTrailer(fh, c);
   } else if (type == ncclProfileP2p) {
     struct p2p* p = (struct p2p *)handle;
     printP2pEventHeader(fh, p);
-    printEvent(fh, &p->op);
+    for (int i = 0; i < MAX_CHANNELS; i++) {
+      printEvent(fh, &p->op[i]);
+    }
     printP2pEventTrailer(fh, p);
   } else if (type == ncclProfileProxyOp) {
     struct proxyOp* p = (struct proxyOp *)handle;
diff --git a/ext-tuner/example/nccl/tuner.h b/ext-tuner/example/nccl/tuner.h
index aafabd72d8..77b543d12c 100644
--- a/ext-tuner/example/nccl/tuner.h
+++ b/ext-tuner/example/nccl/tuner.h
@@ -67,6 +67,7 @@ typedef struct {
   //   - numPipeOps: number of operations in the group
   //   - numAlgo: number of algorithms in collCostTable
   //   - numProto: number of protocols in collCostTable
+  //   - regBuff: can register user buffer
   //
   // Outputs:
   //   - nChannels: number of channels (hence SMs) to be used.
@@ -82,15 +83,15 @@ typedef struct {
   // Unset fields will be set automatically by NCCL.
   ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
                               int numPipeOps, float** collCostTable, int numAlgo, int numProto,
-                              int* nChannels);
+                              int regBuff, int* nChannels);
 
   // Terminates the plugin and cleans up any resources that the plugin allocated.
   // context: tuner context object
   ncclResult_t (*destroy)(void* context);
-} ncclTuner_v3_t;
+} ncclTuner_v4_t;
 
-typedef ncclTuner_v3_t ncclTuner_t;
+typedef ncclTuner_v4_t ncclTuner_t;
 
-#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3"
+#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4"
 
 #endif
diff --git a/ext-tuner/example/plugin.c b/ext-tuner/example/plugin.c
index c3cf00dfde..7925dcfa18 100644
--- a/ext-tuner/example/plugin.c
+++ b/ext-tuner/example/plugin.c
@@ -12,10 +12,11 @@ __hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t
 
 __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
                               int numPipeOps, float** collCostTable, int numAlgo, int numProto,
-                              int* nChannels) {
+                              int regBuff, int* nChannels) {
   // Update NCCL core generated cost table. Updated table will be evaluated by NCCL to pick the best algo/proto combo
-  if (collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) {
-    collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0;
+  float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
+  if (table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) {
+    table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0;
   }
   *nChannels = 1;
   return ncclSuccess;
@@ -25,7 +26,7 @@ __hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; }
 
 #define PLUGIN_NAME "Example"
 
-const ncclTuner_v3_t ncclTunerPlugin_v3 = {
+const ncclTuner_v4_t ncclTunerPlugin_v4 = {
   .name = PLUGIN_NAME,
   .init = pluginInit,
   .getCollInfo = pluginGetCollInfo,
diff --git a/makefiles/common.mk b/makefiles/common.mk
index 59e4151cee..82164ab5c0 100644
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@@ -12,6 +12,7 @@ DEBUG ?= 0
 ASAN ?= 0
 UBSAN ?= 0
 TRACE ?= 0
+WERROR ?= 0
 PROFAPI ?= 1
 NVTX ?= 1
 RDMA_CORE ?= 0
@@ -115,6 +116,10 @@ ifeq ($(NVTX), 0)
 CXXFLAGS  += -DNVTX_DISABLE
 endif
 
+ifneq ($(WERROR), 0)
+CXXFLAGS  += -Werror
+endif
+
 ifneq ($(KEEP), 0)
 NVCUFLAGS += -keep
 endif
diff --git a/makefiles/version.mk b/makefiles/version.mk
index bcc0ff3ce1..2523009340 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
-NCCL_MINOR   := 23
-NCCL_PATCH   := 4
+NCCL_MINOR   := 24
+NCCL_PATCH   := 3
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
diff --git a/src/Makefile b/src/Makefile
index b254eac32c..2c5d9e863e 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -7,17 +7,22 @@ include ../makefiles/common.mk
 include ../makefiles/version.mk
 
 ##### src files
-INCEXPORTS  := nccl.h nccl_net.h
+INCEXPORTS  := nccl.h
 LIBSRCFILES := \
 	bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
-	init.cc init_nvtx.cc net.cc proxy.cc transport.cc register.cc \
+	init.cc init_nvtx.cc net.cc proxy.cc transport.cc \
 	$(wildcard graph/*.cc) \
 	$(wildcard misc/*.cc) \
-	$(wildcard transport/*.cc)
+	$(wildcard transport/*.cc) \
+	$(wildcard register/*.cc) \
+	$(filter-out ras/client.cc,$(wildcard ras/*.cc))
+BINSRCFILES := ras/client.cc
 
 ##### lib files
 LIBNAME     := libnccl.so
 STATICLIBNAME := libnccl_static.a
+##### binaries
+BINNAME := ncclras
 ##### pkgconfig files
 PKGCONFIGFILE := nccl.pc
 ##### dirs
@@ -26,11 +31,12 @@ INCDIR := $(BUILDDIR)/include
 LIBDIR := $(BUILDDIR)/lib
 OBJDIR := $(BUILDDIR)/obj
 PKGDIR := $(BUILDDIR)/lib/pkgconfig
+BINDIR := $(BUILDDIR)/bin
 ##### target files
 CUDARTLIB  ?= cudart_static
 
+# Use compatibility shim only with static cudart; see https://github.com/NVIDIA/nccl/issues/658
 ifeq ($(CUDARTLIB), cudart_static)
-	# Use compatibility shim only with static cudart; see https://github.com/NVIDIA/nccl/issues/658
 	LIBSRCFILES += enhcompat.cc
 endif
 
@@ -40,18 +46,21 @@ LIBTARGET  := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
 STATICLIBTARGET := $(STATICLIBNAME)
 PKGTARGET  := $(PKGCONFIGFILE)
 LIBOBJ     := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o)
-DEPFILES   := $(LIBOBJ:%.o=%.d)
+BINOBJ     := $(BINSRCFILES:%.cc=$(OBJDIR)/%.o)
+DEPFILES   := $(LIBOBJ:%.o=%.d) $(BINOBJ:%.o=%.d)
 LDFLAGS    += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl
 
 DEVMANIFEST := $(BUILDDIR)/obj/device/manifest
 
 ##### rules
-build : lib staticlib
+build : lib staticlib binary
 
 lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) $(PKGDIR)/$(PKGTARGET)
 
 staticlib : $(LIBDIR)/$(STATICLIBTARGET)
 
+binary : $(BINDIR)/$(BINNAME)
+
 $(DEVMANIFEST): ALWAYS_REBUILD $(INCTARGETS)
 	$(MAKE) -C ./device
 
@@ -85,6 +94,11 @@ $(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVMANIFEST)
 	mkdir -p $(LIBDIR)
 	ar cr $@ $(LIBOBJ) $$(cat $(DEVMANIFEST))
 
+$(BINDIR)/$(BINNAME): $(BINOBJ)
+	@printf "Linking    %-35s > %s\n" $(BINNAME) $@
+	mkdir -p $(BINDIR)
+	$(CXX) $(CXXFLAGS) $^ -o $@
+
 $(PKGDIR)/nccl.pc : nccl.pc.in
 	mkdir -p $(PKGDIR)
 	@printf "Generating %-35s > %s\n" $< $@
@@ -121,15 +135,17 @@ $(OBJDIR)/%.o : %.cc $(INCTARGETS)
 
 clean :
 	$(MAKE) -C device clean
-	rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR}
+	rm -rf ${BINDIR} ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR}
 
 install : build
 	mkdir -p $(PREFIX)/lib
 	mkdir -p $(PREFIX)/lib/pkgconfig
 	mkdir -p $(PREFIX)/include
+	mkdir -p $(PREFIX)/bin
 	cp -P -v $(BUILDDIR)/lib/lib* $(PREFIX)/lib/
 	cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/
 	cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
+	cp -v $(BUILDDIR)/bin/ncclras $(PREFIX)/bin/
 
 FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|gdrwrap.h|nccl.h')
 # Note that formatting.mk defines a new target so in order to not overwrite the default target,
diff --git a/src/bootstrap.cc b/src/bootstrap.cc
index c1d085e4ce..d11e59953b 100644
--- a/src/bootstrap.cc
+++ b/src/bootstrap.cc
@@ -13,6 +13,7 @@
 #include <sys/types.h>
 #include "proxy.h"
 #include "param.h"
+#include "ras.h"
 
 #define BOOTSTRAP_N_CHECK_ABORT           10000
 #define BOOTSTRAP_TAG_CONNECT             (0x1 << 31)
@@ -110,13 +111,13 @@ ncclResult_t bootstrapNetInit() {
         if (nIfs <= 0) {
           WARN("Bootstrap : no socket interface found");
           pthread_mutex_unlock(&bootstrapNetLock);
-          return ncclInternalError;
+          return ncclInvalidUsage;
         }
       }
       char line[SOCKET_NAME_MAXLEN+MAX_IF_NAME_SIZE+2];
-      sprintf(line, " %s:", bootstrapNetIfName);
+      snprintf(line, sizeof(line), " %s:", bootstrapNetIfName);
       ncclSocketToString(&bootstrapNetIfAddr, line+strlen(line));
-      INFO(NCCL_BOOTSTRAP, "Bootstrap : Using%s", line);
+      INFO(NCCL_BOOTSTRAP, "Bootstrap: Using%s", line);
       bootstrapNetInitDone = 1;
     }
     pthread_mutex_unlock(&bootstrapNetLock);
@@ -152,7 +153,7 @@ static ncclResult_t netIsend(ncclNet_t* net, void* sendComm, void* data, int siz
                              int* done) {
   if (*done) return ncclSuccess;
   if (!*sendReq) {
-    NCCLCHECK(net->isend(sendComm, data, size, tag, dataHandle, sendReq));
+    NCCLCHECK(net->isend(sendComm, data, (size_t)size, tag, dataHandle, sendReq));
   }
   if (*sendReq) {
     NCCLCHECK(net->test(*sendReq, done, NULL));
@@ -166,7 +167,8 @@ static ncclResult_t netIrecv(ncclNet_t* net, void* recvComm, void* data, int siz
                              int* done) {
   if (*done) return ncclSuccess;
   if (!*recvReq) {
-    NCCLCHECK(net->irecv(recvComm, 1, &data, &size, &tag, &dataHandle, recvReq));
+    size_t size64 = size;
+    NCCLCHECK(net->irecv(recvComm, 1, &data, &size64, &tag, &dataHandle, recvReq));
   }
   if (*recvReq) {
     NCCLCHECK(net->test(*recvReq, done, NULL));
@@ -302,7 +304,7 @@ static void* bootstrapRoot(void* rargs) {
       // if the number of root > 1, we will receive one extra info from the first local_id of the next root
       n2send = nRankFromRoot(iroot, nranks, nroots);
       nrecv = n2send + ((nroots > 1) ? 1 : 0);
-      NCCLCHECKGOTO(ncclCalloc(&rankInfo, nrecv * sizeof(union ringConnectInfo)), res, out);
+      NCCLCHECKGOTO(ncclCalloc(&rankInfo, nrecv), res, out);
       NCCLCHECKGOTO(ncclCalloc(&rankAddressesRoot, nrecv), res, out);
     }
 
@@ -492,29 +494,37 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
         struct netIf userIfs[MAX_OOB_DEVS];
         int nUserIfs = parseStringList(userIfEnv, userIfs, MAX_OOB_DEVS);
         // loop over the device and return the first one matching
-        int devId = 0;
         int nDev = 0;
         NCCLCHECK(comm->ncclNet->devices(&nDev));
+        int devId = 0;
         while (devId < nDev) {
           ncclNetProperties_t props;
           comm->ncclNet->getProperties(devId, &props);
           // check against user specified HCAs/ports
-          bool found = matchIfList(props.name, props.port, userIfs, nUserIfs, searchExact) ^ searchNot;
-          if (found) {
+          if (matchIfList(props.name, props.port, userIfs, nUserIfs, searchExact) ^ searchNot) {
+            // All plain physical devices have been initialized at this point
             devOOB = devId;
             break;
           }
           devId++;
         }
         if (devOOB == -1) {
-          WARN("no device found matching NCCL_OOB_NET_IFNAME=%s, ignoring", userIfEnv);
-          goto noEnv;
+          if (!searchNot)
+            WARN("no device found matching %s%s, verify NCCL_OOB_NET_IFNAME", searchExact ? "exactly " : "", userIfEnv);
+          else
+            WARN("no device found after excluding %s%s, verify NCCL_OOB_NET_IFNAME", searchExact ? "exactly " : "", userIfEnv);
+          pthread_mutex_unlock(&bootstrapNetLock);
+          return ncclInvalidArgument;
         }
       } else {
-      noEnv:
         // default choice is device 0
         devOOB = 0;
       }
+      // display info on the chosen device
+      ncclNetProperties_t props;
+      ncclResult_t res = comm->ncclNet->getProperties(devOOB, &props);
+      bool hasProp = res == ncclSuccess;
+      INFO(NCCL_BOOTSTRAP, "Bootstrap: Using %s:%d", (hasProp) ? props.name : "N/A", (hasProp) ? props.port : -1);
     }
     pthread_mutex_unlock(&bootstrapNetLock);
   }
@@ -545,7 +555,8 @@ static ncclResult_t socketRingConnect(ncclSocketAddress* addr, struct ncclSocket
 }
 static ncclResult_t ringAllInfo(struct ncclComm* comm, struct bootstrapState* state,
                                 union ncclSocketAddress* peerAddresss,
-                                union ncclSocketAddress* peerProxy, uint64_t* peerUDS) {
+                                union ncclSocketAddress* peerProxy, uint64_t* peerUDS,
+                                struct rasRankInit* rasRanks) {
   ncclResult_t res = ncclSuccess;
   int rank = comm->rank;
   int nRanks = comm->nRanks;
@@ -553,6 +564,7 @@ static ncclResult_t ringAllInfo(struct ncclComm* comm, struct bootstrapState* st
     union ncclSocketAddress peerAddress;
     union ncclSocketAddress peerProxy;
     uint64_t peerUDS;
+    struct rasRankInit rasRank;
   }* ringData = NULL;
 
   NCCLCHECK(ncclCalloc(&ringData, nRanks));
@@ -563,6 +575,8 @@ static ncclResult_t ringAllInfo(struct ncclComm* comm, struct bootstrapState* st
     memcpy(&(ringData[rank].peerProxy), peerProxy + rank, sizeof(union ncclSocketAddress));
   if (peerUDS)
     memcpy(&(ringData[rank].peerUDS), peerUDS + rank, sizeof(uint64_t));
+  if (rasRanks)
+    memcpy(&(ringData[rank].rasRank), rasRanks + rank, sizeof(*rasRanks));
 
   // allgather
   NCCLCHECKGOTO(bootstrapAllGather(state, ringData, sizeof(struct bootstrapRingData)), res, exit);
@@ -575,6 +589,8 @@ static ncclResult_t ringAllInfo(struct ncclComm* comm, struct bootstrapState* st
       memcpy(peerProxy + irank, &(ringData[irank].peerProxy), sizeof(union ncclSocketAddress));
     if (peerUDS)
       memcpy(peerUDS + irank, &(ringData[irank].peerUDS), sizeof(uint64_t));
+    if (rasRanks)
+      memcpy(rasRanks + irank, &(ringData[irank].rasRank), sizeof(*rasRanks));
   }
 
 exit:
@@ -598,7 +614,10 @@ fail:
 NCCL_PARAM(StaggerRate, "UID_STAGGER_RATE", 7000);
 NCCL_PARAM(StaggerThreshold, "UID_STAGGER_THRESHOLD", 256);
 
+NCCL_PARAM(RasEnable, "RAS_ENABLE", 1);
+
 ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
+  ncclResult_t result = ncclSuccess;
   int rank = comm->rank;
   int nranks = comm->nRanks;
   // char nextPeerHandle[NCCL_NET_HANDLE_MAXSIZE];
@@ -607,6 +626,8 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
   struct ncclSocket sock, listenSockRoot;
   struct extInfo info = {0};
   union ringConnectInfo nextPeer;
+  bool performRasAddRanks = true;
+  struct rasRankInit* rasRanks = nullptr;
 
   uint64_t timers[BOOTSTRAP_INIT_TIME_N] = {0};
 
@@ -696,23 +717,45 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
   // in case of failure, those resources will be free'd when calling bootstrapDestroy, so we can return immediatly
   NCCLCHECK(ncclCalloc(&state->peerProxyAddresses, nranks));
   NCCLCHECK(ncclCalloc(&proxySocket, 1));
-  NCCLCHECK(createListenSocket(comm, comm->magic, proxySocket, state->peerProxyAddresses + rank, ncclSocketTypeProxy));
+  NCCLCHECKGOTO(createListenSocket(comm, comm->magic, proxySocket, state->peerProxyAddresses + rank, ncclSocketTypeProxy), result, fail);
 
-  NCCLCHECK(ncclCalloc(&state->peerProxyAddressesUDS, nranks));
-  NCCLCHECK(getUDS(state->peerProxyAddressesUDS + rank));
+  NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddressesUDS, nranks), result, fail);
+  NCCLCHECKGOTO(getUDS(state->peerProxyAddressesUDS + rank), result, fail);
 
   // create a socket for others to reach out (P2P)
   union ncclSocketAddress peerSocketAddress;
-  NCCLCHECK(createListenSocket(comm, comm->magic, &STATE_LISTEN(state, peerSocket), &peerSocketAddress, ncclSocketTypeBootstrap));
-  NCCLCHECK(ncclCalloc(&state->peerP2pAddresses, nranks * sizeof(union ncclSocketAddress)));
+  NCCLCHECKGOTO(createListenSocket(comm, comm->magic, &STATE_LISTEN(state, peerSocket), &peerSocketAddress, ncclSocketTypeBootstrap), result, fail);
+  NCCLCHECKGOTO(ncclCalloc(&state->peerP2pAddresses, nranks), result, fail);
   memcpy(state->peerP2pAddresses + rank, &peerSocketAddress, sizeof(union ncclSocketAddress));
 
+  // Initialize RAS
+  if (ncclParamRasEnable() == 1) {
+    // The RAS thread will take care of freeing the memory allocated below.
+    NCCLCHECKGOTO(ncclCalloc(&rasRanks, nranks), result, fail); // unwind via fail: proxySocket is already allocated
+    memcpy(&rasRanks[rank].addr, &bootstrapNetIfAddr, sizeof(rasRanks[rank].addr));
+    rasRanks[rank].pid = getpid();
+    rasRanks[rank].cudaDev = comm->cudaDev;
+    rasRanks[rank].nvmlDev = comm->nvmlDev;
+    if (ncclRasCommInit(comm, rasRanks+rank) != ncclSuccess) {
+      INFO(NCCL_INIT|NCCL_RAS, "Continuing in spite of a RAS initialization error");
+      // We should still participate in the ringAllInfo below as the peers will be waiting for us.
+      // Just make sure that the address is clearly invalid...
+      memset(rasRanks+rank, '\0', sizeof(*rasRanks));
+      performRasAddRanks = false;
+    }
+  }
+
   BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_TIME_RING]);
-  NCCLCHECK(ringAllInfo(comm, state, state->peerP2pAddresses, state->peerProxyAddresses, state->peerProxyAddressesUDS));
+  NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, state->peerProxyAddresses, state->peerProxyAddressesUDS, rasRanks), result, fail);
   BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_TIME_RING]);
 
   // Create the service proxy and get the UDS
-  NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS));
+  NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS), result, fail);
+
+  if (ncclParamRasEnable() == 1 && performRasAddRanks) {
+    if (ncclRasAddRanks(rasRanks, nranks) != ncclSuccess)
+      INFO(NCCL_INIT|NCCL_RAS, "Continuing in spite of a RAS initialization error");
+  }
 
   BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_TIME_TOTAL]);
   TRACE(NCCL_BOOTSTRAP, "rank %d nranks %d - DONE", rank, nranks);
@@ -722,8 +765,11 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
        timers[BOOTSTRAP_INIT_TIME_RECV] / 1e9,
        timers[BOOTSTRAP_INIT_TIME_RING] / 1e9,
        timers[BOOTSTRAP_INIT_TIME_DELAY] / 1e9);
-
-  return ncclSuccess;
+exit:
+  return result;
+fail:
+  free(proxySocket);
+  goto exit;
 }
 
 ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks) {
@@ -761,6 +807,11 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo
   union ncclSocketAddress peerSocketAddress;
   NCCLCHECK(createListenSocket(comm, comm->magic, &STATE_LISTEN(state, peerSocket), &peerSocketAddress, ncclSocketTypeBootstrap));
 
+  if (ncclParamRasEnable() == 1) {
+    if (ncclRasCommInit(comm, nullptr) != ncclSuccess)
+      INFO(NCCL_INIT|NCCL_RAS, "Continuing in spite of a RAS initialization error");
+  }
+
   // Get addr from next rank using the parent's connections
   NCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, BOOTSTRAP_TAG_COMMSPLIT, &info, sizeof(union ringConnectInfo)), ret, fail);
   NCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, BOOTSTRAP_TAG_COMMSPLIT, &nextPeer, sizeof(union ringConnectInfo)), ret, fail);
@@ -773,14 +824,14 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo
     NCCLCHECK(socketRingConnect(&nextPeer.addr, &STATE_RING(state, socket.send), &STATE_LISTEN(state, socket), &STATE_RING(state, socket.recv), comm->magic, state->abortFlag));
   }
 
-  NCCLCHECKGOTO(ncclCalloc(&state->peerP2pAddresses, nranks * sizeof(union ncclSocketAddress)), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&state->peerP2pAddresses, nranks), ret, fail);
   memcpy(state->peerP2pAddresses + rank, &peerSocketAddress, sizeof(union ncclSocketAddress));
   if (parent->config.splitShare) {
     /* map local rank to top parent local rank. */
     for (int i = 0; i < nranks; ++i) {
       comm->topParentRanks[i] = parent->topParentRanks[parentRanks[i]];
     }
-    NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, NULL, NULL), ret, fail);
+    NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, NULL, NULL, NULL), ret, fail);
   } else {
     NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddresses, nranks), ret, fail);
     NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddressesUDS, nranks), ret, fail);
@@ -788,7 +839,7 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo
     NCCLCHECKGOTO(ncclCalloc(&proxySocket, 1), ret, fail);
     NCCLCHECKGOTO(getUDS(state->peerProxyAddressesUDS + rank), ret, fail);
     NCCLCHECKGOTO(createListenSocket(comm, comm->magic, proxySocket, state->peerProxyAddresses + rank, ncclSocketTypeProxy), ret, fail);
-    NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, state->peerProxyAddresses, state->peerProxyAddressesUDS), ret, fail);
+    NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, state->peerProxyAddresses, state->peerProxyAddressesUDS, NULL), ret, fail);
     NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS), ret, fail);
   }
 
@@ -811,7 +862,7 @@ static ncclResult_t socketConnect(void* commState, int peer, int tag, struct ncc
   struct bootstrapState* state = (struct bootstrapState*)commState;
 
   struct socketAckInfo ack = (struct socketAckInfo){.rank = state->rank, .tag = tag};
-  NCCLCHECKGOTO(ncclSocketInit(sock, state->peerP2pAddresses + peer, state->magic, ncclSocketTypeBootstrap), ret, fail);
+  NCCLCHECKGOTO(ncclSocketInit(sock, state->peerP2pAddresses + peer, state->magic, ncclSocketTypeBootstrap, state->abortFlag), ret, fail);
   NCCLCHECKGOTO(ncclSocketConnect(sock), ret, fail);
   NCCLCHECKGOTO(socketSend(sock, &ack, sizeof(struct socketAckInfo)), ret, fail);
   return ncclSuccess;
diff --git a/src/collectives.cc b/src/collectives.cc
index be9468d49b..479d4c511b 100644
--- a/src/collectives.cc
+++ b/src/collectives.cc
@@ -44,9 +44,9 @@ const char* ncclDatatypeToString(ncclDataType_t type) {
   case ncclFloat16: return "ncclFloat16";
   case ncclFloat32: return "ncclFloat32";
   case ncclFloat64: return "ncclFloat64";
-#if defined(__CUDA_BF16_TYPES_EXIST__)
   case ncclBfloat16: return "ncclBfloat16";
-#endif
+  case ncclFloat8e4m3: return "ncclFloat8e4m3";
+  case ncclFloat8e5m2: return "ncclFloat8e5m2";
   default: return "Unknown";
   }
 }
@@ -87,8 +87,7 @@ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcoun
   struct ncclInfo info = { ncclFuncAllGather, "AllGather",
     sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
     ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
-  NCCLCHECK(ncclEnqueueCheck(&info));
-  return ncclSuccess;
+  return ncclEnqueueCheck(&info);
 }
 
 NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
@@ -111,8 +110,7 @@ ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
   struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
     sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
     ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
-  NCCLCHECK(ncclEnqueueCheck(&info));
-  return ncclSuccess;
+  return ncclEnqueueCheck(&info);
 }
 
 NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
@@ -133,16 +131,14 @@ ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, n
   struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
     sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
     BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
-  NCCLCHECK(ncclEnqueueCheck(&info));
-  return ncclSuccess;
+  return ncclEnqueueCheck(&info);
 }
 /* Deprecated original "in place" function, similar to MPI */
 NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
     ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
     ncclComm_t comm, cudaStream_t stream) {
-  NCCLCHECK(ncclBroadcast(buff, buff, count, datatype, root, comm, stream));
-  return ncclSuccess;
+  return ncclBroadcast(buff, buff, count, datatype, root, comm, stream);
 }
 
 NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
@@ -166,8 +162,7 @@ ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
   struct ncclInfo info = { ncclFuncReduce, "Reduce",
     sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
     REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
-  NCCLCHECK(ncclEnqueueCheck(&info));
-  return ncclSuccess;
+  return ncclEnqueueCheck(&info);
 }
 
 NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
@@ -189,8 +184,7 @@ ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recv
   struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
     sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
     REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
-  NCCLCHECK(ncclEnqueueCheck(&info));
-  return ncclSuccess;
+  return ncclEnqueueCheck(&info);
 }
 
 struct NvtxParamsSendRecv {
@@ -212,12 +206,7 @@ ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatyp
   struct ncclInfo info = { ncclFuncSend, "Send",
     NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
     1, 1 };
-  ncclResult_t ret;
-  NCCLCHECK(ncclGroupStart());
-  NCCLCHECKGOTO(ncclEnqueueCheck(&info), ret, exit);
-exit:
-  NCCLCHECK(ncclGroupEnd());
-  return ret;
+  return ncclEnqueueCheck(&info);
 }
 
 NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
@@ -230,10 +219,5 @@ ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int
   struct ncclInfo info = { ncclFuncRecv, "Recv",
     NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
     1, 1 };
-  ncclResult_t ret;
-  NCCLCHECK(ncclGroupStart());
-  NCCLCHECKGOTO(ncclEnqueueCheck(&info), ret, exit);
-exit:
-  NCCLCHECK(ncclGroupEnd());
-  return ret;
+  return ncclEnqueueCheck(&info);
 }
diff --git a/src/debug.cc b/src/debug.cc
index d21ea3d12e..2ea6eabdee 100644
--- a/src/debug.cc
+++ b/src/debug.cc
@@ -8,6 +8,7 @@
 #include "nccl_net.h"
 #include <stdlib.h>
 #include <stdarg.h>
+#include <stdio.h>
 #include <string.h>
 #include <strings.h>
 #include <sys/syscall.h>
@@ -89,6 +90,8 @@ static void ncclDebugInit() {
         mask = NCCL_REG;
       } else if (strcasecmp(subsys, "PROFILE") == 0) {
         mask = NCCL_PROFILE;
+      } else if (strcasecmp(subsys, "RAS") == 0) {
+        mask = NCCL_RAS;
       } else if (strcasecmp(subsys, "ALL") == 0) {
         mask = NCCL_ALL;
       }
@@ -224,6 +227,19 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
   }
 }
 
+NCCL_API(void, ncclResetDebugInit);
+void ncclResetDebugInit() {
+  // Cleans up from a previous ncclDebugInit() and reruns.
+  // Use this after changing NCCL_DEBUG and related parameters in the environment.
+  __atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE);
+  if (ncclDebugFile != stdout) {
+    fclose(ncclDebugFile);
+    ncclDebugFile = stdout;
+  }
+  ncclDebugLevel = -1;
+  ncclDebugInit();
+}
+
 NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0);
 
 void ncclSetThreadName(pthread_t thread, const char *fmt, ...) {
diff --git a/src/device/all_gather.h b/src/device/all_gather.h
index fb56e483b6..5d79d73572 100644
--- a/src/device/all_gather.h
+++ b/src/device/all_gather.h
@@ -9,64 +9,88 @@
 #include "primitives.h"
 
 namespace {
-  template<typename T, typename RedOp, typename Proto>
+  template<typename T, typename RedOp, typename Proto, bool isNetOffload = false>
   __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
     ncclRing *ring = &ncclShmem.channel.ring;
     const int *ringRanks = ring->userRanks;
     const int nranks = ncclShmem.comm.nRanks;
-    size_t count, partOffset, partCount, chunkCount;
+    ssize_t count, partOffset, partCount, chunkCount;
     ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &partOffset, &partCount, &chunkCount);
-    size_t offset;
-    size_t dataOffset;
+    ssize_t offset;
+    ssize_t dataOffset;
     int nelem;
     int rankDest;
-
+    int workNthreads;
     T *inputBuf = (T*)work->sendbuff;
     T *outputBuf = (T*)work->recvbuff;
-    // Coverity reports that the callee treats &ring->next as an array.  However, due to the use of
-    // FanSymmetric<1>, only the first element is ever accessed, so it's fine.
-    // coverity[callee_ptr_arith:FALSE]
-    Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0> prims
-      (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work);
 
-    for (size_t elemOffset = 0; elemOffset < partCount; elemOffset += chunkCount) {
-      /////////////// begin AllGather steps ///////////////
-      nelem = min(chunkCount, partCount - elemOffset);
-      dataOffset = partOffset + elemOffset;
+    // If isNetOffload == true, we only use 1 warp to drive Ring algo/network communication
+    // and the rest of warps proceed to copy src data into dst buffer in parallel when AG
+    // is not in-place.
+    if (isNetOffload) {
+      workNthreads = WARP_SIZE;
+      chunkCount = NCCL_MAX_NET_SIZE;
+    } else {
+      workNthreads = nthreads;
+    }
 
-      // step 0: push data to next GPU
-      rankDest = ringRanks[0];
-      offset = dataOffset + rankDest * count;
+    if (tid < workNthreads) {
+      // Coverity reports that the callee treats &ring->next as an array.  However, due to the use of
+      // FanSymmetric<1>, only the first element is ever accessed, so it's fine.
+      // coverity[callee_ptr_arith:FALSE]
+      Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0, isNetOffload> prims
+        (tid, workNthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work, NULL, isNetOffload ? NCCL_MAX_NET_SIZE : 0);
+      for (size_t elemOffset = 0; elemOffset < partCount; elemOffset += chunkCount) {
+        /////////////// begin AllGather steps ///////////////
+        nelem = min(chunkCount, partCount - elemOffset);
+        dataOffset = partOffset + elemOffset;
 
-      if (inputBuf + dataOffset == outputBuf + offset) { // In place
-        prims.directSend(dataOffset, offset, nelem);
-      } else {
-        prims.directCopySend(dataOffset, offset, nelem);
-      }
-
-      // k-2 steps: copy to next GPU
-      for (int j=1; j<nranks-1; ++j) {
-        rankDest = ringRanks[nranks-j];
+        // step 0: push data to next GPU
+        rankDest = ringRanks[0];
         offset = dataOffset + rankDest * count;
 
-        prims.directRecvCopyDirectSend(offset, nelem);
+        if ((inputBuf + dataOffset == outputBuf + offset) || isNetOffload) { // In place or onePPN
+          prims.directSend(dataOffset, offset, nelem);
+        } else {
+          prims.directCopySend(dataOffset, offset, nelem);
+        }
+
+        // k-2 steps: copy to next GPU
+        for (int j = 1; j < nranks - 1; ++j) {
+          rankDest = ringRanks[nranks - j];
+          offset = dataOffset + rankDest * count;
+          prims.directRecvCopyDirectSend(offset, offset, nelem);
+        }
+
+        // Make final copy from buffer to dest.
+        rankDest = ringRanks[1];
+        offset = dataOffset + rankDest * count;
+
+        // Final wait/copy.
+        prims.directRecv(offset, offset, nelem);
       }
-
-      // Make final copy from buffer to dest.
-      rankDest = ringRanks[1];
-      offset = dataOffset + rankDest * count;
-
-      // Final wait/copy.
-      prims.directRecv(offset, offset, nelem);
+    } else if (inputBuf != outputBuf + ringRanks[0] * count) {
+      inputBuf = inputBuf + partOffset;
+      outputBuf = outputBuf + partOffset + ringRanks[0] * count;
+      reduceCopy<COLL_UNROLL, RedOp, T, 0, 1, 1, 0, 1, 1, /*PreOpSrcs=*/0>
+        (tid - workNthreads, nthreads - workNthreads, work->redOpArg, &work->redOpArg, false, 1, (void**)&inputBuf, 1, (void**)&outputBuf, partCount);
     }
+    // we have to wait for all warps before we can proceed to the next work;
+    // otherwise, we can have contention if next work will use the outputBuf
+    // in this work. We use bar 14 to avoid conflicts with prims barrier and
+    // __syncthread().
+    if (isNetOffload) barrier_sync(14, nthreads);
   }
 }
 
 template<typename T, typename RedOp>
 struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
   __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
-    using Proto = ProtoSimple<ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS>;
-    runRing<T, RedOp, Proto>(tid, nthreads, work);
+    bool isNetOffload = work->isOneRPN && work->netRegUsed;
+    if (isNetOffload)
+      runRing<T, RedOp, ProtoSimple<1, 1>, true>(tid, nthreads, work);
+    else
+      runRing<T, RedOp, ProtoSimple<ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS>, false>(tid, nthreads, work);
   }
 };
 
@@ -96,7 +120,7 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE
     T *inputBuf = (T*)work->sendbuff;
     T *outputBuf = (T*)work->recvbuff;
     Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
-      (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, false, false, 0, primsModePatAg);
+      (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, nullptr, 0, primsModePatAg);
 
     PatAGAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
     int last = 0;
@@ -137,6 +161,7 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
           nelem = min(chunkCount, channelCount - elemOffset);
           prims.gather(offset, nvls->nHeads * count, nelem, count, -1, 0);
         }
+        // coverity[overrun-call] => Coverity think prims.index can be greater than 1
       } else if (tid < tidEndBcast) {
         // Bcast through NVLS
         using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
@@ -148,6 +173,7 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
           nelem = min(chunkCount, channelCount - elemOffset);
           prims.send(offset, nelem);
         }
+        // coverity[overrun-call] => Coverity thinks prims.index can be greater than 1
       }
     } else {
       /* direct allgather */
@@ -204,11 +230,11 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
       int part = ncclShmem.channelId - work->channelLo;
       char* inbuf = (char*)work->sendbuff;
       char* outbuf = (char*)work->recvbuff;
-      ssize_t sizePerRank = work->collnet.count*sizeof(T);
-      bool inPlace = (inbuf == outbuf + ncclShmem.comm.rank*sizePerRank);
+      ssize_t countPerRank = work->collnet.count*sizeof(T);
+      bool inPlace = (inbuf == outbuf + ncclShmem.comm.rank*countPerRank);
 
-      ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*sizePerRank);
-      ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank);
+      ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*countPerRank);
+      ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*countPerRank);
       int railAllSize = railAllEnd - railAllBeg;
       if (tid < nDsts) dstSizes[tid] = railAllSize;
 
@@ -221,15 +247,15 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
         if (rail == nRails) rail = 0;
       }
       do {
-        int node = railAllBeg/sizePerRank;
+        int node = railAllBeg/countPerRank;
         int railAllOffset = 0;
         while (railAllOffset < railAllSize) {
-          ssize_t railOneBeg = node*sizePerRank;
-          ssize_t railOneEnd = railOneBeg + sizePerRank;
+          ssize_t railOneBeg = node*countPerRank;
+          ssize_t railOneEnd = railOneBeg + countPerRank;
           ssize_t railOneOffset = (railAllBeg+railAllOffset) - railOneBeg;
           int delta = min(railAllEnd, railOneEnd) - (railAllBeg+railAllOffset);
           int rank = ncclShmem.comm.collNetDenseToUserRank[node*nRails + rail];
-          ssize_t userOneBeg = rank*sizePerRank + railOneOffset;
+          ssize_t userOneBeg = rank*countPerRank + railOneOffset;
           int outIsDst = (inPlace && rank == ncclShmem.comm.rank) ? 0 : 1;
           if (nSrcs != 0 && outIsDst+nDsts != 0) {
             reduceCopy<ncclCollUnroll(), RedOp, T,
@@ -238,11 +264,11 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
                      /*PreOpSrcs=*/0>
             (tid, tn, 0, nullptr, false,
              /*nSrcs=*/1, [=]__device__(int s/*==0*/) -> void* {
-               return work->regUsed && (recvDirectFlag & NCCL_DIRECT_READ) ? (char*)srcPtrs[src] + userOneBeg : (char*)srcPtrs[src] + railAllOffset;
+               return work->regUsed && (recvDirectFlag & NCCL_P2P_READ) ? (char*)srcPtrs[src] + userOneBeg : (char*)srcPtrs[src] + railAllOffset;
              },
              /*nDsts=*/outIsDst+nDsts, [=]__device__(int d) -> void* {
                return d < outIsDst ? outbuf + userOneBeg
-                                   : work->regUsed && (sendDirectFlag & NCCL_DIRECT_WRITE) ? (char*)dstPtrs[d-outIsDst] + userOneBeg
+                                   : work->regUsed && (sendDirectFlag & NCCL_P2P_WRITE) ? (char*)dstPtrs[d-outIsDst] + userOneBeg
                                    : (char*)dstPtrs[d-outIsDst] + railAllOffset;
              },
              delta);
@@ -262,8 +288,9 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
     const int nChannels = work->channelHi - work->channelLo + 1;
     struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
     int const &nNodes = ncclShmem.comm.nNodes;
-    ssize_t sizePerRank = work->collnet.count*sizeof(T);
+    ssize_t countPerRank = work->collnet.count;
     size_t chunkSize = work->collnet.chunkCount;
+    const int hasDn = (direct->down[0] >= 0) ? 1 : 0;
     bool isMultiRail = (direct->nHeads > 1);
     int nWarps1 = 1;
     int nWarps2 = (isMultiRail ? 2 : 1);
@@ -277,9 +304,12 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
 
     int tn = nWarps1*WARP_SIZE;
     if (tid < tn) {
-      if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
+      if (work->netRegUsed) {
         if (tid == 0) {
-          int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
+          // If this rank has local peers (i.e., hasDn != 0), we cannot offload all data to the network.
+          // In that case, steps is the number of (nChannels * chunkSize)-sized loop iterations needed
+          // to cover nNodes * countPerRank; otherwise, we just bump the step by 1 to kick off collnet progress.
+          int steps = hasDn ? (int)divUp(nNodes * countPerRank, nChannels * chunkSize) : 1;
           Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps);
         }
         __syncwarp();
@@ -288,11 +318,11 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
         Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
           prims(tid, tn, nullptr, &direct->out, work->sendbuff, nullptr,
             /*redOpArg=*/0, 0 * Proto::MaxGroupWidth, 1, 1);
-        for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
+        for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkSize) {
           ssize_t railAllBeg = railGridOffset + part * chunkSize;
-          ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * sizePerRank);
-          ssize_t railOneBeg = ncclShmem.comm.node * sizePerRank;
-          ssize_t railOneEnd = railOneBeg + sizePerRank;
+          ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * countPerRank);
+          ssize_t railOneBeg = ncclShmem.comm.node * countPerRank;
+          ssize_t railOneEnd = railOneBeg + countPerRank;
           ssize_t beg = max(railAllBeg, railOneBeg);
           ssize_t end = min(railAllEnd, railOneEnd);
           prims.send(beg - railOneBeg, max(ssize_t(0), end - beg));
@@ -304,10 +334,9 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
 
     tn = nWarps2*WARP_SIZE;
     if (tid < tn) {
-      if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
+      if (work->netRegUsed && !hasDn) {
         if (tid == 0) {
-          int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
-          Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps);
+          Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, 1);
         }
         __syncwarp();
       } else {
@@ -315,7 +344,7 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
         Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
           prims(tid, tn, &direct->out, direct->heads + 1, nullptr, work->recvbuff,
             /*redOpArg=*/0, 1 * Proto::MaxGroupWidth, 0, 0, work);
-        for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
+        for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkSize) {
           Scatterer</*BcastSendNotRecv=*/true> scat;
           scat.work = work;
           scat.chunkSize = chunkSize;
@@ -333,7 +362,7 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
       Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/1, Proto, 0>
         prims(tid, tn, direct->heads+1, nullptr, nullptr, work->recvbuff,
               /*redOpArg=*/0, 2*Proto::MaxGroupWidth, 0, 0, work);
-      for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
+      for (ssize_t railGridOffset=0; railGridOffset < nNodes*countPerRank; railGridOffset += nChannels*chunkSize) {
         Scatterer</*BcastSendNotRecv=*/false> scat;
         scat.work = work;
         scat.chunkSize = chunkSize;
diff --git a/src/device/all_reduce.h b/src/device/all_reduce.h
index 36b8d32066..c6c131517e 100644
--- a/src/device/all_reduce.h
+++ b/src/device/all_reduce.h
@@ -69,7 +69,7 @@ namespace {
         chunkOffset = chunk * chunkCount;
         offset = gridOffset + elemOffset + chunkOffset;
         nelem = (int)min(chunkCount, remCount - chunkOffset);
-        prims.directRecvCopyDirectSend(offset, nelem);
+        prims.directRecvCopyDirectSend(offset, offset, nelem);
       }
 
       // Make final copy from buffer to dest.
@@ -139,7 +139,7 @@ namespace {
         for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
           offset = gridOffset + elemOffset;
           nelem = min(chunkCount, channelCount - elemOffset);
-          prims.directRecvCopyDirectSend(offset, nelem);
+          prims.directRecvCopyDirectSend(offset, offset, nelem);
         }
       }
     }
@@ -222,7 +222,7 @@ namespace {
         for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
           offset = gridOffset + elemOffset;
           nelem = min(chunkCount, channelCount - elemOffset);
-          prims.directRecvCopyDirectSend(offset, nelem);
+          prims.directRecvCopyDirectSend(offset, offset, nelem);
         }
       }
     }
@@ -268,22 +268,30 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
     const int tidStartBcast = nThreadsGather;
     const int tidStartScatter = tidStartBcast + nThreadsBcast;
     const int tidStartReduce = tidStartScatter + nThreadsScatter;
-
     using Proto = ProtoSimple<1, 1>;
 
     if (tid >= tidStartScatter && tid < tidStartReduce && hasUp) {
       // Scatter
-      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
+      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
         prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, work->sendbuff, work->recvbuff,
-           work->redOpArg, 2*Proto::MaxGroupWidth, 1, 1);
+           work->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, work);
+      ssize_t offsetBase, peerOffset;
+      ssize_t maxNelems;
+      if (work->netRegUsed) {
+        offsetBase = bid * chunkSize;
+        maxNelems = size;  // never the limiting term in min(maxNelems, size - offset)
+        peerOffset = nChannels * chunkSize;
+      } else {
+        offsetBase = bid * direct->nHeads * chunkSize;
+        maxNelems = direct->nHeads * chunkSize;
+        peerOffset = chunkSize;
+      }
+      // For collnet UB case, we need to organize buffers differently for contiguous buffer access
+      // across channels. This access pattern should be consistent with code in coll_net.cc
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
-        int nelem = min(direct->nHeads*chunkSize, size-offset);
-        if (work->regUsed) {
-          prims.directScatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
-        } else {
-          prims.scatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
-        }
+        ssize_t offset = gridOffset + offsetBase;
+        ssize_t nelem = min(maxNelems, size - offset);
+        prims.scatter(offset, nelem, chunkSize, peerOffset, direct->headRank, direct->shift);
       }
       // Coverity complains about a possible overrun inside the destructor of "prims", but that's actually
       // a false positive.
@@ -291,24 +299,20 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
     } else if (tid >= tidStartReduce && direct->out != -1) {
       if (hasDn) {
         // Reduce, send to network
-        Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
+        Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/1, Proto, 0>
           prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, work->sendbuff, work->recvbuff,
-             work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
+             work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, work);
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
-          int nelem = min(chunkSize, size-offset);
-          if (work->regUsed) {
-            prims.directRecvReduceSend(offset, nelem);
-          } else {
-            prims.recvReduceSend(offset, nelem);
-          }
+          ssize_t offset = work->netRegUsed ? gridOffset + (bid + direct->headRank * nChannels) * chunkSize
+                                    : gridOffset + (bid * direct->nHeads + direct->headRank) * chunkSize;
+          int nelem = min(chunkSize, size - offset);
+          prims.recvReduceDirectSend(offset, offset, nelem);
         }
       } else {
         // Directly send to network
-        if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
+        if (work->netRegUsed) {
           if (tid == tidStartReduce) {
-            int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
-            Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps);
+            Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, 1);
           }
           __syncwarp();
         } else {
@@ -316,8 +320,8 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
           prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, work->sendbuff, work->recvbuff,
              work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
           for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-            ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
-            int nelem = min(chunkSize, size-offset);
+            ssize_t offset = gridOffset + (bid * direct->nHeads + direct->headRank) * chunkSize;
+            int nelem = min(chunkSize, size - offset);
             prims.send(offset, nelem);
           }
         }
@@ -327,10 +331,21 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
       Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/1, Proto, 0>
         prims(tid, nThreadsGather, direct->up, NULL, work->sendbuff, work->recvbuff,
            work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, work);
+      ssize_t offsetBase, peerOffset;
+      ssize_t maxNelems;
+      if (work->netRegUsed) {
+        offsetBase = bid * chunkSize;
+        maxNelems = size;  // never the limiting term in min(maxNelems, size - offset)
+        peerOffset = nChannels * chunkSize;
+      } else {
+        offsetBase = bid * direct->nHeads * chunkSize;
+        maxNelems = direct->nHeads * chunkSize;
+        peerOffset = chunkSize;
+      }
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
-        int nelem = min(direct->nHeads*chunkSize, size-offset);
-        prims.directGather(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
+        ssize_t offset = gridOffset + offsetBase;
+        ssize_t nelem = min(maxNelems, size - offset);
+        prims.directGather(offset, nelem, chunkSize, peerOffset, direct->headRank, direct->shift);
       }
     } else if (tid >= tidStartBcast && tid < tidStartScatter && direct->out != -1) {
       if (hasDn) {
@@ -342,15 +357,15 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
           prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, work->sendbuff, work->recvbuff,
              work->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, work);
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
-          int nelem = min(chunkSize, size-offset);
-          prims.recvCopyDirectSend(offset, nelem, /*postOp=*/true);
+          ssize_t offset = work->netRegUsed ? gridOffset + (bid + direct->headRank * nChannels) * chunkSize
+                                            : gridOffset + (bid * direct->nHeads + direct->headRank) * chunkSize;
+          int nelem = min(chunkSize, size - offset);
+          prims.directRecvCopyDirectSend(offset, offset, nelem, /*postOp=*/true);
         }
       } else {
-        if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
+        if (work->netRegUsed) {
           if (tid == tidStartBcast) {
-            int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
-            Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps);
+            Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, 1);
           }
           __syncwarp();
         } else {
@@ -394,8 +409,6 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
       ssize_t gridOffset, channelCount, chunkSize;
       ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkSize);
       const ssize_t loopCount = nvls->nHeads * chunkSize;
-      ssize_t offset;
-      int nelem;
       int remCount = channelCount%(nvls->nHeads*chunkSize);
       int lastChunkSize = alignUp(divUp(remCount, nvls->nHeads), 16384/sizeof(T));
 
@@ -407,8 +420,8 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
             work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
         for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
           if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize;
-          offset = gridOffset + elemOffset;
-          nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
+          ssize_t offset = gridOffset + elemOffset;
+          int nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
           prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
         }
       } else if (tid < tidEndGather) {
@@ -419,8 +432,8 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
             work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
         for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
           if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize;
-          offset = gridOffset + elemOffset;
-          nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
+          ssize_t offset = gridOffset + elemOffset;
+          int nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
           prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
         }
       } else if (tid < tidEndReduce) {
@@ -430,7 +443,8 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
           prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
             work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work);
         for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
-          ssize_t chunkOffset;
+          ssize_t chunkOffset, offset;
+          int nelem;
           if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize;
           chunkOffset = elemOffset + nvls->headRank * chunkSize;
           offset = gridOffset + chunkOffset;
@@ -456,6 +470,7 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
           int nelem = work->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
           prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
         }
+        // coverity[overrun-call] => Coverity thinks prims.index can be greater than 1
       } else if (tid < tidEndGather) {
         // Gather
         using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
@@ -464,38 +479,23 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
             work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
           ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
-          int nelem = work->regUsed ? 0 :min(nvls->nHeads * chunkSize, size - offset);
+          int nelem = work->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
           prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
         }
       } else if (tid < tidEndReduce && nvls->headRank != -1) {
-        if (!hasOut) {
-          // Reduce, broadcast through NVLS
-          using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
-          // Coverity complains about a possible overrun inside the class below, but that's actually
-          // a false positive.
-          // coverity[identity_transfer:FALSE]
-          Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
-            prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
-              work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work);
-          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-            ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
-            int nelem = min(chunkSize, size - offset);
-            prims.directRecvDirectSend(offset, offset, nelem);
-          }
-        } else {
-          // Reduce, send to network
-          using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
-          // Coverity complains about a possible overrun inside the class below, but that's actually
-          // a false positive.
-          // coverity[identity_transfer:FALSE]
-          Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
-            prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL,
-              work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, work);
-          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-            ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
-            int nelem = min(chunkSize, size - offset);
-            prims.directRecvDirectSend(offset, offset, nelem);
-          }
+        // Reduce, send to network
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
+        // Coverity complains about a possible overrun inside the class below, but that's actually
+        // a false positive.
+        // coverity[identity_transfer:FALSE]
+        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
+        prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, work->recvbuff,
+          work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, work);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          ssize_t offset = work->regUsed && work->netRegUsed ? gridOffset + (nvls->headRank * nChannels + bid) * chunkSize
+                                                             : gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
+          int nelem = min(chunkSize, size - offset);
+          prims.directRecvDirectSend(offset, offset, nelem);
         }
       } else if (tid < tidEndBcast && nvls->headRank != -1) {
         // Recv from network, broadcast
@@ -504,10 +504,11 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
         // a false positive.
         // coverity[identity_transfer:FALSE]
         Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
-          prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL,
+          prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, work->recvbuff,
             work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work);
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
+          ssize_t offset = work->regUsed && work->netRegUsed ? gridOffset + (nvls->headRank * nChannels + bid) * chunkSize
+                                                             : gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
           int nelem = min(chunkSize, size - offset);
           prims.directRecvDirectSend(offset, offset, nelem);
         }
@@ -660,10 +661,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PR
 
     if (tid < nthreadsSplit) {
       if (recv == -1) {
-        if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
+        if (work->netRegUsed) {
           if (groupTid == 0) {
-            int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
-            Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>::sendPeerNotify(send, connIndex, steps);
+            Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>::sendPeerNotify(send, connIndex, 1);
           }
           __syncwarp();
         } else {
@@ -673,8 +673,10 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PR
           for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
             ssize_t offset = gridOffset + bid * int(chunkSize);
             int nelem = min(chunkSize, size - offset);
+            // coverity[overrun-call] => Coverity thinks prims.index can be greater than 1
             prims.directSend(offset, offset, nelem);
           }
+          // coverity[overrun-call] => Coverity thinks prims.index can be greater than 1
         }
       } else {
         Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
@@ -683,18 +685,19 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PR
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
           ssize_t offset = gridOffset + bid * int(chunkSize);
           int nelem = min(chunkSize, size - offset);
+          // coverity[overrun-call] => Coverity thinks prims.index can be greater than 1
           prims.directRecvReduceDirectSend(offset, offset, nelem);
         }
+        // coverity[overrun-call] => Coverity thinks prims.index can be greater than 1
       }
     }
     else {
       if (recv == nranks) {
         // I'm the first in the broadcast chain, I need to perform the division (postOp)
         if (send == -1) {
-          if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
+          if (work->netRegUsed) {
             if (groupTid == 0) {
-              int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
-              Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>::recvPeerNotify(recv, connIndex, steps);
+              Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>::recvPeerNotify(recv, connIndex, 1);
             }
             __syncwarp();
           } else {
@@ -720,7 +723,7 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PR
           for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
             ssize_t offset = gridOffset + bid * int(chunkSize);
             int nelem = min(chunkSize, size - offset);
-            prims.directRecvCopyDirectSend(offset, nelem, /*postOp*/true);
+            prims.directRecvCopyDirectSend(offset, offset, nelem, /*postOp*/true);
           }
         }
       } else {
@@ -740,7 +743,7 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PR
           for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
             ssize_t offset = gridOffset + bid*int(chunkSize);
             int nelem = min(chunkSize, size-offset);
-            prims.directRecvCopyDirectSend(offset, nelem);
+            prims.directRecvCopyDirectSend(offset, offset, nelem);
           }
         }
       }
diff --git a/src/device/broadcast.h b/src/device/broadcast.h
index 851b01d948..017d379ada 100644
--- a/src/device/broadcast.h
+++ b/src/device/broadcast.h
@@ -15,37 +15,49 @@ namespace {
     const int rank = ring->userRanks[0];
     const int nextRank = ring->userRanks[1];
     const int root = work->root;
-    size_t chunkCount;
-    size_t channelCount;
-    size_t gridOffset;
-    ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
+    ssize_t chunkCount;
+    ssize_t channelCount;
+    ssize_t gridOffset;
+    ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
     size_t offset;
     int nelem;
+    int workNthreads;
+    bool isNetOffload = work->isOneRPN && work->netRegUsed;
 
     T *inputBuf = (T*)work->sendbuff;
     T *outputBuf = (T*)work->recvbuff;
-    // Coverity reports that the callee treats &ring->next as an array.  However, due to the use of
-    // FanSymmetric<1>, only the first element is ever accessed, so it's fine.
-    // coverity[callee_ptr_arith:FALSE]
-    Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0>
-      prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work);
+    workNthreads = isNetOffload ? WARP_SIZE : nthreads;
 
-    for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
-      offset = gridOffset + elemOffset;
-      nelem = min(chunkCount, channelCount - elemOffset);
+    if (tid < workNthreads) {
+      // Coverity reports that the callee treats &ring->next as an array.  However, due to the use of
+      // FanSymmetric<1>, only the first element is ever accessed, so it's fine.
+      // coverity[callee_ptr_arith:FALSE]
+      Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0>
+        prims(tid, workNthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work);
 
-      if (rank == root) {
-        if (inputBuf == outputBuf) {
-          prims.directSend(offset, offset, nelem);
+      for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
+        offset = gridOffset + elemOffset;
+        nelem = min(chunkCount, channelCount - elemOffset);
+
+        if (rank == root) {
+          if (inputBuf == outputBuf || isNetOffload) {
+            prims.directSend(offset, offset, nelem);
+          } else {
+            prims.directCopySend(offset, offset, nelem);
+          }
+        } else if (nextRank == root) {
+          prims.directRecv(offset, offset, nelem);
         } else {
-          prims.directCopySend(offset, offset, nelem);
+          prims.directRecvCopyDirectSend(offset, offset, nelem);
         }
-      } else if (nextRank == root) {
-        prims.directRecv(offset, offset, nelem);
-      } else {
-        prims.directRecvCopyDirectSend(offset, nelem);
       }
+    } else if (inputBuf != outputBuf && rank == root) {
+      inputBuf = inputBuf + gridOffset;
+      outputBuf = outputBuf + gridOffset;
+      reduceCopy<COLL_UNROLL, RedOp, T, 0, 1, 1, 0, 1, 1, /*PreOpSrcs=*/0>
+        (tid - workNthreads, nthreads - workNthreads, work->redOpArg, &work->redOpArg, false, 1, (void**)&inputBuf, 1, (void**)&outputBuf, channelCount);
     }
+    if (isNetOffload) barrier_sync(14, nthreads);
   }
 }
 
diff --git a/src/device/common.h b/src/device/common.h
index 967421b7dc..05465ff5a6 100644
--- a/src/device/common.h
+++ b/src/device/common.h
@@ -396,6 +396,9 @@ __device__ void ncclDevFunc_Nop();
     ncclKernelMain<specializedFnId, RunWorkBatch<coll, ty, redop<ty>, algo, proto>>(&args4K.args); \
   }
 
+#define DEFINE_ncclDevKernel_nop(suffix, coll, redop, ty, algo, proto, specializedFnId) \
+  __global__ void ncclDevKernel_##suffix(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {}
+
 #define DEFINE_ncclDevFunc(suffix, coll, redop, ty, algo, proto) \
   __device__ void ncclDevFunc_##suffix() { \
     RunWorkBatch<coll, ty, redop<ty>, algo, proto>().run(); \
diff --git a/src/device/common_kernel.h b/src/device/common_kernel.h
index f932f51f00..00bb1e3334 100644
--- a/src/device/common_kernel.h
+++ b/src/device/common_kernel.h
@@ -65,19 +65,23 @@ __device__ __forceinline__ void reduceCopyPacks(
   uintptr_t minSrcs[MinSrcs + !MinSrcs];
   uintptr_t minDsts[MinDsts + !MinDsts];
   #pragma unroll
-  for (int s=0; s < MinSrcs; s++)
+  for (int s=0; s < MinSrcs; s++) {
     minSrcs[s] = cvta_to_global(srcPtrFn(s)) + threadBytesBehind;
+  }
+
   #pragma unroll
-  for (int d=0; d < MinDsts; d++)
+  for (int d=0; d < MinDsts; d++) {
     // Yes, for some template arguments this code will be unreachable.  That's fine.
     // coverity[dead_error_line]
     minDsts[d] = cvta_to_global(dstPtrFn(d)) + threadBytesBehind;
+  }
 
   // We dictate loop termination condition according to whether partial hunks
   // can be handled or not.
   while (Unroll==1 ? (BytePerPack <= threadBytesAhead) : (0 < nHunksAhead)) {
     BytePack<BytePerPack> acc[Unroll];
 
+    // minSrcs[0] cannot be nullptr so we always process it
     { RedFn preFn(0 < PreOpSrcs ? preOpArgs[0] : 0);
       #pragma unroll Unroll
       for (int u=0; u < Unroll; u++) {
@@ -163,7 +167,8 @@ __device__ __forceinline__ void reduceCopyPacks(
       }
     }
     for (int d=MinDsts; (MinDsts < MaxDsts) && (d < MaxDsts) && (d < nDsts); d++) {
-      uintptr_t dst = cvta_to_global(dstPtrFn(d)) + threadBytesBehind;
+      uintptr_t dstPtr = cvta_to_global(dstPtrFn(d));
+      uintptr_t dst = dstPtr + threadBytesBehind;
       #pragma unroll Unroll
       for (int u=0; u < Unroll; u++) {
         st_global<BytePerPack>(dst, acc[u]);
@@ -173,11 +178,15 @@ __device__ __forceinline__ void reduceCopyPacks(
 
     nWarps = nThreads/WARP_SIZE;
     #pragma unroll
-    for (int s=0; s < MinSrcs; s++) minSrcs[s] += (nWarps-1)*BytePerHunk;
+    for (int s=0; s < MinSrcs; s++) {
+      minSrcs[s] += (nWarps-1)*BytePerHunk;
+    }
     #pragma unroll
     // Yes, for some template arguments this code will be unreachable.  That's fine.
     // coverity[dead_error_line]
-    for (int d=0; d < MinDsts; d++) minDsts[d] += (nWarps-1)*BytePerHunk;
+    for (int d=0; d < MinDsts; d++) {
+      minDsts[d] += (nWarps-1)*BytePerHunk;
+    }
     threadBytesBehind += nWarps*BytePerHunk;
     threadBytesAhead -= nWarps*BytePerHunk;
     nHunksAhead -= nWarps;
diff --git a/src/device/generate.py b/src/device/generate.py
index a0d2259466..b69a2d7cc8 100755
--- a/src/device/generate.py
+++ b/src/device/generate.py
@@ -5,7 +5,7 @@ import sys
 # Order of redops, tys, protos, algos must match src/include/device.h
 all_colls =  ["Broadcast","Reduce","AllGather","ReduceScatter","AllReduce","SendRecv"]
 all_redops = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"]
-all_tys =    ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16"]
+all_tys =    ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16","f8e4m3","f8e5m2"]
 all_protos = ["LL","LL128","SIMPLE"]
 all_algos =  ["TREE","RING","COLLNET_DIRECT","COLLNET_CHAIN","NVLS","NVLS_TREE","PAT"]
 
@@ -107,6 +107,9 @@ def required_cuda(coll, redop, ty, algo, proto):
   if coll in ("AllReduce","Reduce","ReduceScatter"):
     if redop=="SumPostDiv" and ty[0] not in ("i","u"): return None
     if ty=="bf16": cudart = max(cudart, 11000)
+    if ty.startswith("f8"):
+      cudart = max(cudart, 11080)
+      arch = max(arch, 900)
 
   if "NVLS" in algo:
     if coll in ("AllReduce","Reduce","ReduceScatter"):
@@ -125,7 +128,7 @@ def required_cuda(coll, redop, ty, algo, proto):
 def equivalent_primary(coll, redop, ty, algo, proto):
   if coll in ("AllReduce", "Reduce", "ReduceScatter"):
     # map signed integer sum/prod to unsigned
-    if redop in ("Sum","Prod","PreMulSum") and ty[0]=="i":
+    if redop in ("Sum","Prod","PreMulSum","SumPostDiv") and ty[0]=="i":
       return (coll, redop, "u"+ty[1:], algo, proto)
     # map signed integer min/max to unsigned for non-NVLS
     if redop=="MinMax" and ty[0]=="i" and ("NVLS" not in algo):
@@ -365,7 +368,9 @@ ty_to_cxx = {
   "f16": "half",
   "f32": "float",
   "f64": "double",
-  "bf16": "__nv_bfloat16"
+  "bf16": "__nv_bfloat16",
+  "f8e4m3": "__nv_fp8_e4m3",
+  "f8e5m2": "__nv_fp8_e5m2"
 }
 
 # Generate each <gensrc>/<impl>.cu:
@@ -385,15 +390,23 @@ for name in name_to_funcs.keys():
       sym = paste("_", coll, redop, ty, algo, proto)
       fn_id = primary_to_index[kfn]
       cudart, arch = required_cuda(*kfn)
+      s = "DEFINE_ncclDevKernel({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto}, {fn_id})\n"
       if (cudart, arch) != (0, 0):
-        out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch))
-      out(
-        "DEFINE_ncclDevKernel({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto}, {fn_id})\n"
-        .format(sym=sym, coll=coll, redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty],
-                algo=(algo or "RING"), proto=(proto or "SIMPLE"), fn_id=fn_id)
-      )
-      if (cudart, arch) != (0, 0):
-        out("#endif\n")
+        # Add conditional compilation logic around s. If CUDART_VERSION is satisfactory
+        # we must compile a kernel regardless of __CUDA_ARCH__ since the host code has
+        # to link against some stub.
+        s = "#if CUDART_VERSION >= {cudart}\n" \
+            "  #if __CUDA_ARCH__ < {arch}\n" \
+            "    DEFINE_ncclDevKernel_nop({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto}, {fn_id})\n" \
+            "  #else\n" \
+            "    " + s + \
+            "  #endif\n" \
+            "#endif\n"
+      out(s.format(
+        cudart=cudart, arch=arch, sym=sym, coll=coll,
+        redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty],
+        algo=(algo or "RING"), proto=(proto or "SIMPLE"), fn_id=fn_id
+      ))
 
     for fn in fns:
       (coll, redop, ty, algo, proto) = fn
diff --git a/src/device/network/unpack/unpack.h b/src/device/network/unpack/unpack.h
index e76099821c..941b4328df 100644
--- a/src/device/network/unpack/unpack.h
+++ b/src/device/network/unpack/unpack.h
@@ -33,17 +33,21 @@ inline __device__ void load64gpu(const uint64_t* ptr, uint64_t &v) {
 // Map internal association of handle with group and peer index (called once at init time)
 inline __device__ void ncclNetDeviceUnpackSetup(void* ohandle, const int group, const int index) {
   struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle;
+  // coverity[index_parm:FALSE]
   ncclShmem.groups[group].devicePlugin.unpack.g_meta[index] = handle->meta;
   ncclShmem.devicePlugin.unpack.bounce_buf = handle->bounce_buf;
+  // coverity[index_parm:FALSE]
   ncclShmem.groups[group].devicePlugin.unpack.head[index] = handle->head;
 }
 
 inline __device__ void ncclNetDeviceIncrementHead(const int group, const int index) {
+  // coverity[index_parm:FALSE]
   ncclShmem.groups[group].devicePlugin.unpack.head[index]++;
 }
 
 inline __device__ void ncclNetDeviceSaveHead(void* ohandle, const int group, const int index) {
   struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle;
+  // coverity[index_parm:FALSE]
   handle->head = ncclShmem.groups[group].devicePlugin.unpack.head[index];
 }
 
diff --git a/src/device/onerank.cu b/src/device/onerank.cu
index 5ff4a85b10..c187dcc44d 100644
--- a/src/device/onerank.cu
+++ b/src/device/onerank.cu
@@ -62,6 +62,10 @@ ncclResult_t ncclLaunchOneRank(void* dst, void const* src, size_t nElts, struct
   case ncclUint32:   kernel = (void const*)&oneRankReduce<FuncPreMulSum<uint32_t>>; break;
   case ncclInt64:    kernel = (void const*)&oneRankReduce<FuncPreMulSum<int64_t>>; break;
   case ncclUint64:   kernel = (void const*)&oneRankReduce<FuncPreMulSum<uint64_t>>; break;
+  #if defined(__CUDA_FP8_TYPES_EXIST__) && __CUDA_ARCH__ >= 900
+  case ncclFloat8e4m3: kernel = (void const*)&oneRankReduce<FuncPreMulSum<__nv_fp8_e4m3>>; break;
+  case ncclFloat8e5m2: kernel = (void const*)&oneRankReduce<FuncPreMulSum<__nv_fp8_e5m2>>; break;
+  #endif
   case ncclFloat16:  kernel = (void const*)&oneRankReduce<FuncPreMulSum<half>>; break;
   #if defined(__CUDA_BF16_TYPES_EXIST__)
   case ncclBfloat16: kernel = (void const*)&oneRankReduce<FuncPreMulSum<__nv_bfloat16>>; break;
diff --git a/src/device/primitives.h b/src/device/primitives.h
index 1913640e89..73c10c2645 100644
--- a/src/device/primitives.h
+++ b/src/device/primitives.h
@@ -103,7 +103,7 @@ struct FanSymmetric {
 };
 
 // The primitives class. Specialized per protocol in the other headers.
-template<typename T, typename RedOp, typename Fan, int Direct, typename Proto, int P2p>
+template<typename T, typename RedOp, typename Fan, int Direct, typename Proto, int P2p, bool isNetOffload = false>
 class Primitives;
 
 // Used by LL & LL128 to implement direct members in the naive way.
@@ -121,9 +121,12 @@ struct PrimitivesWithoutDirect {
   __device__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
     static_cast<RealPrimitives*>(this)->copySend(inpIx, outIx, eltN, postOp);
   }
-  __device__ void directRecvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) {
+  __device__ void directRecvCopyDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
     static_cast<RealPrimitives*>(this)->recvCopySend(outIx, eltN, /*postOp=*/false);
   }
+  __device__ void directRecvDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
+    return;
+  }
   __device__ void recvReduceCopyDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
     // Direct is only for the send part
     static_cast<RealPrimitives*>(this)->recvReduceCopySend(inpIx, outIx, eltN, postOp);
diff --git a/src/device/prims_ll.h b/src/device/prims_ll.h
index 1a1307f5c8..3e00f3b851 100644
--- a/src/device/prims_ll.h
+++ b/src/device/prims_ll.h
@@ -4,9 +4,9 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-template<typename T, typename RedOp, typename Fan, int Direct, int P2p>
-class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
-  public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>> {
+template<typename T, typename RedOp, typename Fan, int Direct, int P2p, bool isNetOffload>
+class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p, isNetOffload>:
+  public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p, isNetOffload>> {
 
   // In the case of Fan::MaxRecv == 0, we need to force MaxRecv to 1 for this to compile
   // This is because of a recv buffer which is allocated to MaxRecv length in send-only cases
diff --git a/src/device/prims_ll128.h b/src/device/prims_ll128.h
index 2cb10cc499..617b7acf34 100644
--- a/src/device/prims_ll128.h
+++ b/src/device/prims_ll128.h
@@ -8,9 +8,9 @@
 
 #define NCCL_LL128_FLAGTHREAD (NCCL_LL128_LINEELEMS-1)
 
-template<typename T, typename RedOp, typename Fan, int Direct, int P2p>
-class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
-  public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>> {
+template<typename T, typename RedOp, typename Fan, int Direct, int P2p, bool isNetOffload>
+class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p, isNetOffload>:
+  public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p, isNetOffload>> {
 
   static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
   static constexpr int Input=0, Output=1;
diff --git a/src/device/prims_simple.h b/src/device/prims_simple.h
index 945878b762..0051019400 100644
--- a/src/device/prims_simple.h
+++ b/src/device/prims_simple.h
@@ -14,9 +14,9 @@ enum primsMode {
 };
 
 template<typename T, typename RedOp, typename Fan, int Direct,
-         int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, int MultimemSrcs, int MultimemDsts>
+         int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, int MultimemSrcs, int MultimemDsts, bool isNetOffload>
 class Primitives<
-    T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll, MultimemSrcs, MultimemDsts>, P2p
+    T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll, MultimemSrcs, MultimemDsts>, P2p, isNetOffload
   > {
   static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
   static constexpr int Input=0, Output=1;
@@ -34,11 +34,7 @@ class Primitives<
                        PatMode = 0x800,
                        NvlsMinPolling = 0x1000,
                        NetDeviceUnpack = 0x2000,
-                       AnyNetDeviceUnpack = 0x4000,
-                       NvlsDirectRead = 0x8000,
-                       NvlsDirectWrite = 0x10000,
-                       IpcWrite = 0x20000,
-                       IpcRead = 0x40000;
+                       AnyNetDeviceUnpack = 0x4000;
   const int tid, tidInBlock;
   const int nthreads;
   int nworkers;
@@ -119,12 +115,9 @@ class Primitives<
   template <int DirectRecv, int DirectSend, int Recv, int Send, int Src, int Dst>
   __device__ __forceinline__ void waitPeer(intptr_t srcIx, intptr_t dstIx, int offset, int nelts) {
     const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;
-    const bool noRecvWait = DirectRecv && Src && (flags & (DirectRead | IpcRead));        // no wait when directly reading from remote input
-    const bool noSendWait = DirectSend && (flags & (DirectRead|DirectWrite)); // no wait in empty send (e.g. directScatter) or direct remote write
     // Yes, for some template arguments this code will be unreachable.  That's fine.
     // coverity[dead_error_line]
-    if (((flags & (Recv*RoleWaitRecv)) && !noRecvWait) ||
-        ((flags & (Send*RoleWaitSend)) && !noSendWait)) {
+    if ((flags & (Recv * RoleWaitRecv)) || (flags & (Send * RoleWaitSend))) {
       int spins = 0;
       while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) {
         connStepCache = loadStepValue(connStepPtr);
@@ -134,27 +127,38 @@ class Primitives<
     }
 
     if (flags & (Recv*RoleWaitRecv | Send*RoleWaitSend)) {
-      if (flags & ConnFifoEnabled)
+      if ((flags & ConnFifoEnabled) && (flags & (Send * RoleWaitSend)))
         connFifo[step%NCCL_STEPS].size = nelts*sizeof(T);
 
       void **ptrs = isSendNotRecv ? (ncclShmem.groups[group].dsts + Dst)
                                   : (ncclShmem.groups[group].srcs + Src);
       if (flags & NetRegMode) {
-         // Do nothing
+        if (P2p) {
+          ptrs[index] = NULL;
+        } else {
+          if (isSendNotRecv) {
+            if (!Recv)
+              ptrs[index] = NULL;
+            else
+              ptrs[index] = (T*)ncclShmem.groups[group].userOutput + dstIx + offset;
+          } else {
+            ptrs[index] = (T*)ncclShmem.groups[group].userOutput + srcIx + offset;
+          }
+        }
       } else if ((flags & ConnFifoEnabled) && connFifo[step%NCCL_STEPS].mode == NCCL_MODE_OFFSET) {
         ptrs[index] = connEltsFifo + loadInt(&connFifo[step%NCCL_STEPS].offset)/sizeof(T);
       } else if (isSendNotRecv && DirectSend) {
-        if (flags & (DirectWrite | NvlsDirectWrite | IpcWrite)) {
+        if (flags & DirectWrite) {
           ptrs[index] = directBuff + dstIx + offset;
-        } else if ((flags & DirectRead) || (flags & IpcRead)) {  // empty send
+        } else if (flags & DirectRead) {  // empty send
           ptrs[index] = nullptr;
         } else {
           ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
         }
       } else if (!isSendNotRecv && DirectRecv) {
-        if (flags & (DirectRead | NvlsDirectRead | IpcRead)) {
+        if (flags & DirectRead) {
           ptrs[index] = directBuff + srcIx + offset;
-        } else if ((flags & DirectWrite) || (flags & IpcWrite)) {
+        } else if (flags & DirectWrite) {
           ptrs[index] = directBuff + dstIx + offset;  // send to next from my output buffer
         } else {
           ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
@@ -198,7 +202,7 @@ class Primitives<
     int slice = 0;
     int offset = 0;
 
-    if (tid < nworkers && offset < nelem && ((flags & NetRegMode) == 0)) {
+    if (tid < nworkers && offset < nelem && !isNetOffload) {
       // Worker-only loop for non-empty slices. Non-workers and empty slices are
       // processed in the loop following this if block. The benefit of splitting
       // the loop like this is we pull two branches out of the critical path.
@@ -252,7 +256,7 @@ class Primitives<
              * so we need to check whether MultimemSrcs and MultimemDsts are 0. */
             && MultimemSrcs == 0 && MultimemDsts == 0 && !Src) {
           // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
-          if (Send) {
+          if (Send && Dst && ncclShmem.groups[group].srcs[0] != ncclShmem.groups[group].dsts[1]) {
             reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, MaxSend, /*PreOpSrcs*/0>
               (tid, nworkers, /*redArg*/0, /*preOpArgs*/nullptr, /*postOp*/false,
                1, ncclShmem.groups[group].srcs,
@@ -269,16 +273,32 @@ class Primitives<
         } else if (ncclShmem.groups[group].srcs[0] && ncclShmem.groups[group].dsts[0]) {
           constexpr int PreOpSrcs = SrcBuf != Input ? 0 :
                                     DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1;
-          reduceCopy<Unroll, RedOp, T,
-            MultimemSrcs, Recv+Src, Recv*MaxRecv+Src,
-            MultimemDsts, Send+Dst, Send*MaxSend+Dst, PreOpSrcs>
-            (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp,
-             Recv*fan.nrecv()+Src, ncclShmem.groups[group].srcs,
-             Send*fan.nsend()+Dst, ncclShmem.groups[group].dsts,
-             workSize);
+          if (Send && Dst && ncclShmem.groups[group].dsts[1] == nullptr) {
+            // this case should only be directCopySend() with registered buffers and send to net peer
+            reduceCopy<Unroll, RedOp, T,
+              0, Recv + Src, Recv * MaxRecv + Src,
+              0, 1, 1, PreOpSrcs>
+              (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp,
+                Recv * fan.nrecv() + Src, ncclShmem.groups[group].srcs,
+                1, ncclShmem.groups[group].dsts,
+                workSize);
+          } else {
+            reduceCopy<Unroll, RedOp, T,
+              MultimemSrcs, Recv + Src, Recv * MaxRecv + Src,
+              MultimemDsts, Send + Dst, Send * MaxSend + Dst, PreOpSrcs>
+              (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp,
+                Recv * fan.nrecv() + Src, ncclShmem.groups[group].srcs,
+                Send * fan.nsend() + Dst, ncclShmem.groups[group].dsts,
+                workSize);
+          }
+        } else {
+          // We get here when prims.directSend is called with a net peer.
+          // In that case ncclShmem.groups[group].dsts[0] == NULL, so we
+          // skip the data flush.
+          workSize = 0;
         }
         barrier(); // This barrier has a counterpart in following loop
-        postPeer<Recv, Send>(0 < sliceSize);
+        postPeer<Recv, Send>(0 < workSize);
         offset += sliceSize;
         slice += 1;
         // Yes, for some template arguments this code will be unreachable.  That's fine.
@@ -295,10 +315,11 @@ class Primitives<
       sliceSize = sliceSize < nelem-offset ? sliceSize : nelem-offset;
       { // Only workers could have Wait roles so we know the slice must be empty
         // since we've exited the loop above.
-        waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(0, 0, 0, 0);
+        waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(0, 0, 0, sliceSize);
       }
       barrier(); // Has couterpart in preceding worker-only loop.
-      postPeer<Recv, Send>(0 < sliceSize);
+      int workSize = ncclShmem.aborted ? 0 : sliceSize;
+      postPeer<Recv, Send>(0 < workSize);
       offset += sliceSize;
       slice += 1;
     }
@@ -347,17 +368,17 @@ public:
             ptrs[index] = connEltsFifo + offset/sizeof(T);
           } else if (Direct && fn.work->regUsed) {
             if (isSendNotRecv) {
-              if (flags & (DirectWrite | IpcWrite)) {
+              if (flags & DirectWrite) {
                 ptrs[index] = directBuff;
-              } else if (flags & (DirectRead | IpcRead)) {  // empty send
+              } else if (flags & DirectRead) {  // empty send
                 ptrs[index] = nullptr;
               } else {
                 ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
               }
             } else {
-              if (flags & (DirectRead | IpcRead)) {
+              if (flags & DirectRead) {
                 ptrs[index] = directBuff;
-              } else if (flags & (DirectWrite | IpcWrite)) {
+              } else if (flags & DirectWrite) {
                 if (Send)
                   ptrs[index] = directBuff;  // send to next from my output buffer
                 else
@@ -440,7 +461,7 @@ private:
             int i = (j+shift)%fan.nsend();
             ssize_t pOffset = i*peerOffset;
             // Skip the data I am responsible of reducing myself
-            if (skip >= 0 && i >= skip) pOffset += peerElem;
+            if (skip >= 0 && i >= skip) pOffset += peerOffset;
             void* src0 = (T*)ncclShmem.groups[group].srcs[0] + pOffset;
             ssize_t realPeerSize = min(realSize, totalElem-pOffset);
             if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) {
@@ -452,7 +473,7 @@ private:
         } else if (Recv) {
           if (tid==0) ncclShmem.groups[group].dsts[0] = (T*)ncclShmem.groups[group].userOutput + outIx + offset;
           ssize_t pOffset = index*peerOffset;
-          if (skip >= 0 && index >= skip) pOffset += peerElem;
+          if (skip >= 0 && index >= skip) pOffset += peerOffset;
           // Adjust remote index with peer offset in case we are directly pulling from peer's output buffer
           waitPeer<DirectRecv, 0, 1, 0, 0, 1>(outIx+pOffset, outIx+pOffset, offset, realSize);
           subBarrier();
@@ -460,7 +481,7 @@ private:
           for (int j=0; j<fan.nrecv(); j++) {
             int i = (j+shift)%fan.nrecv();
             pOffset = i*peerOffset;
-            if (skip >= 0 && i >= skip) pOffset += peerElem;
+            if (skip >= 0 && i >= skip) pOffset += peerOffset;
             void* dst0 = (T*)ncclShmem.groups[group].dsts[0] + pOffset;
             ssize_t realPeerSize = min(realSize, totalElem-pOffset);
             if (DirectRecv && ncclShmem.groups[group].srcs[i] == dst0) realPeerSize = 0;
@@ -474,7 +495,7 @@ private:
     }
   }
 
-  __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int regFlag) {
+  __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int ipcRegFlag, int netRegFlag) {
     conn = &peer->recv[connIndex];
     if (conn->netDeviceHandle.netDeviceType == NCCL_NET_DEVICE_UNPACK) {
       // handle must be a device ptr
@@ -499,33 +520,34 @@ private:
       if (conn->connFifo != nullptr) {
         flags |= ConnFifoEnabled;
         connFifo = conn->connFifo;
-      } else if (Direct && regFlag) {
-        // User buffers have been registered
-        if (conn->flags & (NCCL_IPC_READ | NCCL_IPC_WRITE)) {
-          if (P2p) {
-            flags |= conn->flags & NCCL_IPC_WRITE ? IpcWrite : IpcRead;
-          } else if (connIndex == 1 && direct) {
-            flags |= IpcRead;
-          } else {
-            flags |= direct & NCCL_DIRECT_READ ? IpcRead : IpcWrite;
+      }
+      if (Direct) {
+        if (ipcRegFlag) {
+          // User buffers have been registered
+          if (conn->flags & (NCCL_P2P_READ | NCCL_P2P_WRITE)) {
+            if (P2p) {
+              flags |= conn->flags & NCCL_P2P_WRITE ? DirectWrite : DirectRead;
+            } else if (connIndex == 1 && direct) {
+              flags |= DirectRead;
+            } else {
+              flags |= direct & NCCL_P2P_READ ? DirectRead : DirectWrite;
+            }
+          } else if ((conn->flags & NCCL_NVLS_MIN_POLL)) {
+            /* NVLS direct */
+            flags |= DirectRead;
           }
-        } else if (conn->flags & (NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) {
-          if (P2p) {
-            flags |= conn->flags & NCCL_DIRECT_WRITE ? DirectWrite : DirectRead;
-          } else if (connIndex == 1 && direct) {
-            flags |= DirectRead;  // scatter-reduce use direct pull
-          } else {
-            flags |= direct & NCCL_DIRECT_READ ? DirectRead : DirectWrite;
+        }
+        if (netRegFlag) {
+          if (conn->flags & NCCL_DIRECT_NIC) {
+            flags |= NetRegMode;
+            connFifo[step % NCCL_STEPS].size = 0;
           }
-        } else if ((conn->flags & NCCL_NVLS_MIN_POLL)) {
-          /* NVLS direct */
-          flags |= NvlsDirectRead;
         }
       }
     }
   }
 
-  __device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int regFlag) {
+  __device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int ipcRegFlag, int netRegFlag) {
     conn = &peer->send[connIndex];
     step = conn->step;
     step = roundUp(step, SlicePerChunk*StepPerSlice);
@@ -544,27 +566,26 @@ private:
       connStepCache = loadStepValue(connStepPtr);
       connStepSize = conn->stepSize/sizeof(T);
       connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
-      if (connFifo == nullptr && Direct && regFlag) {
-        // User buffers have been registered
-        if (conn->flags & (NCCL_IPC_READ | NCCL_IPC_WRITE)) {
-          if (P2p) {
-            flags |= conn->flags & NCCL_IPC_WRITE ? IpcWrite : IpcRead;
-          } else if (connIndex == 1 && direct) {
-            flags |= IpcRead;
-          } else {
-            flags |= direct & NCCL_DIRECT_READ ? IpcRead : IpcWrite;
+      if (Direct) {
+        if (ipcRegFlag) {
+          // User buffers have been registered
+          if (conn->flags & (NCCL_P2P_WRITE | NCCL_P2P_READ)) {
+            if (P2p) {
+              flags |= conn->flags & NCCL_P2P_WRITE ? DirectWrite : DirectRead;
+            } else if (connIndex == 1 && direct) {
+              flags |= DirectRead;  // scatter-reduce use direct pull
+            } else {
+              flags |= direct & NCCL_P2P_READ ? DirectRead : DirectWrite;
+            }
+          } else if ((conn->flags & NCCL_NVLS_MIN_POLL)) {
+            /* NVLS direct */
+            flags |= DirectWrite;
           }
-        } else if (conn->flags & (NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) {
-          if (P2p) {
-            flags |= conn->flags & NCCL_DIRECT_WRITE ? DirectWrite : DirectRead;
-          } else if (connIndex == 1 && direct) {
-            flags |= DirectRead;  // scatter-reduce use direct pull
-          } else {
-            flags |= direct & NCCL_DIRECT_READ ? DirectRead : DirectWrite;
+        }
+        if (netRegFlag) {
+          if (conn->flags & NCCL_DIRECT_NIC) {
+            flags |= NetRegMode;
           }
-        } else if ((conn->flags & NCCL_NVLS_MIN_POLL)) {
-          /* NVLS direct */
-          flags |= NvlsDirectWrite;
         }
       }
     }
@@ -574,8 +595,8 @@ private:
   __device__ Primitives(
       int tid, int nthreads, int const *recvPeers, int const *sendPeers,
       void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
-      uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* e = nullptr,
-      bool ipcReg = false, bool netReg = false, int stepSize_ = 0, int mode = primsModeDefault
+      uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* collWork = nullptr,
+      struct ncclDevWorkP2p* p2pWork = nullptr, int stepSize_ = 0, int mode = primsModeDefault
     ):
     tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group),
     stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) {
@@ -643,11 +664,23 @@ private:
 
     // Coverity thinks that index could be -1 here but that's not actually the case.
     // coverity[negative_returns:FALSE]
-    if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e ? e->direct : 0, e ? e->regUsed : ipcReg);
-    // coverity[negative_returns:FALSE]
-    if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e ? e->direct : 0, e ? e->regUsed : ipcReg);
-
-    if (netReg) flags |= NetRegMode;
+    int sendIpcReg;
+    int recvIpcReg;
+    int sendNetReg;
+    int recvNetReg;
+    if (P2p) {
+      sendIpcReg = p2pWork ? p2pWork->sendIpcReg : 0;
+      recvIpcReg = p2pWork ? p2pWork->recvIpcReg : 0;
+      sendNetReg = p2pWork ? p2pWork->sendNetReg : 0;
+      recvNetReg = p2pWork ? p2pWork->recvNetReg : 0;
+    } else {
+      recvIpcReg = sendIpcReg = collWork ? collWork->regUsed : 0;
+      recvNetReg = sendNetReg = collWork ? collWork->netRegUsed : 0;
+    }
+    // coverity[overrun-call] => Coverity thinks prims.index can be greater than 1
+    if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg);
+    // coverity[overrun-call] => Coverity thinks prims.index can be greater than 1
+    if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg);
 
     if (barrierAny(flags & NetDeviceUnpack)) {
       flags |= AnyNetDeviceUnpack;
@@ -659,8 +692,10 @@ private:
       }
     }
 
-    // coverity[negative_returns:FALSE]
-    setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)e, (uint8_t)(e ? e->regUsed : ipcReg), peer);
+    // coverity[negative_returns:FALSE] => coverity thinks that index could be -1 but that's not actually the case
+    // coverity[var_deref_model] => coverity thinks work can be dereferenced if NULL but this is not the case
+    setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)collWork, sendIpcReg || recvIpcReg, peer);
+    // coverity[uninit_member] => coverity thinks fan.n is not initialized
   }
 
   __device__ ~Primitives() {
@@ -683,6 +718,16 @@ private:
     // Make sure all threads are done writing back conn->step and done using
     // ncclShmem.groups[group]
     barrier();
+
+    if ((flags & DirectRead) && (flags & RoleWaitSend) && P2p) {
+      // For sendrecv DirectRead, sender needs to wait for receiver reading data from src.
+      // This has to be done after barrier() since post thread might have contention with
+      // this check.
+      int spins = 0;
+      volatile uint64_t* tail = conn->tail;
+      volatile uint64_t* head = conn->head;
+      while (*tail > *head) if (checkAbort(spins)) break;
+    }
   }
 
   __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclDevWorkCollReg* work, uint8_t ipcReg, int peer) {
@@ -693,10 +738,10 @@ private:
     }
 
     if (Direct && ipcReg) {
-      bool recvProvider = (flags & RoleWaitRecv) && (flags & DirectWrite || flags & IpcWrite);
-      bool sendAcceptor = (flags & RoleWaitSend) && (flags & DirectWrite || flags & IpcWrite || flags & NvlsDirectWrite);
-      bool sendProvider = (flags & RoleWaitSend) && (flags & DirectRead || flags & IpcRead); // sender provides direct buffer (to be fetched)
-      bool recvAcceptor = (flags & RoleWaitRecv) && (flags & DirectRead || flags & IpcRead || flags & NvlsDirectRead); // receiver accepts direct buffer
+      bool recvProvider = (flags & RoleWaitRecv) && (flags & DirectWrite);
+      bool sendAcceptor = (flags & RoleWaitSend) && (flags & DirectWrite);
+      bool sendProvider = (flags & RoleWaitSend) && (flags & DirectRead); // sender provides direct buffer (to be fetched)
+      bool recvAcceptor = (flags & RoleWaitRecv) && (flags & DirectRead); // receiver accepts direct buffer
       if (recvProvider) {
         int spins = 0;
         void* volatile* slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
@@ -709,6 +754,7 @@ private:
             exchgPtr = (T*)outputBuf;
           } else {
             int localPeer = ncclShmem.comm.rankToLocalRank[peer];
+            // coverity[deref_parm:FALSE] => work cannot be NULL if ipcReg != 0
             exchgPtr = (T*)(work->coll.recvbuffOffset + work->coll.recvbuffRmtAddrs[localPeer]);
           }
           *slot = reinterpret_cast<void*>(exchgPtr);
@@ -727,6 +773,7 @@ private:
           directBuff = reinterpret_cast<T*>(ptr);
           *slot = nullptr;
         } else {
+          // coverity[var_deref_op]
           directBuff = (T*)work->dnOutputs[index];
         }
       }
@@ -747,8 +794,10 @@ private:
           } else {
             int localPeer = ncclShmem.comm.rankToLocalRank[peer];
             if (MaxRecv == 0)
+              // coverity[var_deref_op]
               exchgPtr = (T*)(work->coll.sendbuffOffset + work->coll.sendbuffRmtAddrs[localPeer]);
             else
+              // coverity[var_deref_op]
               exchgPtr = (T*)(work->coll.recvbuffOffset + work->coll.recvbuffRmtAddrs[localPeer]);
           }
 
@@ -837,11 +886,11 @@ private:
   __device__ __forceinline__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) {
     genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
   }
-  __device__ __forceinline__ void directRecvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) {
-    genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
+  __device__ __forceinline__ void directRecvCopyDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
+    genericOp<1, 1, 1, 1, -1, Output>(inpIx, outIx, eltN, postOp);
   }
-  __device__ __forceinline__ void directRecvDirectSend(intptr_t inpIx, intptr_t outIx, int eltN) {
-    genericOp<1, 1, 1, 1, -1, -1>(inpIx, outIx, eltN, false);
+  __device__ __forceinline__ void directRecvDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
+    genericOp<1, 1, 1, 1, -1, -1>(inpIx, outIx, eltN, postOp);
   }
   __device__ __forceinline__ void recvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) {
     genericOp<0, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
@@ -860,6 +909,9 @@ private:
   __device__ __forceinline__ void directRecvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) {
     genericOp<1, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp);
   }
+  __device__ __forceinline__ void recvReduceDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
+    genericOp<0, 1, 1, 1, Input, -1>(inpIx, outIx, eltN, postOp);
+  }
   __device__ __forceinline__ void directRecvReduceDirectSend(intptr_t inpIx, intptr_t outIx, ssize_t eltN, bool postOp=false) {
     genericOp<1, 1, 1, 1, Input, -1>(inpIx, outIx, eltN, postOp);
   }
diff --git a/src/device/reduce_kernel.h b/src/device/reduce_kernel.h
index b069c07ec9..c2378e3dff 100644
--- a/src/device/reduce_kernel.h
+++ b/src/device/reduce_kernel.h
@@ -20,6 +20,12 @@ struct IsFloatingPoint<half>: std::true_type {};
 template<>
 struct IsFloatingPoint<__nv_bfloat16>: std::true_type {};
 #endif
+#if defined(__CUDA_FP8_TYPES_EXIST__)
+template<>
+struct IsFloatingPoint<__nv_fp8_e4m3>: std::true_type {};
+template<>
+struct IsFloatingPoint<__nv_fp8_e5m2>: std::true_type {};
+#endif
 template<>
 struct IsFloatingPoint<float>: std::true_type {};
 template<>
@@ -298,6 +304,24 @@ SPECIALIZE_REDUCE(FuncMinMax, double, 1, double, fn.isMinNotMax ? fmin(x, y) : f
 #endif
 #endif
 
+#if defined(__CUDA_FP8_TYPES_EXIST__)
+#if __CUDA_ARCH__ >= 900
+  SPECIALIZE_REDUCE(FuncSum, __nv_fp8_e4m3, 1, __nv_fp8_e4m3, __nv_fp8_e4m3(__hadd(__half(x),__half(y))))
+  SPECIALIZE_REDUCE(FuncSum, __nv_fp8_e4m3, 2, __nv_fp8x2_e4m3, __nv_fp8x2_e4m3(__hadd2(__half2(x),__half2(y))))
+  SPECIALIZE_REDUCE(FuncProd, __nv_fp8_e4m3, 1, __nv_fp8_e4m3, __nv_fp8_e4m3(__hmul(__half(x),__half(y))))
+  SPECIALIZE_REDUCE(FuncProd, __nv_fp8_e4m3, 2, __nv_fp8x2_e4m3, __nv_fp8x2_e4m3(__hmul2(__half2(x),__half2(y))))
+  SPECIALIZE_REDUCE(FuncMinMax, __nv_fp8_e4m3, 1, __nv_fp8_e4m3, __nv_fp8_e4m3(fn.isMinNotMax ? __hmin(__half(x),__half(y)) : __hmax(__half(x),__half(y))))
+  SPECIALIZE_REDUCE(FuncMinMax, __nv_fp8_e4m3, 2, __nv_fp8x2_e4m3, __nv_fp8x2_e4m3(fn.isMinNotMax ? __hmin2(__half2(x),__half2(y)) : __hmax2(__half2(x),__half2(y))))
+
+  SPECIALIZE_REDUCE(FuncSum, __nv_fp8_e5m2, 1, __nv_fp8_e5m2, __nv_fp8_e5m2(__hadd(__half(x),__half(y))))
+  SPECIALIZE_REDUCE(FuncSum, __nv_fp8_e5m2, 2, __nv_fp8x2_e5m2, __nv_fp8x2_e5m2(__hadd2(__half2(x),__half2(y))))
+  SPECIALIZE_REDUCE(FuncProd, __nv_fp8_e5m2, 1, __nv_fp8_e5m2, __nv_fp8_e5m2(__hmul(__half(x),__half(y))))
+  SPECIALIZE_REDUCE(FuncProd, __nv_fp8_e5m2, 2, __nv_fp8x2_e5m2, __nv_fp8x2_e5m2(__hmul2(__half2(x),__half2(y))))
+  SPECIALIZE_REDUCE(FuncMinMax, __nv_fp8_e5m2, 1, __nv_fp8_e5m2, __nv_fp8_e5m2(fn.isMinNotMax ? __hmin(__half(x), __half(y)) : __hmax(__half(x), __half(y))))
+  SPECIALIZE_REDUCE(FuncMinMax, __nv_fp8_e5m2, 2, __nv_fp8x2_e5m2, __nv_fp8x2_e5m2(fn.isMinNotMax ? __hmin2(__half2(x), __half2(y)) : __hmax2(__half2(x), __half2(y))))
+#endif
+#endif
+
 #undef SPECIALIZE_REDUCE
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -416,9 +440,9 @@ template<>
 struct FuncPreMulSum<half> {
   using EltType = half;
 #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
-  half2 scalar;
+  __half2 scalar;
   __device__ FuncPreMulSum(uint64_t opArg=0) {
-    union { uint64_t u64; half val; };
+    union { uint64_t u64; __half val; };
     u64 = opArg;
     scalar.x = val;
     scalar.y = val;
@@ -426,9 +450,9 @@ struct FuncPreMulSum<half> {
 #else
   float scalar;
   __device__ FuncPreMulSum(uint64_t opArg=0) {
-    union { uint64_t u64; half val; };
+    union { uint64_t u64; __half val; };
     u64 = opArg;
-    scalar = __half2float(val);
+    scalar = (float)val;
   }
 #endif
 };
@@ -459,11 +483,39 @@ struct FuncPreMulSum<half> {
   };
 #endif
 
-template<typename T>
-struct Apply_Reduce<FuncPreMulSum<T>, /*EltPerPack=*/1> {
-  __device__ static BytePack<sizeof(T)> reduce(FuncPreMulSum<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
+#if defined(__CUDA_FP8_TYPES_EXIST__)
+#if __CUDA_ARCH__ >= 900
+  template<>
+  struct FuncPreMulSum<__nv_fp8_e4m3> {
+    using EltType = __nv_fp8_e4m3;
+    __half2 scalar2;
+    __device__ FuncPreMulSum(uint64_t opArg) {
+      union { uint64_t u64; __nv_fp8_storage_t val; };
+      u64 = opArg;
+      scalar2.x = __half(__nv_cvt_fp8_to_halfraw(val, __NV_E4M3));
+      scalar2.y = scalar2.x;
+    }
+  };
+
+  template<>
+  struct FuncPreMulSum<__nv_fp8_e5m2> {
+    using EltType = __nv_fp8_e5m2;
+    __half2 scalar2;
+    __device__ FuncPreMulSum(uint64_t opArg) {
+      union { uint64_t u64; __nv_fp8_storage_t val; };
+      u64 = opArg;
+      scalar2.x = __half(__nv_cvt_fp8_to_halfraw(val, __NV_E5M2));
+      scalar2.y = scalar2.x;
+    }
+  };
+#endif
+#endif
+
+template<typename T, int EltPerPack>
+struct Apply_Reduce<FuncPreMulSum<T>, EltPerPack> {
+  __device__ static BytePack<EltPerPack*sizeof(T)> reduce(FuncPreMulSum<T> fn, BytePack<EltPerPack*sizeof(T)> a, BytePack<EltPerPack*sizeof(T)> b) {
     // FuncPreMulSum reduce dispatches to FuncSum.
-    return Apply_Reduce<FuncSum<T>, 1>::reduce(FuncSum<T>(), a, b);
+    return Apply_Reduce<FuncSum<T>, EltPerPack>::reduce(FuncSum<T>(), a, b);
   }
 };
 
@@ -530,6 +582,51 @@ struct Apply_PreOp<FuncPreMulSum<half>, /*EltPerPack=*/1> {
   #endif
 #endif
 
+////////////////////////////////////////////////////////////////////////////////
+// Apply_PreOp of FuncPreMulSum for fp8.
+
+#if defined(__CUDA_FP8_TYPES_EXIST__)
+#if __CUDA_ARCH__ >= 900
+  template<>
+  struct Apply_PreOp<FuncPreMulSum<__nv_fp8_e4m3>, /*EltPerPack=*/1> {
+    static constexpr bool IsIdentity = false;
+    __device__ static BytePack<sizeof(__nv_fp8_e4m3)> preOp(
+        FuncPreMulSum<__nv_fp8_e4m3> fn, BytePack<sizeof(__nv_fp8_e4m3)> a
+      ) {
+      return toPack<__nv_fp8_e4m3>(__nv_fp8_e4m3(__hmul(__half(fromPack<__nv_fp8_e4m3>(a)), fn.scalar2.x)));
+    }
+  };
+  template<>
+  struct Apply_PreOp<FuncPreMulSum<__nv_fp8_e4m3>, /*EltPerPack=*/2> {
+    static constexpr bool IsIdentity = false;
+    __device__ static BytePack<sizeof(__nv_fp8x2_e4m3)> preOp(
+        FuncPreMulSum<__nv_fp8_e4m3> fn, BytePack<sizeof(__nv_fp8x2_e4m3)> a
+      ) {
+      return toPack<__nv_fp8x2_e4m3>(__nv_fp8x2_e4m3(__hmul2(__half2(fromPack<__nv_fp8x2_e4m3>(a)), fn.scalar2)));
+    }
+  };
+
+  template<>
+  struct Apply_PreOp<FuncPreMulSum<__nv_fp8_e5m2>, /*EltPerPack=*/1> {
+    static constexpr bool IsIdentity = false;
+    __device__ static BytePack<sizeof(__nv_fp8_e5m2)> preOp(
+        FuncPreMulSum<__nv_fp8_e5m2> fn, BytePack<sizeof(__nv_fp8_e5m2)> a
+      ) {
+      return toPack<__nv_fp8_e5m2>(__nv_fp8_e5m2(__hmul(__half(fromPack<__nv_fp8_e5m2>(a)), fn.scalar2.x)));
+    }
+  };
+  template<>
+  struct Apply_PreOp<FuncPreMulSum<__nv_fp8_e5m2>, /*EltPerPack=*/2> {
+    static constexpr bool IsIdentity = false;
+    __device__ static BytePack<sizeof(__nv_fp8x2_e5m2)> preOp(
+        FuncPreMulSum<__nv_fp8_e5m2> fn, BytePack<sizeof(__nv_fp8x2_e5m2)> a
+      ) {
+      return toPack<__nv_fp8x2_e5m2>(__nv_fp8x2_e5m2(__hmul2(__half2(fromPack<__nv_fp8x2_e5m2>(a)), fn.scalar2)));
+    }
+  };
+#endif
+#endif
+
 ////////////////////////////////////////////////////////////////////////////////
 // FuncSumPostDiv
 
@@ -541,34 +638,44 @@ struct RedOpArg<FuncSumPostDiv<T>> {
   }
 };
 
-template<typename T, bool IsFloating=IsFloatingPoint<T>::value>
-struct FuncSumPostDiv_IntOnly;
-
 template<typename T>
-struct FuncSumPostDiv: FuncSumPostDiv_IntOnly<T> {
-  __device__ FuncSumPostDiv(uint64_t opArg=0):
-    FuncSumPostDiv_IntOnly<T>(opArg) {
+struct FuncSumPostDiv {
+  static_assert(T(0) < T(-1), "FuncSumPostDiv is only for implementing ncclAvg on uint types.");
+  using EltType = T;
+  using UintType = typename std::conditional<sizeof(T)==8, uint64_t, uint32_t>::type;
+  uint32_t divisor:31, isSigned:1;
+  UintType recip;
+  
+  __device__ FuncSumPostDiv(uint64_t opArg=0) {
+    isSigned = opArg & 1;
+    divisor = opArg >> 1;
+    recip =  UintType(-1)/divisor;
+  }
+  __device__ T divide(T x) {
+    // x is negative iff we are in signed mode and the top bit is set
+    bool xneg = isSigned && (x & ~(T(-1)>>1));
+    // Compute abs(x):
+    // T(-x) vs -T(x) is critical. We have to negate then truncate the bits. Consider
+    // if we are doing signed 8-bit types, thus T=uint8_t. The value -1 is encoded
+    // as 0xff. -T(0xff) when promoted to 32-bit (which is implicit by compiler)
+    // gives 0xffffff01, but T(-0xff) is 0x1, and that is the abs value we want.
+    UintType xabs = xneg ? T(-x) : x;
+    // Compute quotient by multiplying by reciprocal.
+    UintType q = sizeof(T)==8 ? __umul64hi(xabs, recip) : __umulhi(xabs, recip);
+    // Quotient may be off by one so do a fixup.
+    if (xabs - q*divisor >= divisor) q += 1;
+    // If original x was negative then we have to negate it back since we were
+    // working with its abs val.
+    return xneg ? -T(q) : T(q);
   }
 };
 
-template<typename T>
-struct FuncSumPostDiv_IntOnly<T, /*IsFloating=*/false>: FuncSum<T> {
-  using EltType = T;
-  int divisor;
-  __device__ FuncSumPostDiv_IntOnly(uint64_t opArg=0): divisor(opArg) {}
-};
-
-template<typename T>
-struct FuncSumPostDiv_IntOnly<T, /*IsFloating=*/true> {
-  static_assert(sizeof(T)!=sizeof(T), "FuncSumPostDiv is only for implementing ncclAvg on integral types.");
-};
-
-template<typename T>
-struct Apply_Reduce<FuncSumPostDiv<T>, /*EltPerPack=*/1>:
-    Apply_Reduce<FuncSum<T>, 1> {
-  __device__ static BytePack<sizeof(T)> reduce(FuncSumPostDiv<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
+template<typename T, int EltPerPack>
+struct Apply_Reduce<FuncSumPostDiv<T>, EltPerPack>:
+    Apply_Reduce<FuncSum<T>, EltPerPack> {
+  __device__ static BytePack<EltPerPack*sizeof(T)> reduce(FuncSumPostDiv<T> fn, BytePack<EltPerPack*sizeof(T)> a, BytePack<EltPerPack*sizeof(T)> b) {
     // FuncSumPostDiv reduce dispatches to FuncSum.
-    return Apply_Reduce<FuncSum<T>, 1>::reduce(FuncSum<T>(), a, b);
+    return Apply_Reduce<FuncSum<T>, EltPerPack>::reduce(FuncSum<T>(), a, b);
   }
 };
 
@@ -576,7 +683,7 @@ template<typename T>
 struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
   static constexpr bool IsIdentity = false;
   __device__ static BytePack<sizeof(T)> postOp(FuncSumPostDiv<T> fn, BytePack<sizeof(T)> a) {
-    return toPack<T>(fromPack<T>(a) / fn.divisor);
+    return toPack<T>(fn.divide(fromPack<T>(a)));
   }
 };
 
diff --git a/src/device/reduce_scatter.h b/src/device/reduce_scatter.h
index f7b3c25e58..70538b117f 100644
--- a/src/device/reduce_scatter.h
+++ b/src/device/reduce_scatter.h
@@ -89,7 +89,7 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SI
     T *inputBuf = (T*)work->sendbuff;
     T *outputBuf = (T*)work->recvbuff;
     Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
-      (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, false, false, 0, primsModePatRs);
+      (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, nullptr, 0, primsModePatRs);
 
     PatRSAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
     int last = 0;
@@ -137,6 +137,7 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_S
           nelem = min(chunkCount, channelCount - elemOffset);
           prims.scatter(offset, nvls->nHeads * count, nelem, count, -1, 0);
         }
+        // coverity[overrun-call] => Coverity thinks prims.index can be greater than 1
       } else if (tid < tidEndReduce) {
         // Reduce through NVLS
         using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
@@ -206,10 +207,10 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
       int nRails = direct->nHeads;
       int part = ncclShmem.channelId - work->channelLo;
       void* inbuf = (void*)work->sendbuff;
-      ssize_t sizePerRank = work->collnet.count;
+      ssize_t countPerRank = work->collnet.count;
 
-      ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*sizePerRank);
-      ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank);
+      ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*countPerRank);
+      ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*countPerRank);
       int railAllSize = railAllEnd - railAllBeg;
       if (tid < nDsts) dstSizes[tid] = railAllSize;
 
@@ -222,15 +223,15 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
         if (rail == nRails) rail = 0;
       }
       do {
-        int node = railAllBeg/sizePerRank;
+        int node = railAllBeg/countPerRank;
         int railAllOffset = 0;
         while (railAllOffset < railAllSize) {
-          ssize_t railOneBeg = node*sizePerRank;
-          ssize_t railOneEnd = railOneBeg + sizePerRank;
+          ssize_t railOneBeg = node*countPerRank;
+          ssize_t railOneEnd = railOneBeg + countPerRank;
           ssize_t railOneOffset = (railAllBeg+railAllOffset) - railOneBeg;
           int delta = min(railAllEnd, railOneEnd) - (railAllBeg+railAllOffset);
           int rank = ncclShmem.comm.collNetDenseToUserRank[node*nRails + rail];
-          ssize_t userOneBeg = rank*sizePerRank + railOneOffset;
+          ssize_t userOneBeg = rank*countPerRank + railOneOffset;
           if (nDsts != 0) {
             reduceCopy<ncclCollUnroll(), RedOp, T,
                      /*MultimemSrcs=*/0, 1+MinSrcs, 1+MaxSrcs,
@@ -239,7 +240,7 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
             (tid, tn, work->redOpArg, &work->redOpArg, false,
              /*nSrcs=*/1+nSrcs, [=]__device__(int s) {
                return s==0 ? (T*)inbuf + userOneBeg
-                           : work->regUsed && (recvDirectFlag & NCCL_DIRECT_READ)
+                           : work->regUsed && (recvDirectFlag & NCCL_P2P_READ)
                            ? (T*)srcPtrs[s-1] + userOneBeg
                            : (T*)srcPtrs[s-1] + railAllOffset;
              },
@@ -264,7 +265,8 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
     struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
     int const &nNodes = ncclShmem.comm.nNodes;
     ssize_t chunkSize = int(work->collnet.chunkCount);
-    ssize_t sizePerRank = work->collnet.count;
+    ssize_t countPerRank = work->collnet.count;
+    const int hasDn = (direct->down[0] >= 0) ? 1 : 0;
 
     if (direct->out == -1) __trap();
     bool isMultiRail = (direct->nHeads > 1);
@@ -281,15 +283,15 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
     int tn = nWarps1*WARP_SIZE;
     if (tid < tn) {
       // Phase 1: Scatter inputs to peers
-      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
+      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
         prims(tid, tn, nullptr, direct->heads+1, work->sendbuff, nullptr,
-              work->redOpArg, 0*Proto::MaxGroupWidth, 1, 1, work);
-      for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
+              work->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
+      for (ssize_t railGridOffset=0; railGridOffset < nNodes*countPerRank; railGridOffset += nChannels*chunkSize) {
         Scatterer</*ReduceSendNotRecv=*/true> scat;
         scat.work = work;
         scat.chunkSize = chunkSize;
         scat.railGridOffset = railGridOffset;
-        prims.template process</*Recv=*/0, /*Send=*/1>(scat, NCCL_DIRECT_READ, 0);
+        prims.template process</*Recv=*/0, /*Send=*/1>(scat, 0, 0);
       }
       return;
     }
@@ -297,23 +299,22 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
 
     tn = nWarps2*WARP_SIZE;
     if (tid < tn) {
-      if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
+      if (work->netRegUsed && !hasDn) {
         if (tid == 0) {
-          int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
-          Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps);
+          Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, 1);
         }
         __syncwarp();
       } else {
         // Phase 2: Reduce from peers + local input -> send to network
-        Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/1, Proto, 0>
+        Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
           prims(tid, tn, direct->heads + 1, &direct->out, nullptr, nullptr,
-            work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1, work);
-        for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
+            work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
+        for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkSize) {
           Scatterer</*ReduceSendNotRecv=*/false> scat;
           scat.work = work;
           scat.chunkSize = chunkSize;
           scat.railGridOffset = railGridOffset;
-          prims.template process</*Recv=*/1, /*Send=*/1>(scat, 0, NCCL_DIRECT_READ);
+          prims.template process</*Recv=*/1, /*Send=*/1>(scat, 0, 0);
         }
       }
       return;
@@ -322,9 +323,9 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
 
     tn = nWarps3*WARP_SIZE;
     if (tid < tn) {
-      if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
+      if (work->netRegUsed) {
         if (tid == 0) {
-          int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
+          int steps = hasDn ? (int)divUp(nNodes * countPerRank, nChannels * chunkSize) : 1;
           Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps);
         }
         __syncwarp();
@@ -333,11 +334,11 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
         Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
           prims(tid, tn, &direct->out, nullptr, nullptr, work->recvbuff,
             work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0);
-        for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
+        for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkSize) {
           ssize_t railAllBeg = railGridOffset + part * chunkSize;
-          ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * sizePerRank);
-          ssize_t railOneBeg = ncclShmem.comm.node * sizePerRank;
-          ssize_t railOneEnd = railOneBeg + sizePerRank;
+          ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * countPerRank);
+          ssize_t railOneBeg = ncclShmem.comm.node * countPerRank;
+          ssize_t railOneEnd = railOneBeg + countPerRank;
           ssize_t beg = max(railAllBeg, railOneBeg);
           ssize_t end = min(railAllEnd, railOneEnd);
           prims.recv(beg - railOneBeg, max(ssize_t(0), end - beg), /*postOp=*/true);
diff --git a/src/device/sendrecv.h b/src/device/sendrecv.h
index 9b039a41a0..fe3b9ca77a 100644
--- a/src/device/sendrecv.h
+++ b/src/device/sendrecv.h
@@ -15,33 +15,35 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
   template<typename Proto>
   __device__ void runSend(int tid, int tn, int group, struct ncclDevWorkP2p* work) {
     size_t bytes = work->sendBytes;
-    int chunkSize = work->sendIpcReg && ncclShmem.comm.isNvlink ? (1 << 30) : u32fp8Decode(work->sendChunkSize_u32fp8);
+    bool useLargeChunk = (work->sendIpcReg && ncclShmem.comm.isAllNvlink) || work->sendNetReg;
+    int chunkSize = useLargeChunk ? NCCL_MAX_NET_SIZE : u32fp8Decode(work->sendChunkSize_u32fp8);
+    int stepSize = useLargeChunk ? NCCL_MAX_NET_SIZE : ncclShmem.comm.p2pChunkSize;
     Primitives<T, RedOp, FanAsymmetric<0, 1>, 1, Proto, 1>
       prims(tid, tn, nullptr, &work->sendRank, work->sendAddr, nullptr,
-            /*redOpArg(ignored)=*/0, group, 1, 1, nullptr,
-            /*ipcReg=*/work->sendIpcReg, /*netReg=*/work->sendRegistered, ncclShmem.comm.p2pChunkSize);
+            /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, work, stepSize);
     size_t cursor = 0;
     do {
       int n = min(size_t(chunkSize), bytes-cursor);
       prims.directSend(cursor, cursor, n);
       cursor += n;
-    } while (cursor < bytes && work->sendRegistered == 0);
+    } while (cursor < bytes);
   }
 
   template<typename Proto>
   __device__ void runRecv(int tid, int tn, int group, struct ncclDevWorkP2p* work) {
     size_t bytes = work->recvBytes;
-    int chunkSize = work->recvIpcReg && ncclShmem.comm.isNvlink ? (1 << 30) : u32fp8Decode(work->recvChunkSize_u32fp8);
+    bool useLargeChunk = (work->recvIpcReg && ncclShmem.comm.isAllNvlink) || work->recvNetReg;
+    int chunkSize = useLargeChunk ? NCCL_MAX_NET_SIZE : u32fp8Decode(work->recvChunkSize_u32fp8);
+    int stepSize = useLargeChunk ? NCCL_MAX_NET_SIZE : ncclShmem.comm.p2pChunkSize;
     Primitives<T, RedOp, FanAsymmetric<1, 0>, 1, Proto, 1>
       prims(tid, tn, &work->recvRank, nullptr, nullptr, work->recvAddr,
-            /*redOpArg(ignored)=*/0, group, 1, 1, nullptr,
-            /*ipcReg=*/work->recvIpcReg, /*netReg=*/work->recvRegistered, ncclShmem.comm.p2pChunkSize);
+            /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, work, stepSize);
     size_t cursor = 0;
     do {
       int n = min(size_t(chunkSize), bytes-cursor);
       prims.directRecv(cursor, cursor, n);
       cursor += n;
-    } while (cursor < bytes && work->recvRegistered == 0);
+    } while (cursor < bytes);
   }
 
   __device__ __forceinline__ void run() {
diff --git a/src/enqueue.cc b/src/enqueue.cc
index 4edb42decc..285e17f69c 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -16,6 +16,7 @@
 
 #include <cstring> // std::memcpy
 #include <cinttypes> // PRIx64
+#include <cassert>
 
 NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0);
 
@@ -63,15 +64,6 @@ static inline int ncclFuncTrafficPerByte(ncclFunc_t func, int nRanks) {
   default: return 1;
   }
 }
-static inline size_t ncclFuncSendCount(ncclFunc_t func, int nRanks, size_t count) {
-  return func == ncclFuncReduceScatter ? nRanks*count : count;
-}
-static inline size_t ncclFuncRecvCount(ncclFunc_t func, int nRanks, size_t count) {
-  return func == ncclFuncAllGather ? nRanks*count : count;
-}
-static inline size_t ncclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count) {
-  return func == ncclFuncAllGather || func == ncclFuncReduceScatter ? nRanks*count : count;
-}
 
 /*****************************************************************************/
 /*       Launch system : synchronization and CUDA kernel launch              */
@@ -230,301 +222,8 @@ static void finishPlan(struct ncclComm* comm, struct ncclKernelPlan* plan) {
   }
 }
 
-int64_t ncclParamLocalRegister();
 NCCL_PARAM(GraphRegister, "GRAPH_REGISTER", 1);
 
-struct ncclIpcCleanupCallback {
-  struct ncclCommCallback base;
-  void* ptr;
-};
-static ncclResult_t cleanupIpc(struct ncclComm* comm, struct ncclCommCallback* cb) {
-  struct ncclIpcCleanupCallback* me = (struct ncclIpcCleanupCallback*)cb;
-  CUDACHECKIGNORE(cudaIpcCloseMemHandle(me->ptr));
-  free(me);
-  return ncclSuccess;
-}
-
-static ncclResult_t registerCheckP2PConnection(struct ncclComm* comm, struct ncclConnector* conn, struct ncclTopoGraph* graph, int peer, bool* needReg) {
-  if (conn->connected) {
-    if (conn->conn.flags & (NCCL_IPC_READ | NCCL_IPC_WRITE | NCCL_DIRECT_READ | NCCL_DIRECT_WRITE)) {
-      *needReg = true;
-    } else {
-      // network connection
-      *needReg = false;
-    }
-  } else {
-    struct ncclPeerInfo* peerInfo = &comm->peerInfo[peer];
-    struct ncclPeerInfo* myInfo = &comm->peerInfo[comm->rank];
-    int canConnect = 0;
-    NCCLCHECK(ncclTransports[0]->canConnect(&canConnect, comm, graph, myInfo, peerInfo));
-    if (canConnect) {
-      *needReg = true;
-    } else {
-      *needReg = false;
-    }
-  }
-  return ncclSuccess;
-}
-
-static ncclResult_t registerCollBuffers(
-    struct ncclComm* comm, struct ncclTaskColl* info,
-    void* outRegBufSend[NCCL_MAX_LOCAL_RANKS],
-    void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS],
-    struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue,
-    bool* regNeedConnect
-  ) {
-  ncclResult_t result = ncclSuccess;
-
-  info->regBufType = NCCL_REGULAR_BUFFER;
-  *regNeedConnect = true;
-  if (!(ncclParamLocalRegister() || (comm->planner.persistent && ncclParamGraphRegister()))) goto exit;
-#if CUDART_VERSION >= 11030
-  if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) {
-    if (!comm->nvlsRegSupport || info->opDev.op == ncclDevPreMulSum) goto exit;
-    bool regBufUsed = false;
-    const void *sendbuff = info->sendbuff;
-    void *recvbuff = info->recvbuff;
-    if (info->func == ncclFuncAllGather) sendbuff = NULL;
-    if (info->func == ncclFuncReduceScatter) recvbuff = NULL;
-    size_t elementSize = ncclTypeSize(info->datatype);
-    size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count);
-    size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count);
-
-    /* first try local registration. */
-    if (ncclParamLocalRegister()) {
-      ncclNvlsLocalRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, &regBufUsed, outRegBufSend, outRegBufRecv);
-    }
-
-    if (regBufUsed == false && comm->planner.persistent && ncclParamGraphRegister()) {
-      ncclNvlsGraphRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, &regBufUsed, outRegBufSend, outRegBufRecv, cleanupQueue, &info->nCleanupQueueElts);
-    }
-
-    if (regBufUsed) {
-      *regNeedConnect = false;
-      /* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to
-       * saturate bandwidth. */
-      if (comm->nNodes == 1) {
-        if (info->func == ncclFuncReduceScatter)
-          info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5));
-        else
-          info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4));
-      } else {
-        info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 6));
-      }
-      info->regBufType = NCCL_NVLS_REG_BUFFER;
-    }
-  } else if ((info->algorithm == NCCL_ALGO_COLLNET_DIRECT || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && comm->collNetRegSupport && info->opDev.op != ncclDevPreMulSum && info->opDev.op != ncclDevSumPostDiv) {
-    size_t elementSize = ncclTypeSize(info->datatype);
-    size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count);
-    size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count);
-    int sendRegBufFlag = 0;
-    int recvRegBufFlag = 0;
-    void *sendHandle, *recvHandle;
-
-    if (ncclParamLocalRegister()) {
-      ncclCollnetLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle);
-      info->sendMhandle = sendHandle;
-      if (sendRegBufFlag) {
-        ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &recvRegBufFlag, &recvHandle);
-        info->recvMhandle = recvHandle;
-      }
-    }
-
-    if ((sendRegBufFlag == 0 || recvRegBufFlag == 0) && comm->planner.persistent && ncclParamGraphRegister()) {
-      if (!sendRegBufFlag) {
-        ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle, cleanupQueue, &info->nCleanupQueueElts);
-        info->sendMhandle = sendHandle;
-      }
-      if (sendRegBufFlag && !recvRegBufFlag) {
-        ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &recvRegBufFlag, &recvHandle, cleanupQueue, &info->nCleanupQueueElts);
-        info->recvMhandle = recvHandle;
-      }
-    }
-
-    if (sendRegBufFlag && recvRegBufFlag) {
-      info->nMaxChannels = 1;
-      info->regBufType = NCCL_COLLNET_REG_BUFFER;
-      if (sendRegBufFlag == 1 && recvRegBufFlag == 1) {
-        INFO(NCCL_REG, "rank %d successfully registered collNet sendbuff %p (handle %p), sendbuff size %ld, recvbuff %p (handle %p), recvbuff size %ld", comm->rank, info->sendbuff, sendHandle, sendbuffSize, info->recvbuff, recvHandle, recvbuffSize);
-      }
-    }
-  } else if (comm->intraNodeP2pSupport && info->protocol == NCCL_PROTO_SIMPLE) {
-    // IPC buffer registration
-    if (info->func == ncclFuncReduceScatter) goto exit;
-    if (info->algorithm == NCCL_ALGO_RING && ((info->func == ncclFuncAllReduce && info->sendbuff == info->recvbuff) || info->func == ncclFuncReduce)) goto exit;
-    if ((info->algorithm == NCCL_ALGO_TREE || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && info->sendbuff == info->recvbuff) goto exit;
-    if (info->func == ncclFuncAllGather && info->algorithm == NCCL_ALGO_PAT) goto exit;
-
-    int peerRanks[NCCL_MAX_LOCAL_RANKS];
-    int nPeers = 0;
-    size_t elementSize = ncclTypeSize(info->datatype);
-    size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count);
-    size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count);
-    int regBufFlag = 0;
-    memset(peerRanks, 0xff, sizeof(int) * NCCL_MAX_LOCAL_RANKS);
-
-    if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) {
-      struct ncclChannel* channel = comm->channels;
-      for (int r = 0; r < NCCL_MAX_DIRECT_ARITY; ++r) {
-        for (int updown = 0; updown < 2; ++updown) {
-          int peer;
-          if (updown == 0)
-            peer = channel->collnetDirect.up[r];
-          else
-            peer = channel->collnetDirect.down[r];
-          if (peer != -1) {
-            struct ncclConnector* peerConn = &channel->peers[peer]->recv[0];
-            bool needReg = false;
-
-            NCCLCHECK(registerCheckP2PConnection(comm, peerConn, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], peer, &needReg));
-            if (needReg) {
-              bool found = false;
-              for (int p = 0; p < nPeers; ++p) {
-                if (peerRanks[p] == peer) {
-                  found = true;
-                  break;
-                }
-              }
-              if (!found) peerRanks[nPeers++] = peer;
-            }
-          }
-        }
-      }
-
-      if (nPeers > 0) {
-        if (ncclParamLocalRegister())
-          ncclIpcLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->sendbuffOffset, &info->sendbuffRmtAddrs);
-        if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) {
-          ncclIpcGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->sendbuffOffset, &info->sendbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
-        }
-        if (regBufFlag) {
-          if (ncclParamLocalRegister())
-            ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs);
-          if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) {
-            ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
-          }
-        }
-      }
-      if (regBufFlag) {
-        info->regBufType = NCCL_IPC_REG_BUFFER;
-      }
-    } else if (info->algorithm == NCCL_ALGO_RING) {
-      struct ncclReg* recvRegRecord;
-      NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord));
-      if (recvRegRecord == NULL) goto exit;
-      for (int c = 0; c < comm->nChannels; ++c) {
-        struct ncclChannel* channel = comm->channels + c;
-        for (int r = 0; r < 2; ++r) {
-          bool needReg = false;
-          int peer;
-          struct ncclConnector* peerConn;
-          // P2P transport
-          if (r == 0)
-            peer = channel->ring.prev;
-          else
-            peer = channel->ring.next;
-          peerConn = &channel->peers[peer]->recv[0];
-          NCCLCHECK(registerCheckP2PConnection(comm, peerConn, &comm->graphs[NCCL_ALGO_RING], peer, &needReg));
-
-          if (needReg) {
-            bool found = false;
-            for (int p = 0; p < nPeers; ++p) {
-              if (peerRanks[p] == peer) {
-                found = true;
-                break;
-              }
-            }
-            if (!found) peerRanks[nPeers++] = peer;
-          }
-        }
-      }
-      if (nPeers > 0) {
-        if (ncclParamLocalRegister()) {
-          ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs);
-        }
-        if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) {
-          ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
-        }
-      }
-      if (regBufFlag) {
-        info->regBufType = NCCL_IPC_REG_BUFFER;
-      }
-    } else if (info->algorithm == NCCL_ALGO_TREE || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) {
-      struct ncclReg* recvRegRecord;
-      NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord));
-      if (recvRegRecord == NULL) goto exit;
-      for (int c = 0; c < comm->nChannels; ++c) {
-        struct ncclChannel* channel = comm->channels + c;
-        struct ncclTree* tree = NULL;
-        int peers[NCCL_MAX_TREE_ARITY + 1];
-
-        if (info->algorithm == NCCL_ALGO_TREE)
-          tree = &channel->tree;
-        else
-          tree = &channel->collnetChain;
-        for (int p = 0; p < NCCL_MAX_TREE_ARITY; ++p) peers[p] = tree->down[p];
-        peers[NCCL_MAX_TREE_ARITY] = tree->up;
-        for (int p = 0; p < NCCL_MAX_TREE_ARITY + 1; ++p) {
-          int peer = peers[p];
-          bool peerNeedReg = false;
-          struct ncclConnector* recvConn = NULL;
-          // P2P transport
-          if (peer == -1 || peer == comm->nRanks) continue;
-          recvConn = &channel->peers[peer]->recv[0];
-          NCCLCHECK(registerCheckP2PConnection(comm, recvConn, &comm->graphs[info->algorithm], peer, &peerNeedReg));
-
-          if (peerNeedReg) {
-            bool found = false;
-            for (int pindex = 0; pindex < nPeers; ++pindex) {
-              if (peerRanks[pindex] == peer) {
-                found = true;
-                break;
-              }
-            }
-            if (!found) peerRanks[nPeers++] = peer;
-          }
-        }
-      }
-      if (nPeers > 0) {
-        if (ncclParamLocalRegister()) {
-          ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs);
-        }
-        if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) {
-          ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
-        }
-      }
-      if (regBufFlag) {
-        info->regBufType = NCCL_IPC_REG_BUFFER;
-      }
-    }
-
-    if (info->regBufType == NCCL_IPC_REG_BUFFER && comm->nNodes == 1 && 16 < info->nMaxChannels && info->nMaxChannels <= 24) {
-      info->nMaxChannels = 16;
-    }
-  }
-exit:
-#endif
-  return result;
-}
-
-static ncclResult_t registerP2pBuffer(struct ncclComm* comm, void* userbuff, int peerRank, size_t size, int* regFlag, void** regAddr, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue) {
-  ncclResult_t ret = ncclSuccess;
-  uintptr_t offset = 0;
-  uintptr_t* peerRmtAddrs = NULL;
-
-  *regFlag = 0;
-  if (ncclParamLocalRegister()) {
-    ncclIpcLocalRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &offset, &peerRmtAddrs);
-  }
-  if (*regFlag == 0 && comm->planner.persistent && ncclParamGraphRegister()) {
-    ncclIpcGraphRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &offset, &peerRmtAddrs, reinterpret_cast<void*>(cleanupQueue), NULL);
-  }
-
-  if (*regFlag)
-    *regAddr = (void*)((uintptr_t)peerRmtAddrs + offset);
-  return ret;
-}
-
 static ncclResult_t getCollNetSupport(struct ncclComm* comm, struct ncclTaskColl* task, int* collNetSupport);
 static ncclResult_t getAlgoInfo(
   struct ncclComm* comm, struct ncclTaskColl* task,
@@ -550,10 +249,72 @@ static bool testBudget(
   return ok;
 }
 
+ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) { // Appends one work node per queued collective task to planner->collWorkQueue.
+  struct ncclKernelPlanner* planner = &comm->planner;
+  struct ncclTaskColl *task;
+  // Tasks are visited in collTaskQueue order; the only return statement below yields ncclSuccess.
+  task = ncclIntruQueueHead(&planner->collTaskQueue);
+  while (task != nullptr) {
+    // Build a ncclDevWorkColl[Reg?] struct for each task (NVLS tasks reuse a node staged earlier instead).
+    void* regBufSend[NCCL_MAX_LOCAL_RANKS]; // not initialized here; only handed to ncclRegisterCollBuffers below
+    void* regBufRecv[NCCL_MAX_LOCAL_RANKS]; // same as regBufSend
+    bool regNeedConnect = true; // passed by address to the registration call; not read again in this function
+    struct ncclWorkList* workNode = NULL;
+    struct ncclDevWorkColl devWork = {};
+    // NVLS / NVLS_TREE: take the node from tmpCollWorkQueue (ncclPrepareTasks enqueues one there for these algorithms).
+    if (task->algorithm == NCCL_ALGO_NVLS_TREE || task->algorithm == NCCL_ALGO_NVLS) {
+      workNode = ncclIntruQueueDequeue(&planner->tmpCollWorkQueue);
+      goto next; // skip registration and devWork construction for this task
+    }
+    ncclRegisterCollBuffers(comm, task, regBufSend, regBufRecv, &planner->collCleanupQueue, &regNeedConnect); // NOTE(review): result is not checked -- presumably best-effort registration; confirm
+    // Copy the task's buffer and reduction parameters into the device work descriptor.
+    devWork.sendbuff = (void*)task->sendbuff;
+    devWork.recvbuff = (void*)task->recvbuff;
+    devWork.sendbuffOffset = task->sendbuffOffset;
+    devWork.recvbuffOffset = task->recvbuffOffset;
+    devWork.sendbuffRmtAddrs = task->sendbuffRmtAddrs;
+    devWork.recvbuffRmtAddrs = task->recvbuffRmtAddrs;
+    devWork.root = task->root;
+    devWork.nWarps = task->nWarps;
+    devWork.redOpArg = task->opDev.scalarArg;
+    devWork.redOpArgIsPtr = task->opDev.scalarArgIsPtr;
+    devWork.oneNode = (comm->nNodes == 1);
+    devWork.isOneRPN = comm->isOneRPN;
+    devWork.netRegUsed = devWork.regUsed = 0; // both flags start cleared and are set from regBufType bits below
+    if (task->regBufType & NCCL_NET_REG_BUFFER)
+      devWork.netRegUsed = 1;
+    if (task->regBufType & (NCCL_IPC_REG_BUFFER | NCCL_NVLS_REG_BUFFER))
+      devWork.regUsed = 1;
+    // NOTE(review): NVLS algorithms left via `goto next` above, so the NVLS-registered branch looks unreachable from here -- confirm.
+    if (task->regBufType & NCCL_NVLS_REG_BUFFER) {
+      struct ncclDevWorkCollReg workReg = {};
+      workReg.coll = devWork; // C++ struct assignment
+      /* NVLS only has one send and recv buffer registered */
+      workReg.dnInputs[0] = regBufSend[0]; // assumes the registration call filled regBufSend[0] -- TODO confirm
+      workReg.dnOutputs[0] = regBufRecv[0]; // assumes the registration call filled regBufRecv[0] -- TODO confirm
+      workNode = ncclMemoryStackAllocInlineArray<ncclWorkList, ncclDevWorkCollReg>(&comm->memScoped, 1);
+      workNode->workType = ncclDevWorkTypeCollReg;
+      workNode->size = sizeof(struct ncclDevWorkCollReg);
+      memcpy((void*)(workNode+1), (void*)&workReg, workNode->size); // payload is stored immediately after the node header
+    } else {
+      workNode = ncclMemoryStackAllocInlineArray<ncclWorkList, ncclDevWorkColl>(&comm->memScoped, 1);
+      workNode->workType = ncclDevWorkTypeColl;
+      workNode->size = sizeof(struct ncclDevWorkColl);
+      memcpy((void*)(workNode+1), (void*)&devWork, workNode->size); // payload is stored immediately after the node header
+    }
+next: // reached by both paths; workNode is either the dequeued NVLS node or the one just allocated
+    ncclIntruQueueEnqueue(&planner->collWorkQueue, workNode);
+    task = task->next;
+  }
+  assert(ncclIntruQueueEmpty(&planner->tmpCollWorkQueue)); // every staged node should have been consumed; compiled out when NDEBUG is defined
+  return ncclSuccess;
+}
+
 // Called once per ncclGroup to organize the user submitted tasks in
 // comm->planner so that they can be peeled off into plans.
 ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo) {
   struct ncclKernelPlanner* planner = &comm->planner;
+  planner->persistent = ncclCudaGraphValid(planner->capturingGraph);
   // Tasks from the sorter come out ordered size descending.
   struct ncclTaskColl* task = ncclTaskCollSorterDequeueAll(&planner->collSorter);
   // Tasks are assembled by (fn,op,ty) size ascending.
@@ -648,7 +409,7 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
     void* regBufSend[NCCL_MAX_LOCAL_RANKS];
     void* regBufRecv[NCCL_MAX_LOCAL_RANKS];
     bool regNeedConnect = true;
-    registerCollBuffers(comm, task, regBufSend, regBufRecv, &planner->collCleanupQueue, &regNeedConnect);
+    ncclRegisterCollNvlsBuffers(comm, task, regBufSend, regBufRecv, &planner->collCleanupQueue, &regNeedConnect);
 
     if (comm->runtimeConn && comm->initAlgoChannels[task->algorithm] == false) {
       if (task->algorithm == NCCL_ALGO_NVLS_TREE && comm->initAlgoChannels[NCCL_ALGO_NVLS] == false && regNeedConnect == true) {
@@ -662,32 +423,28 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
       }
     }
 
-    struct ncclDevWorkColl devWork = {};
-    devWork.sendbuff = (void*)task->sendbuff;
-    devWork.recvbuff = (void*)task->recvbuff;
-    devWork.sendbuffOffset = task->sendbuffOffset;
-    devWork.recvbuffOffset = task->recvbuffOffset;
-    devWork.sendbuffRmtAddrs = task->sendbuffRmtAddrs;
-    devWork.recvbuffRmtAddrs = task->recvbuffRmtAddrs;
-    devWork.root = task->root;
-    devWork.nWarps = task->nWarps;
-    devWork.redOpArg = task->opDev.scalarArg;
-    devWork.redOpArgIsPtr = task->opDev.scalarArgIsPtr;
-    devWork.oneNode = (comm->nNodes == 1);
-    devWork.regUsed = task->regBufType;
+    if (task->algorithm == NCCL_ALGO_NVLS_TREE || task->algorithm == NCCL_ALGO_NVLS) {
+      struct ncclDevWorkColl devWork = {};
+      devWork.sendbuff = (void*)task->sendbuff;
+      devWork.recvbuff = (void*)task->recvbuff;
+      devWork.sendbuffOffset = task->sendbuffOffset;
+      devWork.recvbuffOffset = task->recvbuffOffset;
+      devWork.sendbuffRmtAddrs = task->sendbuffRmtAddrs;
+      devWork.recvbuffRmtAddrs = task->recvbuffRmtAddrs;
+      devWork.root = task->root;
+      devWork.nWarps = task->nWarps;
+      devWork.redOpArg = task->opDev.scalarArg;
+      devWork.redOpArgIsPtr = task->opDev.scalarArgIsPtr;
+      devWork.oneNode = (comm->nNodes == 1);
+      devWork.netRegUsed = devWork.regUsed = 0;
+      if (task->regBufType & NCCL_NET_REG_BUFFER)
+        devWork.netRegUsed = 1;
+      if (task->regBufType & (NCCL_IPC_REG_BUFFER | NCCL_NVLS_REG_BUFFER))
+        devWork.regUsed = 1;
 
-    struct ncclWorkList* workNode;
-    switch (task->regBufType) {
-    case NCCL_REGULAR_BUFFER:
-    case NCCL_IPC_REG_BUFFER:
-    case NCCL_COLLNET_REG_BUFFER:
-      { workNode = ncclMemoryStackAllocInlineArray<ncclWorkList, ncclDevWorkColl>(&comm->memScoped, 1);
-        workNode->workType = ncclDevWorkTypeColl;
-        workNode->size = sizeof(struct ncclDevWorkColl);
-        memcpy((void*)(workNode+1), (void*)&devWork, workNode->size);
-      } break;
-    case NCCL_NVLS_REG_BUFFER:
-      { struct ncclDevWorkCollReg workReg = {};
+      struct ncclWorkList* workNode;
+      if (task->regBufType & NCCL_NVLS_REG_BUFFER) {
+        struct ncclDevWorkCollReg workReg = {};
         workReg.coll = devWork; // C++ struct assignment
         /* NVLS only has one send and recv buffer registered */
         workReg.dnInputs[0] = regBufSend[0];
@@ -695,15 +452,16 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
         workNode = ncclMemoryStackAllocInlineArray<ncclWorkList, ncclDevWorkCollReg>(&comm->memScoped, 1);
         workNode->workType = ncclDevWorkTypeCollReg;
         workNode->size = sizeof(struct ncclDevWorkCollReg);
-        memcpy((void*)(workNode+1), (void*)&workReg, workNode->size);
-      } break;
-    default:
-      /* impossible value */
-      WARN("Invalid regBufType %d", task->regBufType);
-      return ncclInvalidArgument;
-    }
+        memcpy((void*)(workNode + 1), (void*)&workReg, workNode->size);
+      } else {
+        workNode = ncclMemoryStackAllocInlineArray<ncclWorkList, ncclDevWorkColl>(&comm->memScoped, 1);
+        workNode->workType = ncclDevWorkTypeColl;
+        workNode->size = sizeof(struct ncclDevWorkColl);
+        memcpy((void*)(workNode + 1), (void*)&devWork, workNode->size);
+      }
 
-    ncclIntruQueueEnqueue(&planner->collWorkQueue, workNode);
+      ncclIntruQueueEnqueue(&planner->tmpCollWorkQueue, workNode);
+    }
     task = task->next;
   }
 
@@ -875,15 +633,32 @@ static ncclResult_t scheduleCollTasksToPlan(
         struct ncclProxyOp* proxyOp;
         if (c == (int)devWork->channelLo) {
           proxyOp = &proxyOpLo;
+          proxyOp->loopOffset = 0;
+          proxyOp->channelSize = countLo * elementSize;
         } else if (c == (int)devWork->channelHi) {
           proxyOp = &proxyOpHi;
+          proxyOp->loopOffset = (countLo + nMidChannels * countMid) * elementSize;
+          proxyOp->channelSize = countHi * elementSize;
         } else {
           proxyOp = &proxyOpMid;
+          proxyOp->loopOffset = (countLo + (c - devWork->channelLo - 1) * countMid) * elementSize;
+          proxyOp->channelSize = countMid * elementSize;
         }
         proxyOp->channelId = c;
         proxyOp->opCount = proxyOpId;
         proxyOp->task.coll = task;
         proxyOp->rank = comm->rank;
+        proxyOp->ringAlgo = NULL;
+        if (proxyOp->reg && task->algorithm == NCCL_ALGO_RING && (task->recvNetHandles[c] || task->sendNetHandles[c])) {
+          if (task->func == ncclFuncAllGather) {
+            proxyOp->ringAlgo = new RingAGAlgorithm(task->sendbuff, task->recvbuff, comm->nRanks, comm->channels[c].ring.userRanks, proxyOp->chunkSteps, proxyOp->sliceSteps, proxyOp->chunkSize, proxyOp->sliceSize, proxyOp->loopOffset, proxyOp->channelSize, elementSize, task->count * elementSize, task->sendNetHandles[c], task->recvNetHandles[c], task->srecvNetHandles[c]);
+          } else if (task->func == ncclFuncAllReduce) {
+            proxyOp->ringAlgo = new RingARAlgorithm(task->sendbuff, task->recvbuff, comm->nRanks, comm->channels[c].ring.index, proxyOp->chunkSteps, proxyOp->sliceSteps, proxyOp->chunkSize, proxyOp->sliceSize, proxyOp->loopOffset, proxyOp->channelSize, elementSize, task->sendNetHandles[c], task->recvNetHandles[c], task->srecvNetHandles[c]);
+          } else if (task->func == ncclFuncBroadcast) {
+            proxyOp->ringAlgo = new RingBCAlgorithm(task->sendbuff, task->recvbuff, comm->rank, task->root, comm->nRanks, comm->channels[c].ring.userRanks, proxyOp->chunkSteps, proxyOp->sliceSteps, proxyOp->chunkSize, proxyOp->sliceSize, proxyOp->loopOffset, proxyOp->channelSize, task->sendNetHandles[c], task->recvNetHandles[c], task->srecvNetHandles[c]);
+          }
+          proxyOp->ringAlgo->incRefCount();
+        }
         addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes);
         // Coverity reports "proxyOp->connection" as being possibly uninitialized.  It's hard to
         // determine if that's actually true but it's also not clear if that would be an issue.
@@ -900,6 +675,10 @@ static ncclResult_t scheduleCollTasksToPlan(
     }
 
     if (comm->rank == 0) {
+      INFO(NCCL_TUNING, "%s: %ld Bytes -> Algo %s proto %s channel{Lo..Hi}={%d..%d}",
+        ncclFuncToString(task->func), task->count * ncclTypeSize(task->datatype), ncclAlgoToString(task->algorithm),
+        ncclProtoToString(task->protocol), devWork->channelLo, devWork->channelHi);
+
       if (task->isCollnet) {
         TRACE(NCCL_COLL, "Collective %s(%s, %s, %s, %s) count=%ld devFuncId=%d channel{Lo..Hi}={%d..%d} count=%ld chunkCount=%d",
           ncclFuncToString(task->func), ncclDevRedOpToString(task->opDev.op),
@@ -956,6 +735,7 @@ static ncclResult_t addP2pToPlan(
   bool protoLL[2] = {!selfSend, !selfSend};
   bool network[2] = {false, false};
   bool proxySameProcess[2] = {true, true};
+  void** handles[2] = {NULL, NULL};
   uint8_t base = ncclP2pChannelBaseForRound(comm, p2pRound);
   if (!selfSend) {
     for (int part=0; part < nChannelsMax; part++) {
@@ -981,7 +761,7 @@ static ncclResult_t addP2pToPlan(
   int chunkSize[2];
   int chunkDataSize[2];
   int chunkDataSize_u32fp8[2];
-  bool registered[2] = {false, false};
+  bool netRegistered[2] = {false, false};
   bool ipcRegistered[2] = {false, false};
 
   for (int dir=0; dir < 2; dir++) { // 0=recv, 1=send
@@ -1007,10 +787,20 @@ static ncclResult_t addP2pToPlan(
     if (protocol[dir] == NCCL_PROTO_LL) chunkSize[dir] *= 2;
 
     if (network[dir]) {
-      if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE) {
-        struct ncclReg* regRecord;
-        NCCLCHECK(ncclRegFind(comm, addrs[dir], bytes[dir], &regRecord));
-        registered[dir] = regRecord && regRecord->nDevs;
+      if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && (ncclPxnDisable(comm) || !comm->isAllNvlink)) {
+        int regFlag = 0;
+        NCCLCHECK(ncclCalloc(&handles[dir], nChannelsMax));
+        for (int part = 0; part < nChannelsMax; part++) {
+          int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, part);
+          struct ncclChannelPeer** channelPeers = comm->channels[channelId].peers;
+          int peerRank = dir ? sendRank : recvRank;
+          struct ncclConnector* conn = dir ? &channelPeers[peerRank]->send[connIndex]
+            : &channelPeers[peerRank]->recv[connIndex];
+          if (conn->conn.flags & NCCL_DIRECT_NIC)
+            ncclRegisterP2pNetBuffer(comm, addrs[dir], bytes[dir], conn, &regFlag, &handles[dir][part], &plan->cleanupQueue);
+          if (!regFlag) break;
+        }
+        netRegistered[dir] = regFlag ? true : false;
       }
     } else if (bytes[dir] > 0 && addrs[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && !selfSend) {
       int peerRank = dir ? sendRank : recvRank;
@@ -1020,12 +810,12 @@ static ncclResult_t addP2pToPlan(
       struct ncclConnector* conn = dir ? &channelPeers[peerRank]->send[connIndex]
         : &channelPeers[peerRank]->recv[connIndex];
       void* regAddr = NULL;
-      if (conn->conn.flags & (NCCL_IPC_WRITE | NCCL_IPC_READ | NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) {
+      if (conn->conn.flags & (NCCL_P2P_WRITE | NCCL_P2P_READ)) {
         // We require users registering buffers on both sides
-        NCCLCHECK(registerP2pBuffer(comm, addrs[dir], peerRank, bytes[dir], &regFlag, &regAddr, &plan->cleanupQueue));
+        NCCLCHECK(ncclRegisterP2pIpcBuffer(comm, addrs[dir], bytes[dir], peerRank, &regFlag, &regAddr, &plan->cleanupQueue));
         if (regFlag) {
-          if (dir == 0 && conn->conn.flags & (NCCL_IPC_WRITE | NCCL_DIRECT_WRITE)) recvAddr = regAddr;
-          else if (dir == 1 && conn->conn.flags & (NCCL_IPC_READ | NCCL_DIRECT_READ)) sendAddr = regAddr;
+          if (dir == 0 && (conn->conn.flags & NCCL_P2P_WRITE)) recvAddr = regAddr;
+          else if (dir == 1 && (conn->conn.flags & NCCL_P2P_READ)) sendAddr = regAddr;
         }
       }
       ipcRegistered[dir] = regFlag ? true : false;
@@ -1057,7 +847,7 @@ static ncclResult_t addP2pToPlan(
   work->channelBase = base;
   work->nSendChannels = nChannels[1];
   work->sendProtoLL = protoLL[1];
-  work->sendRegistered = registered[1];
+  work->sendNetReg = netRegistered[1];
   work->sendIpcReg = ipcRegistered[1];
   work->sendChunkSize_u32fp8 = chunkDataSize_u32fp8[1];
   work->sendRank = sendRank;
@@ -1065,7 +855,7 @@ static ncclResult_t addP2pToPlan(
   work->sendBytes = sendBytes==-1 ? 0 : sendBytes;
   work->nRecvChannels = nChannels[0];
   work->recvProtoLL = protoLL[0];
-  work->recvRegistered = registered[0];
+  work->recvNetReg = netRegistered[0];
   work->recvIpcReg = ipcRegistered[0];
   work->recvChunkSize_u32fp8 = chunkDataSize_u32fp8[0];
   work->recvRank = recvRank;
@@ -1084,7 +874,7 @@ static ncclResult_t addP2pToPlan(
     op->protocol = protocol[dir];
     op->pattern = dir ? ncclPatternSend : ncclPatternRecv;
     op->chunkSize = chunkSize[dir];
-    op->reg = registered[dir];
+    op->reg = netRegistered[dir];
     op->coll = p2pTasks[dir] ? p2pTasks[dir]->func : 0;
     op->task.p2p = p2pTasks[dir];
     op->rank = comm->rank;
@@ -1116,9 +906,10 @@ static ncclResult_t addP2pToPlan(
         size_t partBeg, partEnd;
         ncclP2pPartBounds(nParts, part, bytes, &partBeg, &partEnd);
         if (proxyOps[dir].reg) {
-          proxyOps[dir].nsteps = 1;
-          proxyOps[dir].recvbuff = (uint8_t*)addr+partBeg;
-          proxyOps[dir].nbytes = partEnd-partBeg;
+          (dir ? proxyOps[dir].sendbuff : proxyOps[dir].recvbuff) = (uint8_t*)addr + partBeg;
+          (dir ? proxyOps[dir].sendMhandle : proxyOps[dir].recvMhandle) = handles[dir][part];
+          proxyOps[dir].nbytes = partEnd - partBeg;
+          proxyOps[dir].nsteps = DIVUP(proxyOps[dir].nbytes, NCCL_MAX_NET_SIZE);
         } else {
           proxyOps[dir].nsteps = divUp(partEnd-partBeg, chunkDataSize);
           proxyOps[dir].nbytes = std::min(partEnd-partBeg, chunkDataSize);
@@ -1198,6 +989,8 @@ static ncclResult_t scheduleP2pTasksToPlan(
         // Skip send to self in-place (we don't need to support this).
         ncclIntruQueueDequeue(&peers[sendRank].sendQueue);
         ncclIntruQueueDequeue(&peers[recvRank].recvQueue);
+        ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, send);
+        ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, recv);
         comm->planner.nTasksP2p -= 2;
       } else {
         // Ensure room for worst case of one new batch per channel.
@@ -1302,8 +1095,13 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla
     plan->kernelArgs->workBuf = comm->workFifoBufDev;
     break;
   case ncclDevWorkStorageTypePersistent:
+    // We rely on 16-byte alignment
+    #if __cplusplus >= 201103L
+    fifoBufHost = aligned_alloc(16, ROUNDUP(workBytes, 16));
+    #else
     static_assert(16 <= alignof(max_align_t), "We rely on 16-byte alignment.");
     fifoBufHost = malloc(workBytes);
+    #endif
     fifoCursor = 0;
     fifoMask = ~0u;
     break;
@@ -1346,37 +1144,41 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla
     break;
   case ncclDevWorkStorageTypePersistent:
     { ncclResult_t result = ncclSuccess;
+      struct uploadWork_cleanup_t* cleanup = nullptr;
       cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
       void* fifoBufDev = nullptr;
-      CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
+      CUDACHECKGOTO(cudaThreadExchangeStreamCaptureMode(&mode), result, fail);
 
       // Acquire deviceStream to gain access to deviceStream.cudaStream. Since the
       // user's graph will be launched later, and it also acquires the deviceStream,
       // it will observe this upload.
-      NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), result, finish_scope);
+      NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), result, fail);
 
-      CUDACHECKGOTO(cudaMallocAsync(&fifoBufDev, workBytes, comm->memPool, comm->sharedRes->deviceStream.cudaStream), result, finish_scope);
+      CUDACHECKGOTO(cudaMallocAsync(&fifoBufDev, workBytes, comm->memPool, comm->sharedRes->deviceStream.cudaStream), result, fail);
       plan->workBufPersistent = fifoBufDev;
       plan->kernelArgs->workBuf = fifoBufDev;
 
-      CUDACHECKGOTO(cudaMemcpyAsync(fifoBufDev, fifoBufHost, workBytes, cudaMemcpyDefault, comm->sharedRes->deviceStream.cudaStream), result, finish_scope);
+      // coverity[uninit_use_in_call:FALSE] => fifoBufHost is never NULL
+      CUDACHECKGOTO(cudaMemcpyAsync(fifoBufDev, fifoBufHost, workBytes, cudaMemcpyDefault, comm->sharedRes->deviceStream.cudaStream), result, fail);
       cudaEvent_t memcpyDone;
-      CUDACHECKGOTO(cudaEventCreateWithFlags(&memcpyDone, cudaEventDisableTiming), result, finish_scope);
-      CUDACHECKGOTO(cudaEventRecord(memcpyDone, comm->sharedRes->deviceStream.cudaStream), result, finish_scope);
+      CUDACHECKGOTO(cudaEventCreateWithFlags(&memcpyDone, cudaEventDisableTiming), result, fail);
+      CUDACHECKGOTO(cudaEventRecord(memcpyDone, comm->sharedRes->deviceStream.cudaStream), result, fail);
 
-      struct uploadWork_cleanup_t* cleanup;
-      NCCLCHECK(ncclCalloc(&cleanup, 1));
+      NCCLCHECKGOTO(ncclCalloc(&cleanup, 1), result, fail);
       cleanup->base.fn = uploadWork_cleanup_fn;
       cleanup->base.event = memcpyDone;
       cleanup->hostBuf = fifoBufHost;
-      ncclIntruQueueEnqueue(&comm->eventCallbackQueue, &cleanup->base);
+      ncclIntruQueueEnqueue(&comm->eventCallbackQueue, (struct ncclCommEventCallback *)cleanup);
 
-      NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream), result, finish_scope);
-      NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), result, finish_scope);
+      NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream), result, fail);
+      NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), result, fail);
 
     finish_scope:
-      CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-      if (result != ncclSuccess) return result;
+      if (mode != cudaStreamCaptureModeRelaxed) (void)cudaThreadExchangeStreamCaptureMode(&mode);
+      return result;
+    fail:
+      if (!cleanup) free(fifoBufHost);
+      goto finish_scope;
     } break;
   default: break;
   }
@@ -1388,6 +1190,7 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan*
   uint64_t p2pOpBump[MAXCHANNELS] = {/*0...*/};
   // Advance comm's collOpCount by number of colls in this plan.
   comm->sharedRes->collOpCount += plan->collOpCount;
+  comm->collOpCount += plan->collOpCount;
 
   struct ncclProxyOp* op = ncclIntruQueueHead(&plan->proxyOpQueue);
   while (op != nullptr) {
@@ -1410,18 +1213,9 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan*
 
     NCCLCHECK(ncclProxySaveOp(comm, op, nullptr));
     op->opCount = oldId; // Restore for next uploadProxyOps()
-
-    struct ncclProxyOp* opNext = op->enqNext;
-    if (!plan->persistent) {
-      // Non-persistent kernels upload ops only once so can be free'd here.
-      ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, op);
-    }
-    op = opNext;
+    op = op->enqNext;
   }
 
-  // Erase proxyOpQueue since all ops were free'd back to mempool.
-  if (!plan->persistent) ncclIntruQueueConstruct(&plan->proxyOpQueue);
-
   for (int c=0; c < MAXCHANNELS; c++) {
     // Advance channel's p2pOpCount by number of p2p's in this plan channel.
     comm->sharedRes->p2pOpCount[c] += p2pOpBump[c];
@@ -1450,6 +1244,8 @@ static void CUDART_CB hostStreamPlanCallback(void *plan_) {
   if (result != ncclSuccess) {
     WARN("hostStreamPlanCallback() failed : %s", ncclGetErrorString(result));
   }
+  if (!plan->persistent) ncclAtomicRefCountDecrement(&plan->comm->noncapturedRefs);
+  return;
 }
 
 static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* me) {
@@ -1462,32 +1258,41 @@ static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback*
       CUDACHECK(cudaFree(plan->workBufPersistent));
       CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
     }
-    struct ncclProxyOp* q = ncclIntruQueueHead(&plan->proxyOpQueue);
-    while (q != nullptr) {
-      struct ncclProxyOp* q1 = q->enqNext;
-      ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, q);
-      q = q1;
-    }
-    struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue);
-    while (ct != nullptr) {
-      struct ncclTaskColl* ct1 = ct->next;
-      ncclMemoryPoolFree(&comm->memPool_ncclTaskColl, ct);
-      ct = ct1;
-    }
-    struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue);
-    while (pt != nullptr) {
-      struct ncclTaskP2p* pt1 = pt->next;
-      ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, pt);
-      pt = pt1;
-    }
-    ncclResult_t result = ncclSuccess;
-    while (!ncclIntruQueueEmpty(&plan->cleanupQueue)) {
-      struct ncclCommCallback* cb = ncclIntruQueueDequeue(&plan->cleanupQueue);
-      ncclResult_t res1 = cb->fn(comm, cb); // Expect to reclaim memory of cb
-      if (res1 != ncclSuccess) result = res1;
-    }
-    NCCLCHECK(result);
   }
+  // Free coll tasks
+  struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue);
+  while (ct != nullptr) {
+    struct ncclTaskColl* ct1 = ct->next;
+    free(ct->sendNetHandles);
+    free(ct->recvNetHandles);
+    free(ct->srecvNetHandles);
+    ncclMemoryPoolFree(&comm->memPool_ncclTaskColl, ct);
+    ct = ct1;
+  }
+  // Free p2p tasks
+  struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue);
+  while (pt != nullptr) {
+    struct ncclTaskP2p* pt1 = pt->next;
+    ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, pt);
+    pt = pt1;
+  }
+  // Free proxy ops
+  struct ncclProxyOp* q = ncclIntruQueueHead(&plan->proxyOpQueue);
+  while (q != nullptr) {
+    struct ncclProxyOp* q1 = q->enqNext;
+    if (q->ringAlgo && q->ringAlgo->decRefCount() == 0) delete q->ringAlgo;
+    ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, q);
+    q = q1;
+  }
+  // Run other free callbacks
+  ncclResult_t result = ncclSuccess;
+  while (!ncclIntruQueueEmpty(&plan->cleanupQueue)) {
+    struct ncclCommCallback* cb = ncclIntruQueueDequeue(&plan->cleanupQueue);
+    ncclResult_t res1 = cb->fn(comm, cb); // Expect to reclaim memory of cb
+    if (res1 != ncclSuccess) result = res1;
+  }
+  NCCLCHECK(result);
+  // Free plan struct
   ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan);
   return ncclSuccess;
 }
@@ -1509,10 +1314,6 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
   planner->persistent = persistent;
   int nPlans = 0;
 
-  // Poll for callbacks sent to us from other threads. Typically these free
-  // resources from to our memory pools.
-  NCCLCHECK(ncclCommPollCallbacks(comm, /*waitSome=*/false));
-
   if (planner->nTasksColl + planner->nTasksP2p != 0) {
     do {
       memset(&planner->wipPlan, 0, sizeof(planner->wipPlan));
@@ -1577,7 +1378,7 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
     }
     NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, launchStream, &comm->sharedRes->deviceStream), result, failure);
 
-    if (persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking) {
+    if (persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking || __atomic_load_n(&comm->noncapturedRefs, __ATOMIC_ACQUIRE)) {
       // We have to launch host tasks to push proxy args. We are careful to only
       // do this if necessary since host tasks impose a high performance cost in CUDA.
       bool acquired = false;
@@ -1587,6 +1388,8 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
             acquired = true;
             NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->hostStream), result, failure);
           }
+          if (!persistent) ncclAtomicRefCountIncrement(&comm->noncapturedRefs);
+          plan->isHostCbEnq = true;
           NCCLCHECKGOTO(ncclStrongStreamLaunchHost(planner->capturingGraph, &comm->sharedRes->hostStream, hostStreamPlanCallback, plan), result, failure);
         }
       }
@@ -1602,6 +1405,7 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
       NCCLCHECKGOTO(ncclCudaGraphAddDestructor(planner->capturingGraph, persistentDestructor, (void*)planHead), result, failure);
     }
   }
+
 failure:
   return result;
 }
@@ -1694,7 +1498,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
 }
 
 ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) {
-  if (!(plan->persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking)) {
+  if (!(plan->persistent || ncclCudaLaunchBlocking || plan->isHostCbEnq)) {
     // We are not using the host stream for proxy ops and reclaimation submission.
     NCCLCHECK(hostStreamPlanTask(comm, plan));
   } else {
@@ -1778,8 +1582,7 @@ static void initCollCostTable(float** collCostTable) {
 static ncclResult_t updateCollCostTable(
     struct ncclComm* comm, struct ncclTaskColl* info, size_t nBytes,
     int collNetSupport, int nvlsSupport, int numPipeOps,
-    float** collCostTable, int* backupAlgo, int* backupProto, float* backupTime
-  ) {
+    float** collCostTable) {
   float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
 
   if (comm->nRanks == 1) {
@@ -1799,16 +1602,12 @@ static ncclResult_t updateCollCostTable(
     if (a == NCCL_ALGO_PAT && info->func == ncclFuncReduceScatter
         && (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv)) continue;
     for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-      bool backup;
-      float time;
-      NCCLCHECK(ncclTopoGetAlgoTime(comm, info->func, a, p, nBytes, numPipeOps, &time, &backup));
-      if (!backup) {
-        table[a][p] = time;
-      } else {
-        if (time >= 0.0 && time < *backupTime) {
-          *backupAlgo = a;
-          *backupProto = p;
-          *backupTime = time;
+      NCCLCHECK(ncclTopoGetAlgoTime(comm, info->func, a, p, nBytes, numPipeOps, &table[a][p]));
+      // Relegate fp8 ring reductions on more than 8 ranks to be least preferred: such
+      // reductions are deep enough that they incur precision loss.
+      if (info->datatype == ncclFloat8e4m3 || info->datatype == ncclFloat8e5m2) {
+        if (a == NCCL_ALGO_RING && comm->nRanks > 8) {
+          table[a][p] *= 1024.0; // Any factor large enough to act as a partition between lossy and non-lossy algos.
         }
       }
     }
@@ -1819,7 +1618,7 @@ static ncclResult_t updateCollCostTable(
 
 static ncclResult_t topoGetAlgoInfo(
     struct ncclComm* comm, struct ncclTaskColl* info, size_t nBytes,
-    float** collCostTable, int backupAlgo, int backupProto, float backupTime, ncclSimInfo_t* simInfo
+    float** collCostTable, ncclSimInfo_t* simInfo
   ) {
   float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
 
@@ -1844,15 +1643,19 @@ static ncclResult_t topoGetAlgoInfo(
   // Yes, we are first assigning and then testing if protocol is sane, but that's OK in this case.
   // coverity[check_after_sink]
   if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) {
-    if (backupAlgo == NCCL_ALGO_UNDEF || backupProto == NCCL_PROTO_UNDEF) {
-      WARN("Error : no algorithm/protocol available");
-      return ncclInternalError;
+    char ncclAlgoEnvStr[1024] = "";
+    char ncclProtoEnvStr[1024] = "";
+    char* algoEnv = getenv("NCCL_ALGO");
+    if (algoEnv) {
+      snprintf(ncclAlgoEnvStr, 1023, " NCCL_ALGO was set to %s.", algoEnv);
     }
-    info->algorithm = backupAlgo;
-    info->protocol = backupProto;
-    time = backupTime;
+    char* protoEnv = getenv("NCCL_PROTO");
+    if (protoEnv) {
+      snprintf(ncclProtoEnvStr, 1023, " NCCL_PROTO was set to %s.", protoEnv);
+    }
+    WARN("Error : no algorithm/protocol available for function %s with datatype %s.%s%s", ncclFuncToString(info->func), ncclDatatypeToString(info->datatype), ncclAlgoEnvStr, ncclProtoEnvStr);
+    return (algoEnv || protoEnv) ? ncclInvalidUsage : ncclInternalError;
   }
-  if (comm->rank == 0) INFO(NCCL_TUNING, "%s: %ld Bytes -> Algo %d proto %d time %f", ncclFuncToString(info->func), nBytes, info->algorithm, info->protocol, time);
   if (simInfo) simInfo->estimatedTime = time;
   TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", nBytes, info->algorithm, info->protocol, time);
 
@@ -1913,19 +1716,24 @@ static ncclResult_t getAlgoInfo(
   info->algorithm = NCCL_ALGO_UNDEF;
   info->protocol = NCCL_PROTO_UNDEF;
   int nMaxChannels = 0;
-  int backupAlgo = NCCL_ALGO_UNDEF;
-  int backupProto = NCCL_PROTO_UNDEF;
-  float backupTime = 3600000000.0;
   float collCostTable[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
   initCollCostTable((float **)collCostTable);
-  NCCLCHECK(updateCollCostTable(comm, info, nBytes, collNetSupport, nvlsSupport, numPipeOps, (float **)collCostTable, &backupAlgo, &backupProto, &backupTime));
+  NCCLCHECK(updateCollCostTable(comm, info, nBytes, collNetSupport, nvlsSupport, numPipeOps, (float **)collCostTable));
   if (comm->tuner != NULL) {
+    size_t elementSize = ncclTypeSize(info->datatype);
+    size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count);
+    size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count);
+    struct ncclReg* regSendBuf;
+    struct ncclReg* regRecvBuf;
+    NCCLCHECK(ncclRegFind(comm, info->sendbuff, sendbuffSize, &regSendBuf));
+    NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &regRecvBuf));
+    int regBuff = ((regSendBuf && regRecvBuf) || (ncclCudaGraphValid(comm->planner.capturingGraph) && ncclParamGraphRegister()));
     NCCLCHECK(comm->tuner->getCollInfo(
           comm->tunerContext, info->func, nBytes,
           numPipeOps, (float **)collCostTable, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
-          &nMaxChannels));
+          regBuff, &nMaxChannels));
   }
-  NCCLCHECK(topoGetAlgoInfo(comm, info, nBytes, (float **)collCostTable, backupAlgo, backupProto, backupTime, simInfo));
+  NCCLCHECK(topoGetAlgoInfo(comm, info, nBytes, (float **)collCostTable, simInfo));
   info->nMaxChannels = nMaxChannels == 0 ? info->nMaxChannels : nMaxChannels;
   return ncclSuccess;
 }
@@ -1975,37 +1783,7 @@ static ncclResult_t calcCollChunking(
   }
 
   int nstepsPerLoop, nchunksPerLoop;
-  switch (pattern) {
-  case ncclPatternTreeUp:
-  case ncclPatternTreeDown:
-  case ncclPatternTreeUpDown:
-  case ncclPatternPatUp:
-  case ncclPatternPatDown:
-  case ncclPatternPipelineFrom:
-  case ncclPatternPipelineTo:
-  case ncclPatternCollnetChain:
-    nstepsPerLoop = nchunksPerLoop = 1;
-    break;
-  case ncclPatternNvls:
-    nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].nvls.nHeads;
-    break;
-  case ncclPatternCollnetDirect:
-    nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].collnetDirect.nHeads;
-    break;
-  case ncclPatternRing:
-    nstepsPerLoop = comm->nRanks-1; nchunksPerLoop = comm->nRanks;
-    break;
-  case ncclPatternRingTwice:
-    nstepsPerLoop = 2*(comm->nRanks-1); nchunksPerLoop = comm->nRanks;
-    break;
-  case ncclPatternNvlsTree:
-    nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].nvls.nHeads;
-    break;
-  default:
-    WARN("Unknown pattern %d", pattern);
-    return ncclInternalError;
-  }
-
+  size_t loopOffset = 0;
   int stepSize   = comm->buffSizes[info->protocol]/NCCL_STEPS;
   int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1;
   int sliceSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->sliceSteps : 1;
@@ -2066,22 +1844,60 @@ static ncclResult_t calcCollChunking(
   // Compute directFlags of work struct.
   if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) {
     // Set direct direction for broadcast-gather (read or write)
-    *outDirectFlags = (nBytes/nChannels <= 1024 * 4) ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
+    *outDirectFlags = (nBytes/nChannels <= 1024 * 4) ? NCCL_P2P_READ : NCCL_P2P_WRITE;
   } else {
     *outDirectFlags = 0;
   }
 
   // Compute nSteps for proxies
-  //if (comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->func, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol);
   chunkSize = chunkSize / grainSize * grainSize; // align chunkSize to multiple grainSize
-  int nLoops = (int)DIVUP(nBytes, size_t(nChannels)*nchunksPerLoop*chunkSize);
+  switch (pattern) {
+  case ncclPatternTreeUp:
+  case ncclPatternTreeDown:
+  case ncclPatternTreeUpDown:
+  case ncclPatternPatUp:
+  case ncclPatternPatDown:
+  case ncclPatternPipelineFrom:
+  case ncclPatternPipelineTo:
+  case ncclPatternCollnetChain:
+    nstepsPerLoop = nchunksPerLoop = 1;
+    break;
+  case ncclPatternNvls:
+    nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].nvls.nHeads;
+    loopOffset = nChannels * chunkSize * comm->channels[0].nvls.headRank;
+    break;
+  case ncclPatternCollnetDirect:
+    nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].collnetDirect.nHeads;
+    loopOffset = nChannels * chunkSize * comm->channels[0].collnetDirect.headRank;
+    break;
+  case ncclPatternRing:
+    nstepsPerLoop = comm->nRanks-1; nchunksPerLoop = comm->nRanks;
+    break;
+  case ncclPatternRingTwice:
+    nstepsPerLoop = 2*(comm->nRanks-1); nchunksPerLoop = comm->nRanks;
+    break;
+  case ncclPatternNvlsTree:
+    nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].nvls.nHeads;
+    break;
+  default:
+    WARN("Unknown pattern %d", pattern);
+    return ncclInternalError;
+  }
+
+  // Compute nSteps for proxies
+  size_t loopSize = size_t(nChannels)*nchunksPerLoop*chunkSize;
+  int nLoops = (int)DIVUP(nBytes, loopSize);
   memset(proxyOp, 0, sizeof(*proxyOp));
   proxyOp->nsteps = nstepsPerLoop * nLoops * chunkSteps;
   proxyOp->sliceSteps = sliceSteps;
   proxyOp->chunkSteps = chunkSteps;
   proxyOp->chunkSize = chunkSize;
+  proxyOp->sliceSize = chunkSize / chunkSteps * sliceSteps;
+  proxyOp->loopSize = loopSize;
+  proxyOp->loopOffset = loopOffset;
   proxyOp->protocol = info->protocol;
   proxyOp->dtype = info->datatype;
+  proxyOp->algorithm = info->algorithm;
   if (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv) {
     proxyOp->redOp = ncclSum; // Network sees avg as sum
   } else {
@@ -2090,17 +1906,50 @@ static ncclResult_t calcCollChunking(
   proxyOp->pattern = pattern;
   proxyOp->coll = info->func;
   proxyOp->root = info->root;
+  proxyOp->isOneRPN = comm->isOneRPN;
   // This is used by P2P to reduce the receive buffer size. We don't use it in collectives
   // because some protocols need to transmit more than the total size, plus they sometimes
   // round up
   proxyOp->nbytes = stepSize*sliceSteps;
 
-  if (info->regBufType == NCCL_COLLNET_REG_BUFFER) {
+  if (info->regBufType & NCCL_NET_REG_BUFFER) {
     proxyOp->reg = 1;
-    proxyOp->nsteps = DIVUP(nBytes, NCCL_MAX_COLLNET_SIZE);
-    proxyOp->sendMhandle = info->sendMhandle;
+    if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT || info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) {
+      if (proxyOp->isOneRPN) {
+        proxyOp->nsteps = 1;
+        proxyOp->loopOffset = 0;
+        proxyOp->sendbuff = (uint8_t*)info->sendbuff;
+        proxyOp->sendMhandle = info->sendMhandle;
+      } else {
+        if (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) {
+          proxyOp->nbytes = nBytes / nchunksPerLoop;
+          proxyOp->loopSize = proxyOp->loopSize / nchunksPerLoop;
+          proxyOp->loopOffset = 0;
+          if (info->func == ncclFuncAllGather) {
+            proxyOp->sendbuff = (uint8_t*)info->sendbuff;
+            proxyOp->sendMhandle = info->sendMhandle;
+          }
+        } else {
+          proxyOp->sendbuff = (uint8_t*)info->recvbuff;
+          proxyOp->sendMhandle = info->recvMhandle;
+        }
+      }
+    } else if (info->algorithm == NCCL_ALGO_RING) {
+      if (proxyOp->isOneRPN && info->func == ncclFuncAllGather) {
+        proxyOp->chunkSize = NCCL_MAX_NET_SIZE;
+        proxyOp->sliceSize = NCCL_MAX_NET_SIZE;
+        proxyOp->chunkSteps = 1;
+        proxyOp->sliceSteps = 1;
+        proxyOp->loopSize = size_t(nChannels) * nchunksPerLoop * proxyOp->chunkSize;
+        proxyOp->nsteps = DIVUP(nBytes, proxyOp->loopSize) * nstepsPerLoop;
+        proxyOp->loopOffset = 0;
+      }
+    } else {
+      WARN("Net registration invalid algorithm %s", ncclAlgoToString(info->algorithm));
+      return ncclInternalError;
+    }
+
     proxyOp->recvMhandle = info->recvMhandle;
-    proxyOp->sendbuff = (uint8_t*)info->sendbuff;
     proxyOp->recvbuff = (uint8_t*)info->recvbuff;
     proxyOp->nbytes = nBytes;
   } else {
@@ -2119,7 +1968,7 @@ static ncclResult_t calcCollChunking(
     proxyOp->nbytes = DIVUP(nBytes, nChannels);
   }
 
-  *outChunkSize = chunkSize;
+  *outChunkSize = proxyOp->chunkSize;
   return ncclSuccess;
 }
 
@@ -2130,10 +1979,13 @@ static ncclResult_t hostToDevRedOp(
     int8_t   i8; uint8_t   u8;
     int32_t i32; uint32_t u32;
     int64_t i64; uint64_t u64;
-    half f16; float f32; double f64;
+    __half f16; float f32; double f64;
     #if defined(__CUDA_BF16_TYPES_EXIST__)
       __nv_bfloat16 bf16;
     #endif
+    #if defined(__CUDA_FP8_TYPES_EXIST__)
+      __nv_fp8_storage_t f8;
+    #endif
     void *ptr;
   };
   u64 = 0;
@@ -2144,7 +1996,8 @@ static ncclResult_t hostToDevRedOp(
   if (nbits <= 0) return ncclInvalidArgument;
   uint64_t allBits = uint64_t(-1)>>(64-nbits);
   uint64_t signBit = allBits^(allBits>>1);
-
+  bool datatype_signed = false;
+  
   switch (int(op)) {
   case ncclSum:  opFull->op = ncclDevSum;  break;
   case ncclProd: opFull->op = ncclDevProd; break;
@@ -2162,10 +2015,22 @@ static ncclResult_t hostToDevRedOp(
   case ncclAvg:
     switch ((int)datatype) {
     case ncclInt8:  case ncclInt32:  case ncclInt64:
+      datatype_signed = true;
+      // no break, we want to fall through...
     case ncclUint8: case ncclUint32: case ncclUint64:
       opFull->op = ncclDevSumPostDiv;
-      u64 = comm->nRanks;
+      u64 = comm->nRanks<<1 | datatype_signed;
       break;
+    #if defined(__CUDA_FP8_TYPES_EXIST__)
+    case ncclFloat8e4m3:
+      opFull->op = ncclDevPreMulSum;
+      f8 = __nv_cvt_float_to_fp8(float(1.0/comm->nRanks), __NV_SATFINITE, __NV_E4M3);
+      break;
+    case ncclFloat8e5m2:
+      opFull->op = ncclDevPreMulSum;
+      f8 = __nv_cvt_float_to_fp8(float(1.0/comm->nRanks), __NV_SATFINITE, __NV_E5M2);
+      break;
+    #endif
     case ncclFloat16:
       opFull->op = ncclDevPreMulSum;
       f16 = __float2half(float(1.0/comm->nRanks)); // __double2half not supported pre CUDA 11.x
@@ -2257,6 +2122,13 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
     // Empty collectives can be discarded.
     if (info->count == 0) return ncclSuccess;
 
+    if (info->datatype == ncclFloat8e4m3 || info->datatype == ncclFloat8e5m2) {
+      if (comm->minCompCap < 90) {
+        WARN("FP8 reduction support begins with sm90 capable devices.");
+        return ncclInvalidArgument;
+      }
+    }
+
     // Copy reduction op state from op handle into info struct here since the
     // op handle may be destroyed before ncclGroupEnd().
     struct ncclDevRedOpFull opDev;
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index 999312a0df..6e93568265 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -248,11 +248,31 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
 NCCL_PARAM(IgnoreDisabledP2p, "IGNORE_DISABLED_P2P", 0);
 
 int ncclTopoUserP2pLevel = -1;
-ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank) {
+ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2,
+                              int* p2p, int *read, int* intermediateRank) {
+  int mnnvl = 0;
+  struct ncclPeerInfo* info1 = NULL;
+  struct ncclPeerInfo* info2 = NULL;
   *p2p = 0;
   if (read) *read = 0;
   if (intermediateRank) *intermediateRank = -1;
 
+  // Rule out different nodes / isolated containers
+  if (comm) {
+    info1 = comm->peerInfo+rank1;
+    info2 = comm->peerInfo+rank2;
+    if (info1->hostHash != info2->hostHash) {
+      if (comm->MNNVL) {
+        NCCLCHECK(ncclTopoCheckMNNVL(comm->topo, info1, info2, &mnnvl));
+        if (!mnnvl) return ncclSuccess;
+      } else {
+        return ncclSuccess;
+      }
+    } else if (info1->shmDev != info2->shmDev) {
+      return ncclSuccess;
+    }
+  }
+
   // Get GPUs from topology
   int g1, g2;
   NCCLCHECK(ncclTopoRankToIndex(system, rank1, &g1));
@@ -297,7 +317,8 @@ compare:
   if (*p2p == 1) {
     // NCCL_IGNORE_DISABLED_P2P=2 is used by unit tests that don't want to
     // validate against NVML at all since they are pretending to be on other hw.
-    if (g1 != g2 && ncclParamIgnoreDisabledP2p() != 2) {
+    if (g1 != g2 && (comm == NULL || (info1->hostHash == comm->peerInfo[comm->rank].hostHash &&
+                                      info1->hostHash == info2->hostHash)) && ncclParamIgnoreDisabledP2p() != 2) {
       int indexes[3] = {-1,-1,-1};
       int verticeN = 0;
       NCCLCHECK(ncclNvmlEnsureInitialized());
@@ -356,14 +377,14 @@ ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerIn
 NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
 int ncclTopoUserGdrLevel = -1;
 
-ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int64_t netId, int read, int* useGdr) {
+ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t netId, int read, int* useGdr) {
   *useGdr = 0;
 
   // Get GPU and NET
   int n, g;
   NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &n));
   struct ncclTopoNode* net = system->nodes[NET].nodes+n;
-  NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g));
+  NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
   struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
 
   // Check that both the NIC and GPUs support it
@@ -404,12 +425,32 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int6
     distance = proxyGpu->paths[NET][n].type;
   }
   if (distance > netGdrLevel) {
-    INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %lx / HCA %lx (distance %d > %d)", busId, netId, distance, netGdrLevel);
+    INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %d / HCA %lx (distance %d > %d)", rank, netId, distance, netGdrLevel);
     return ncclSuccess;
   }
 
   *useGdr = 1;
-  INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %lx / HCA %lx (distance %d <= %d), read %d", busId, netId, distance, netGdrLevel, read);
+  INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %d / HCA %lx (distance %d <= %d), read %d", rank, netId, distance, netGdrLevel, read);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail) {
+  // Sets *avail to true when ncclTopoCheckGdr enables GDR between `rank` and at
+  // least one NET node of `system`, false otherwise. Each NET node is probed with
+  // read=1 first, then read=0; the scan stops at the first success.
+  const int nNets = system->nodes[NET].count;
+  bool found = false;
+  *avail = false;
+  for (int idx = 0; idx < nNets && !found; idx++) {
+    const int64_t netId = system->nodes[NET].nodes[idx].id;
+    for (int read = 1; read >= 0 && !found; read--) {
+      int gdr = 0;
+      NCCLCHECK(ncclTopoCheckGdr(system, rank, netId, read, &gdr));
+      found = (gdr != 0);
+    }
+  }
+  // *avail stays false unless one of the probes above reported GDR as usable.
+  if (found) *avail = true;
   return ncclSuccess;
 }
 
@@ -417,12 +458,17 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int6
 NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 0);
 
 // Determine whether we need to flush the GDR recv buffers
-ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush) {
+ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush) {
+  *flush = 1; // Default to flushing; only cleared at the end for compute capability >= 90.
+  ncclNetProperties_t props;
+  NCCLCHECK(comm->ncclNet->getProperties(netDev, &props)); // Query the properties of net device `netDev`.
+  if (props.forceFlush == 1 || ncclParamNetForceFlush()) return ncclSuccess; // Forced by the device properties or the NET_FORCE_FLUSH param: keep *flush == 1.
   int g;
-  NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g));
+  struct ncclTopoSystem* system = comm->topo;
+  NCCLCHECK(ncclTopoRankToIndex(system, rank, &g)); // The GPU is now looked up by rank instead of by bus id.
   struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
   // Flush is required on Ampere and earlier
-  *flush = gpu->gpu.cudaCompCap < 90 ? 1 : ncclParamNetForceFlush();
+  if (gpu->gpu.cudaCompCap >= 90) *flush = 0; // Not forced above and compute capability >= 90: no flush needed.
   return ncclSuccess;
 }
 
@@ -516,7 +562,7 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks,
     NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netId, NULL, &proxyRank));
     if (proxyRank == comm->rank) continue;
     int useGdr;
-    NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->busId, netId, 1, &useGdr));
+    NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->rank, netId, 1, &useGdr));
     if (useGdr == 0) continue;
     int found = 0;
     for (int r=0; r<nr; r++) {
@@ -562,7 +608,8 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
   for (int g=0; g<system->nodes[GPU].count; g++) {
     for (int p=0; p<system->nodes[GPU].count; p++) {
       int p2p;
-      NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].gpu.rank, system->nodes[GPU].nodes[g].gpu.rank, &p2p, NULL, NULL));
+      NCCLCHECK(ncclTopoCheckP2p(comm, system, system->nodes[GPU].nodes[p].gpu.rank,
+                                 system->nodes[GPU].nodes[g].gpu.rank, &p2p, NULL, NULL));
       if (p2p == 0) {
         // Divert all traffic through the CPU
         int cpu;
@@ -618,7 +665,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
       if (gpu->paths[NET][n].type < PATH_PHB) {
         // Update path when we dont want to / can't use GPU Direct RDMA.
         int gdr;
-        NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr));
+        NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].gpu.rank, netNode->id, 0, &gdr));
         if (gdr == 0) {
           // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
           int localCpu;
diff --git a/src/graph/search.cc b/src/graph/search.cc
index ad6f580540..9b72ac1609 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -1142,7 +1142,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
       offset = strlen(line);
     }
     if (system->nodes[NET].count > 0) {
-      sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c+1]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c]));
+      sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c+1]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c+1]));
       offset = strlen(line);
     }
     INFO(NCCL_GRAPH, "%s", line);
diff --git a/src/graph/topo.cc b/src/graph/topo.cc
index 9771ae05cb..d758ac989b 100644
--- a/src/graph/topo.cc
+++ b/src/graph/topo.cc
@@ -296,7 +296,7 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
         NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset));
       } else {
         if (link->remNode->type == NET) {
-          sprintf(line+nextOffset, "%s/%lx-%lx (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id), link->remNode->net.asic, link->remNode->net.port, link->remNode->net.bw);
+          sprintf(line+nextOffset, "%s/%lx-%lx (%d/%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id), link->remNode->net.collSupport, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.bw);
         } else {
           sprintf(line+nextOffset, "%s/%lx-%lx", topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id));
         }
@@ -383,6 +383,7 @@ ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* s
     if (strcmp(xmlNet->name, "net") != 0) continue;
     int index;
     NCCLCHECK(xmlGetAttrIndex(xmlNet, "dev", &index));
+    // This means that the "dev" attribute wasn't set on this net xml node. That means it should not be added to the system topology graph
     if (index == -1) continue;
     NCCLCHECK(ncclTopoAddNet(xmlNet, system, nic, systemId));
   }
@@ -403,7 +404,7 @@ struct kvDict kvDictPciGen[] = {
   { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */
   { "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 },
   { NULL, 60 /* Default fallback */ } }; // x100 Mbps per lane
-ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent, int systemId) {
+ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent, int systemId, int numaId) {
   const char* str;
 
   int type;
@@ -430,9 +431,9 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
   if (xmlNic != NULL) {
     type = NIC;
     // Ignore sub device ID and merge multi-port NICs into one PCI device.
-    busId &= 0xfffffffffffffff0;
     struct ncclTopoNode* nicNode = NULL;
-    int64_t id = NCCL_TOPO_ID(systemId, busId);
+    int64_t localNicId = NCCL_TOPO_LOCAL_NIC_ID(numaId, busId);
+    int64_t id = NCCL_TOPO_ID(systemId, localNicId);
     NCCLCHECK(ncclTopoGetNode(system, &nicNode, type, id));
     if (nicNode == NULL) {
       NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, id));
@@ -453,7 +454,7 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
     for (int s=0; s<xmlPci->nSubs; s++) {
       struct ncclXmlNode* xmlSubPci = xmlPci->subs[s];
       if (strcmp(xmlSubPci->name, "pcilink") != 0) { // PCI links will be added later
-        NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node, systemId));
+        NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node, systemId, numaId));
       }
     }
   }
@@ -520,12 +521,14 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s
   }
   for (int s=0; s<xmlCpu->nSubs; s++) {
     struct ncclXmlNode* node = xmlCpu->subs[s];
-    if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu, systemId));
+    if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu, systemId, numaId));
     if (strcmp(node->name, "nic") == 0) {
       struct ncclTopoNode* nic = NULL;
-      NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, 0));
+      int64_t localNicId = NCCL_TOPO_LOCAL_NIC_ID(numaId, 0);
+      int64_t id = NCCL_TOPO_ID(systemId, localNicId);
+      NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, id));
       if (nic == NULL) {
-        NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, NCCL_TOPO_ID(systemId, 0)));
+        NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, id));
         NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_BW));
         NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_BW));
       }
@@ -725,14 +728,528 @@ ncclResult_t ncclTopoRefreshBcmP2pLinks(void) {
   return ncclSuccess;
 }
 
-ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
+// Returns 0 as soon as a net -> nic -> pci chain is found whose pci node is not a direct child of `common`, 1 otherwise
+int ncclTopoCheckPix(ncclXmlNode* common, ncclXmlNode** nodes, int nNodes) {
+  const char* tempBusId = NULL;
+  // If the common parent isn't a pci switch (no busid), this isn't PIX. NCCLCHECK must not be used here: this function
+  // returns a boolean int, so an early-returned non-zero ncclResult_t would be read by the caller as "is PIX".
+  if (xmlGetAttr(common, "busid", &tempBusId) != ncclSuccess || tempBusId == NULL) return 0;
+  TRACE(NCCL_GRAPH, "Checking pix for busid=%s", tempBusId);
+
+  // All the nodes must have a "nic" which is a parent, and then a pci node (busid) which must be a child of the "common"
+  for (int i = 0; i < nNodes; i++) {
+    ncclXmlNode* node = nodes[i];
+    if (strcmp(node->name, "net") == 0) {
+      node = node->parent;
+      if (node == NULL) return 0;
+      if (strcmp(node->name, "nic") == 0) {
+        node = node->parent;
+        if (node == NULL) return 0;
+        // All nodes must descend from the same first level pci switch
+        if (strcmp(node->name, "pci") == 0) {
+          TRACE(NCCL_GRAPH, "Comparing parent of node=%p to common=%p", node->parent, common);
+          if (node->parent != common) return 0;
+        }
+      }
+    }
+  }
+
+  return 1;
+}
+// Fixed-capacity LIFO of XML node pointers; ncclTopoGetPath uses one per node to hold that node's chain of ancestors
+#define NCCL_TOPO_XML_DEPTH_MAX 256
+typedef struct xmlNodeStack {
+  ncclXmlNode* elems[NCCL_TOPO_XML_DEPTH_MAX];
+  int tail; // Number of stored elements. There is no constructor, so the storage must be zeroed by whoever allocates it
+  // Returns the most recently pushed node without removing it, or NULL when the stack is empty
+  ncclXmlNode* top() {
+    if (!empty()) {
+      return elems[tail - 1];
+    } else {
+      return NULL;
+    }
+  }
+  // Removes and returns the most recently pushed node; the stack is left untouched when top() yields NULL
+  ncclXmlNode* pop() {
+    ncclXmlNode* node = top();
+    if (node) {
+      tail--;
+    }
+    return node;
+  }
+  // Appends a node; once NCCL_TOPO_XML_DEPTH_MAX elements are stored, further pushes are silently dropped
+  void push(ncclXmlNode* node) {
+    if (tail < NCCL_TOPO_XML_DEPTH_MAX) {
+      elems[tail++] = node;
+    }
+  }
+  // True when no element is stored
+  bool empty() {
+    return tail == 0;
+  }
+
+} xmlNodeStack;
+
+// Finds the deepest XML ancestor shared by all `nodes` (stored in *parent) and the matching PATH_* type (stored in *path)
+ncclResult_t ncclTopoGetPath(ncclXmlNode** nodes, int nNodes, int* path, ncclXmlNode** parent) {
+  // Track a stack of parents per-net node being merged
+  xmlNodeStack* parents;
+  NCCLCHECK(ncclCalloc(&parents, nNodes));
+  // Find the common parent
+  ncclXmlNode* common = NULL;
+  ncclResult_t res = ncclSuccess; // Every failure past this point must leave through "out" so that `parents` is freed
+
+  if (nNodes == 1) {
+    common = nodes[0];
+    *path = PATH_LOC;
+    goto out;
+  }
+
+  for (int i = 0; i < nNodes; i++) {
+    ncclXmlNode* temp;
+    temp = nodes[i];
+    while (temp) {
+      parents[i].push(temp);
+      temp = strcmp(temp->name, "system") == 0 ? NULL : temp->parent;
+    }
+  }
+
+  int c;
+  c = 1;
+  while (c && !parents[0].empty()) {
+    ncclXmlNode* temp = parents[0].top();
+    for (int i = 1; i < nNodes; i++) {
+      if (!parents[i].empty()) {
+        c &= (temp == parents[i].top());
+      } else {
+        c = 0;
+        break;
+      }
+    }
+
+    if (c) {
+      common = temp;
+      if (common == NULL) TRACE(NCCL_GRAPH, "COMMON IS NULL");
+      for (int i = 0; i < nNodes; i++) {
+        parents[i].pop();
+      }
+    // Check multi-port while we still have the mismatched parents
+    // For multi-port to be true, all parents (peers) must have the busId attribute with all but the last character matching
+    } else {
+      int multiPort = 1;
+      const char* tempBusId;
+
+      NCCLCHECKGOTO(xmlGetAttr(temp, "busid", &tempBusId), res, out);
+      if (tempBusId) {
+        for (int i = 1; i < nNodes; i++) {
+          if (!parents[i].empty()) {
+            const char* busId;
+            NCCLCHECKGOTO(xmlGetAttr(parents[i].top(), "busid", &busId), res, out);
+            if (busId) {
+              if (strlen(busId) != strlen(tempBusId)) {
+                multiPort = 0;
+                break;
+              }
+              if (strncmp(busId, tempBusId, strlen(busId)-1) != 0) {
+                multiPort = 0;
+                break;
+              }
+            } else {
+              multiPort = 0;
+              break;
+            }
+          }
+        }
+      } else {
+        multiPort = 0;
+      }
+
+      if (multiPort) {
+        *path = PATH_PORT;
+        goto out;
+      }
+    }
+  }
+
+  if (common == NULL) {
+    *path = PATH_DIS;
+  } else if (strcmp(common->name,"system") == 0) {
+    *path = PATH_SYS;
+  } else if (strcmp(common->name, "cpu") == 0) {
+    *path = PATH_PHB;
+  } else if (strcmp(common->name, "nic") == 0) {
+    *path = PATH_PORT;
+  } else if (strcmp(common->name, "net") == 0) {
+    *path = PATH_PORT;
+  } else if (ncclTopoCheckPix(common, nodes, nNodes)) {
+    *path = PATH_PIX;
+  } else {
+    *path = PATH_PXB;
+  }
+
+out:
+  *parent = common;
+  free(parents);
+  return res;
+}
+// Adds a "pci" child of `parent` (returned in *pciNode) whose busid is the first value above `busId` for which the lookup in `xml` finds no "pci" node
+ncclResult_t ncclTopoMakeUniqueBusId(struct ncclXml* xml, char* busId, struct ncclXmlNode** pciNode, struct ncclXmlNode* parent) {
+  int i = 0;
+  int64_t rBusId;
+  NCCLCHECK(busIdToInt64(busId, &rBusId));
+  // Try to find an unused busid - NCCL expects leaf busid to be unique. Gives up after 100 consecutive candidates.
+  while (i < 100) {
+    rBusId++;
+    TRACE(NCCL_GRAPH, "Trying to make new busId %lx", rBusId);
+    int64ToBusId(rBusId, busId); // Rewrites the caller's buffer, so on success `busId` holds the id that was assigned
+    struct ncclXmlNode* temp = NULL;
+    NCCLCHECK(xmlFindTagKv(xml, "pci", &temp, "busid", busId));
+    if (temp == NULL) { // Presumably means no existing "pci" node carries this busid - confirm against xmlFindTagKv
+      NCCLCHECK(xmlAddNode(xml, parent, "pci", pciNode));
+      NCCLCHECK(xmlSetAttr(*pciNode, "busid", busId));
+      TRACE(NCCL_GRAPH, "Made new busId %lx", rBusId);
+      return ncclSuccess;
+    }
+    TRACE(NCCL_GRAPH, "Conflicting busId %lx", rBusId);
+    i++;
+  }
+
+  WARN("TOPO/NET : Couldn't generate unique busId after %d tries", i);
+  return ncclInternalError;
+}
+// Creates under *parent a new "pci" node that clones the attributes of physNetNode's grandparent pci node but gets a fresh busid, then points *parent at it
+ncclResult_t ncclTopoMakePciParent(struct ncclXml* xml, struct ncclXmlNode** parent, struct ncclXmlNode* physNetNode) {
+  struct ncclXmlNode* newBusId = NULL;
+  struct ncclXmlNode* pci = physNetNode->parent;
+  if (pci) {
+    pci = pci->parent; // Two levels up from the net node; only used below if that node is named "pci"
+    if (pci) {
+      if (strcmp(pci->name, "pci") == 0) {
+        char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+        memset(busId, 0, sizeof(busId));
+        const char* originalBusId;
+        // Seed busId with the current NIC 0's busId to make discovering a unique hash quicker
+        NCCLCHECK(xmlGetAttrStr(pci, "busid", &originalBusId));
+        snprintf(busId, sizeof(busId), "%s", originalBusId);
+        NCCLCHECK(ncclTopoMakeUniqueBusId(xml, busId, &newBusId, *parent));
+        for (int i = 0; i < pci->nAttrs; i++) { // Copies every attribute of the original pci node, including its busid
+          NCCLCHECK(xmlSetAttr(newBusId, pci->attrs[i].key, pci->attrs[i].value));
+        }
+        NCCLCHECK(xmlSetAttr(newBusId, "busid", busId)); // Restore the generated busid that the copy above overwrote
+        *parent = newBusId;
+      }
+    }
+  }
+
+  if (newBusId == NULL) { // No parent, no grandparent, or the grandparent is not a "pci" node
+    const char* name;
+    NCCLCHECK(xmlGetAttr(physNetNode, "name", &name));
+    WARN("TOPO/NET : Can't find busId of child 0 %s", name);
+    return ncclInternalError;
+  }
+
+  return ncclSuccess;
+}
+
+// Asks the net plugin (makeVDevice) to fuse the devices listed in vProps into one virtual NIC and propagates any failure
+ncclResult_t ncclTopoMakeVnic(ncclComm_t comm, struct ncclXml* xml, ncclNetVDeviceProps_t* vProps, struct ncclXmlNode** physNetNodes, struct ncclXmlNode** netNode, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
+  if (vProps->ndevs > NCCL_NET_MAX_DEVS_PER_NIC) {
+    WARN("TOPO/NET : Tried to merge too many NICs. %d > %d", vProps->ndevs, NCCL_NET_MAX_DEVS_PER_NIC);
+    return ncclInternalError;
+  }
+
+  // Trigger the merge. comm, xml, physNetNodes and netNode are not used here; in particular *netNode is never written.
+  int vDevIndex = 0;
+  ncclResult_t ret = makeVDevice(&vDevIndex, vProps);
+  if (ret == ncclInvalidUsage) {
+    WARN("TOPO/NET : Tried merging multiple devices together and failed. Try setting NCCL_NET_MERGE_LEVEL=LOC");
+  }
+  NCCLCHECK(ret); // Propagate every plugin failure, not only ncclInvalidUsage, instead of reporting a vNic that was not made
+
+  INFO(NCCL_GRAPH, "TOPO/NET : Made vNic %d", vDevIndex);
+  return ncclSuccess;
+}
+
+// Fuses each semicolon-separated NIC group of NCCL_NET_FORCE_MERGE (`str`, tokenized in place) into a virtual NIC and marks its devices in placedDevs
+ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
+  INFO(NCCL_ENV|NCCL_NET, "TOPO/NET : Force-fusing NICs using NCCL_NET_FORCE_MERGE=%s", str);
+  char* semi_token;
+  // Advance the tokenizer in the loop header so that `continue` moves on to the next group instead of spinning forever
+  for (char* semi = strtok_r(str, ";", &semi_token); semi; semi = strtok_r(NULL, ";", &semi_token)) {
+    TRACE(NCCL_NET, "Fusing %s", semi);
+    struct netIf userIfs[NCCL_NET_MAX_DEVS_PER_NIC];
+    int nUserIfs = parseStringList(semi, userIfs, NCCL_NET_MAX_DEVS_PER_NIC);
+    if (nUserIfs == 0) {
+      INFO(NCCL_NET, "NET/IB : Invalid NCCL_NET_FORCE_MERGE specified %s. Couldn't parse substring %s. Please provide a semicolon-delimited list of comma-delimited NIC groups.", str, semi);
+      continue;
+    }
+
+    ncclNetVDeviceProps_t vProps = {0};
+    for (int d = 0; d < nPhysDevs; d++) {
+      if (matchIfList(propsList[d].name, propsList[d].port, userIfs, nUserIfs, 1)) {
+        // Keep counting matches but never store past NCCL_NET_MAX_DEVS_PER_NIC entries (presumably the size of devs[]); an excess count is rejected below
+        if (vProps.ndevs < NCCL_NET_MAX_DEVS_PER_NIC) vProps.devs[vProps.ndevs] = d;
+        vProps.ndevs++;
+      }
+    }
+
+    if (vProps.ndevs != nUserIfs) {
+      WARN("TOPO/NET : Only matched %d devices, %d requested from %s",
+        vProps.ndevs, nUserIfs, semi);
+      return ncclInvalidUsage;
+    }
+
+    if (vProps.ndevs > NCCL_NET_MAX_DEVS_PER_NIC) {
+      WARN("Specified fused NIC %s which has too many devices (%d). Max %d", semi, vProps.ndevs, NCCL_NET_MAX_DEVS_PER_NIC);
+      return ncclInvalidUsage;
+    }
+
+    struct ncclXmlNode* netNode;
+    NCCLCHECK(ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice));
+
+    // Only set that a device is "placed" after successfully making a vNic (it's possible to exit before this)
+    for (int i = 0; i < vProps.ndevs; i++) {
+      placedDevs[vProps.devs[i]] = 1;
+    }
+  }
+
+  return ncclSuccess;
+}
+
+// Puts every device not yet marked in placedDevs into a virtual NIC, grouping devices whose pairwise path type is <= mergeLevel
+ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLevel, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
+  // Compute the path type between each device
+  int* paths = NULL;
+  ncclResult_t res = ncclSuccess;
+  NCCLCHECK(ncclCalloc(&paths, nPhysDevs*nPhysDevs));
+  TRACE(NCCL_GRAPH, "Allocated %d paths", nPhysDevs*nPhysDevs);
+  for (int i = 0; i < nPhysDevs; i++) {
+    for (int j = 0; j < nPhysDevs; j++) {
+      struct ncclXmlNode* nodes[2];
+      nodes[0] = physNetNodes[i];
+      nodes[1] = physNetNodes[j];
+      struct ncclXmlNode* parent;
+      NCCLCHECKGOTO(ncclTopoGetPath(nodes, 2, &paths[i*nPhysDevs + j], &parent), res, out);
+    }
+  }
+
+  // Place all remaining physical devices into a virtual device given the mergeLevel criteria
+  for (int i = 0; i < nPhysDevs; i++) {
+    // Select the first unplaced device "i" as the root
+    if (placedDevs[i] == 0) {
+      // Init a new vDevice
+      ncclNetVDeviceProps_t vProps;
+      vProps = {0};
+      vProps.devs[vProps.ndevs++] = i;
+      placedDevs[i] = 1;
+      TRACE(NCCL_GRAPH, "Placed dev %d", i);
+
+      // Select each unplaced device "j" which is at most "mergeLevel" distance from "i", but not equal to "i"
+      // (Don't merge the same device with itself)
+      for (int j = 0; j < nPhysDevs; j++) {
+        if (paths[i*nPhysDevs + j] <= mergeLevel && placedDevs[j] == 0 && j != i) {
+          vProps.devs[vProps.ndevs++] = j;
+          placedDevs[j] = 1;
+          TRACE(NCCL_GRAPH, "Placed dev %d path=%d", j, paths[i*nPhysDevs + j] );
+        }
+        if (vProps.ndevs == NCCL_NET_MAX_DEVS_PER_NIC) break;
+      }
+
+      if (vProps.ndevs > NCCL_NET_MAX_DEVS_PER_NIC) {
+        WARN("TOPO/NET : Tried to merge too many NICs. %d > %d", vProps.ndevs, NCCL_NET_MAX_DEVS_PER_NIC);
+        res = ncclInternalError; goto out; // Leave through "out" so that `paths` is freed
+      }
+
+      struct ncclXmlNode* netNode;
+      NCCLCHECKGOTO(ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice), res, out);
+    }
+  }
+
+out:
+  free(paths);
+  return res;
+}
+// Maps the textual values accepted by NCCL_NET_MERGE_LEVEL to PATH_* constants (used with kvConvertToInt in ncclTopoMakeVNics)
+struct kvDict nicPathKvList[] = {
+  { "LOC",  PATH_LOC },
+  { "PORT", PATH_PORT },
+  { "PIX",  PATH_PIX },
+  { "PXB",  PATH_PXB },
+  { "PXN",  PATH_PXN },
+  { "PHB",  PATH_PHB },
+  { "SYS",  PATH_SYS },
+  { NULL, 0 } // NOTE(review): presumably the fallback for unrecognized strings, as in kvDictPciGen - confirm which PATH_* value 0 is
+};
+
+// Returns in *parent the XML node under which the virtual NIC described by vProps must be attached: NULL when it maps to
+// a single physical device, a newly made pci node when the common ancestor is a "pci" node, the common ancestor otherwise.
+ncclResult_t ncclTopoGetVNicParent(struct ncclXml* xml, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclNetVDeviceProps_t* vProps, ncclXmlNode** parent) {
+  ncclNetProperties_t props[NCCL_NET_MAX_DEVS_PER_NIC];
+  ncclXmlNode* physNetNodes[NCCL_NET_MAX_DEVS_PER_NIC];
+  for (int i = 0; i < vProps->ndevs; i++) {
+    NCCLCHECK(getProperties(vProps->devs[i], props + i));
+    NCCLCHECK(xmlFindTagKv(xml, "net", &physNetNodes[i], "name", props[i].name));
+    TRACE(NCCL_GRAPH, "Re-found physical ncclNet node %d %s", i,  props[i].name);
+  }
+
+  int path = PATH_LOC;
+  NCCLCHECK(ncclTopoGetPath(physNetNodes, vProps->ndevs, &path, parent));
+  if (path == PATH_LOC) {
+    *parent = NULL;
+  } else if (*parent && strcmp((*parent)->name, "pci") == 0) {
+    // If the common parent is PCI, we must reparent the new NIC under a made up busId
+    NCCLCHECK(ncclTopoMakePciParent(xml, parent, physNetNodes[0]));
+  }
+  TRACE(NCCL_GRAPH, "Selected parent %s with path %d", *parent ? (*parent)->name : "(none)", path);
+  return ncclSuccess;
+}
+
+// Groups the physical NICs into virtual NICs: the groups forced by NCCL_NET_FORCE_MERGE first, then automatic merging of the rest
+ncclResult_t ncclTopoMakeVNics(ncclComm_t comm, struct ncclXml* xml, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*getProperties)(int, ncclNetProperties_t*), int physicalDevs) {
+  int* placedDevs = NULL;
+  struct ncclXmlNode** physNetNodes = NULL;
+  ncclNetProperties_t* props = NULL;
+  ncclResult_t res = ncclSuccess; // Declared before the first goto; every failure leaves through "out" to free the arrays
+  if (physicalDevs == 0) return ncclSuccess;
+
+  NCCLCHECKGOTO(ncclCalloc(&physNetNodes, physicalDevs), res, out);
+  NCCLCHECKGOTO(ncclCalloc(&props, physicalDevs), res, out);
+  for (int i = 0; i < physicalDevs; i++) {
+    NCCLCHECKGOTO(getProperties(i, props + i), res, out);
+    struct ncclXmlNode* physNetNode;
+    NCCLCHECKGOTO(xmlFindTagKv(xml, "net", &physNetNode, "name", props[i].name), res, out);
+    physNetNodes[i] = physNetNode;
+    TRACE(NCCL_GRAPH, "Found physical ncclNet node %d %s", i,  props[i].name);
+  }
+
+  // By default, only merge devices which are ports of the same NIC (PATH_PORT)
+  int mergeLevel;
+  mergeLevel = PATH_PORT;
+  char* mergeLevelEnv;
+  mergeLevelEnv = getenv("NCCL_NET_MERGE_LEVEL");
+  if (mergeLevelEnv) kvConvertToInt(mergeLevelEnv, &mergeLevel, nicPathKvList);
+  char* forceMerge;
+  forceMerge = getenv("NCCL_NET_FORCE_MERGE");
+  NCCLCHECKGOTO(ncclCalloc(&placedDevs, physicalDevs), res, out);
+  memset(placedDevs, 0, sizeof(int)*physicalDevs);
+
+  if (forceMerge) {
+    NCCLCHECKGOTO(ncclTopoForceMerge(comm, xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out);
+  }
+  NCCLCHECKGOTO(ncclTopoAutoMerge(comm, xml, mergeLevel, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out);
+
+out:
+  free(physNetNodes);
+  free(props);
+  if (placedDevs) free(placedDevs);
+  return res;
+}
+
+// Creates or updates the "net" XML node of each plugin device in [startIndex, endIndex) and fills in its attributes
+static ncclResult_t ncclTopoPopulateNics(ncclComm_t comm, ncclXml* xml, int startIndex, int endIndex, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), const char* netName, int coll, int keep, int virtualNics) {
+  for (int n = startIndex; n < endIndex; n++) {
+    ncclNetProperties_t props;
+    NCCLCHECK(getProperties(n, &props));
+    struct ncclXmlNode* netNode = NULL;
+    struct ncclXmlNode* parent = NULL;
+    if (virtualNics) {
+      struct ncclXmlNode* net = NULL;
+      NCCLCHECK(xmlFindTagKv(xml, "net", &net, "name", props.name));
+      // In the event of multithreaded use case, we need to re-discover the shared parent of the given devices for this vNIC
+      // Only run this if the net doesn't exist locally - this may alter the XML state
+      if (net == NULL) NCCLCHECK(ncclTopoGetVNicParent(xml, getProperties, &props.vProps, &parent));
+    }
+
+    NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode, parent));
+
+    const char* colAttr;
+    NCCLCHECK(xmlGetAttr(netNode, "coll", &colAttr));
+
+    // If coll == 0 but the netNode is tagged as coll, don't update the keep value
+    if (colAttr == NULL || coll != 0 || strcmp(colAttr,"1") != 0) NCCLCHECK(xmlSetAttrInt(netNode, "keep", keep));
+    NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
+    NCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency)); // Float, as in the code this replaces; an int would truncate it
+    NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
+    NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
+    NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
+    NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
+    bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
+    INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", netName, gdrSupport ? "Enabled" : "Disabled", n, props.name);
+    NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
+    // Only set coll if it's not 0
+    if (coll) NCCLCHECK(xmlInitAttrInt(netNode, "coll", coll));
+
+    const char* keepAttr;
+    NCCLCHECK(xmlGetAttr(netNode, "coll", &colAttr));
+    NCCLCHECK(xmlGetAttr(netNode, "keep", &keepAttr));
+    INFO(NCCL_GRAPH, "ncclTopoPopulateNics : Filled %s in topo with pciPath=%s keep=%s coll=%s", props.name, props.pciPath, keepAttr ? keepAttr : "(null)", colAttr ? colAttr : "(null)");
+  }
+
+  return ncclSuccess;
+}
+// Per-plugin NIC counts, looked up by plugin name through ncclTopoGetSharedState
+struct ncclTopoNetState {
+  int nVirtualNics;  // -1 until ncclTopoProcessNet has created the virtual NICs and counted them
+  int nPhysicalNics; // -1 until ncclTopoProcessNet has queried the plugin's devices()
+  const char* name;  // Plugin name (strdup'ed copy); NULL marks an unused slot
+};
+
+// Adds the NICs of one net plugin to `xml`. Calls to network plugin APIs should be protected. This function should be called inside a per-process lock.
+static ncclResult_t ncclTopoProcessNet(ncclComm_t comm, ncclXml* xml, int coll, const char* dumpXmlFile, ncclTopoNetState* state, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*devices)(int*), const char* netName) {
+  int usePhysicalDevices = (dumpXmlFile || makeVDevice == NULL); // No virtual NICs when dumping or when the plugin cannot make them
+  if (state->nPhysicalNics == -1) NCCLCHECK(devices(&state->nPhysicalNics)); // Queried once, then reused from the shared state
+  // Enumerate physical devices
+  NCCLCHECK(ncclTopoPopulateNics(comm, xml, 0, state->nPhysicalNics, getProperties, netName, coll, 1, 0));
+  if (!usePhysicalDevices) {
+    if (state->nVirtualNics == -1) { // First call for this plugin: create the virtual NICs, then count what devices() now reports beyond the physical ones
+      NCCLCHECK(ncclTopoMakeVNics(comm, xml, makeVDevice, getProperties, state->nPhysicalNics));
+      int nDevs;
+      NCCLCHECK(devices(&nDevs));
+      state->nVirtualNics = nDevs - state->nPhysicalNics;
+    }
+    // Virtual NICs exist: re-run over the physical NICs with keep=0 (ncclTopoPopulateNics leaves "keep" alone when coll == 0 and the node is tagged coll="1")
+    if (state->nVirtualNics > 0) {
+      NCCLCHECK(ncclTopoPopulateNics(comm, xml, 0, state->nPhysicalNics, getProperties, netName, coll, 0, 0));
+      // Populate new devices
+      NCCLCHECK(ncclTopoPopulateNics(comm, xml, state->nPhysicalNics, state->nPhysicalNics+state->nVirtualNics, getProperties, netName, coll, 1, 1));
+    }
+  }
+
+  return ncclSuccess;
+}
+// ncclTopoGetSharedState returns in *state the entry of `states` named `name`, claiming the first unused slot (counts reset to -1) if it is reached first
+static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER; // Held by ncclTopoGetSystem around the state lookup and plugin import
+ncclTopoNetState netStates[NCCL_NET_MAX_PLUGINS] = {};     // States looked up with comm->ncclNet->name
+ncclTopoNetState collNetStates[NCCL_NET_MAX_PLUGINS] = {}; // States looked up with comm->ncclCollNet->name
+ncclResult_t ncclTopoGetSharedState(ncclTopoNetState** state, const char* name, ncclTopoNetState* states) {
+  INFO(NCCL_GRAPH, "Retrieving state for %s", name);
+  for (int i = 0; i < NCCL_NET_MAX_PLUGINS; i++) {
+    // Empty slot
+    if (states[i].name == NULL) {
+      states[i].nVirtualNics = -1;
+      states[i].nPhysicalNics = -1;
+      states[i].name = strdup(name); // NOTE(review): result is not checked; a NULL here leaves the slot looking unused
+      *state = states + i;
+      INFO(NCCL_GRAPH, "Initialized state %d for %s", i, name);
+      return ncclSuccess;
+    // Found my slot
+    } else if (strcmp(states[i].name, name) == 0) {
+      *state = states + i;
+      return ncclSuccess;
+    }
+  }
+  WARN("NET/TOPO : Couldn't find net with name %s", name); // All NCCL_NET_MAX_PLUGINS slots are taken by other names
+  return ncclInternalError;
+}
+
+ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system, const char* dumpXmlFile) {
   ncclResult_t ret = ncclSuccess;
   struct ncclXml* xml;
   char* mem = NULL;
   int* localRanks = NULL;
-  int netDevCount = 0;
   struct ncclXml* rankXml;
   int localRank = -1, nLocalRanks = 0;
+  int netLockHeld = 0;
   NCCLCHECK(xmlAlloc(&xml, NCCL_TOPO_XML_MAX_NODES));
   const char* xmlTopoFile = ncclGetEnv("NCCL_TOPO_FILE");
   if (xmlTopoFile) {
@@ -761,47 +1278,24 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
     NCCLCHECKGOTO(xmlSetAttrInt(node, "rank", comm->rank), ret, fail);
     NCCLCHECKGOTO(xmlInitAttrInt(node, "gdr", comm->peerInfo[comm->rank].gdrSupport), ret, fail);
   }
+
   // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
   // so we start with collnet so that it has precedence.
+  pthread_mutex_lock(&netLock);
+  netLockHeld = 1;
+  INFO(NCCL_GRAPH, "TOPO/NET : Importing network plugins to topology");
+  ncclTopoNetState* state;
+  state = NULL;
   if (collNetSupport(comm)) {
-    NCCLCHECKGOTO(collNetDevices(comm, &netDevCount), ret, fail);
-    for (int n=0; n<netDevCount; n++) {
-      ncclNetProperties_t props;
-      NCCLCHECKGOTO(collNetGetProperties(comm, n, &props), ret, fail);
-      struct ncclXmlNode* netNode;
-      NCCLCHECKGOTO(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode), ret, fail);
-      NCCLCHECKGOTO(xmlSetAttrInt(netNode, "keep", 1), ret, fail);
-      NCCLCHECKGOTO(xmlSetAttrInt(netNode, "dev", n), ret, fail);
-      NCCLCHECKGOTO(xmlInitAttrInt(netNode, "speed", props.speed), ret, fail);
-      NCCLCHECKGOTO(xmlInitAttrInt(netNode, "port", props.port), ret, fail);
-      NCCLCHECKGOTO(xmlInitAttrUint64(netNode, "guid", props.guid), ret, fail);
-      NCCLCHECKGOTO(xmlInitAttrInt(netNode, "maxconn", props.maxComms), ret, fail);
-      bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
-      INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
-      NCCLCHECKGOTO(xmlInitAttrInt(netNode, "gdr", gdrSupport), ret, fail);
-      NCCLCHECKGOTO(xmlInitAttrInt(netNode, "coll", 1), ret, fail);
-    }
-  }
-  if (netDevCount == 0) {
-    NCCLCHECKGOTO(comm->ncclNet->devices(&netDevCount), ret, fail);
-  }
-  for (int n=0; n<netDevCount; n++) {
-    ncclNetProperties_t props;
-    NCCLCHECKGOTO(comm->ncclNet->getProperties(n, &props), ret, fail);
-    comm->netDeviceType = props.netDeviceType;
-    struct ncclXmlNode* netNode;
-    NCCLCHECKGOTO(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode), ret, fail);
-    NCCLCHECKGOTO(xmlSetAttrInt(netNode, "keep", 1), ret, fail);
-    NCCLCHECKGOTO(xmlSetAttrInt(netNode, "dev", n), ret, fail);
-    NCCLCHECKGOTO(xmlInitAttrInt(netNode, "speed", props.speed), ret, fail);
-    NCCLCHECKGOTO(xmlInitAttrInt(netNode, "port", props.port), ret, fail);
-    NCCLCHECKGOTO(xmlInitAttrFloat(netNode, "latency", props.latency), ret, fail);
-    NCCLCHECKGOTO(xmlInitAttrUint64(netNode, "guid", props.guid), ret, fail);
-    NCCLCHECKGOTO(xmlInitAttrInt(netNode, "maxconn", props.maxComms), ret, fail);
-    bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
-    INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
-    NCCLCHECKGOTO(xmlInitAttrInt(netNode, "gdr", gdrSupport), ret, fail);
+    NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclCollNet->name, collNetStates), ret, fail);
+    NCCLCHECKGOTO(ncclTopoProcessNet(comm, xml, 1, dumpXmlFile, state,
+      comm->ncclCollNet->getProperties, comm->ncclCollNet->makeVDevice, comm->ncclCollNet->devices, comm->ncclCollNet->name), ret, fail);
   }
+  NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclNet->name, netStates), ret, fail);
+  NCCLCHECKGOTO(ncclTopoProcessNet(comm, xml, 0, dumpXmlFile, state,
+    comm->ncclNet->getProperties, comm->ncclNet->makeVDevice, comm->ncclNet->devices, comm->ncclNet->name), ret, fail);
+  pthread_mutex_unlock(&netLock);
+  netLockHeld = 0;
 
   // Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
   NCCLCHECKGOTO(ncclTopoTrimXml(xml), ret, fail);
@@ -845,19 +1339,21 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
     NCCLCHECKGOTO(ncclTopoFuseXml(xml, peerXml), ret, fail);
   }
 
-  xmlTopoFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
-  if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
-    INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
-    NCCLCHECKGOTO(ncclTopoDumpXmlToFile(xmlTopoFile, xml), ret, fail);
+  if (dumpXmlFile && comm->rank == ncclParamTopoDumpFileRank()) {
+    INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", dumpXmlFile);
+    NCCLCHECKGOTO(ncclTopoDumpXmlToFile(dumpXmlFile, xml), ret, fail);
   }
 
-  NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash), ret, fail);
+  // Only update our topo tracking structure if we aren't dumping (separate steps)
+  if (dumpXmlFile == NULL) NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash), ret, fail);
+
 exit:
   if (!comm->MNNVL && localRanks) free(localRanks);
   if (mem) free(mem);
   free(xml);
   return ret;
 fail:
+  if (netLockHeld) pthread_mutex_unlock(&netLock);
   goto exit;
 }
 
diff --git a/src/graph/topo.h b/src/graph/topo.h
index 0837fb4b38..8e7cda5b44 100644
--- a/src/graph/topo.h
+++ b/src/graph/topo.h
@@ -78,6 +78,9 @@ extern const char* topoLinkTypeStr[];
 // Connection through the network
 #define PATH_NET 8
 
+// New type of path which should precede PATH_PIX
+#define PATH_PORT PATH_NVL
+
 // Disconnected
 #define PATH_DIS 9
 extern const char* topoPathTypeStr[];
@@ -106,6 +109,7 @@ struct ncclTopoLinkList {
 #define NCCL_TOPO_ID_LOCAL_ID_MASK 0x00ffffffffffffff
 #define NCCL_TOPO_ID_SYSTEM_ID(id) (id >> 56)
 #define NCCL_TOPO_ID_LOCAL_ID(id) (id & NCCL_TOPO_ID_LOCAL_ID_MASK)
+#define NCCL_TOPO_LOCAL_NIC_ID(numaid, busid) (((int64_t)(numaid) << 56) + (busid))
 #define NCCL_TOPO_ID(systemid, localid) (((int64_t)systemid << 56) + (localid & NCCL_TOPO_ID_LOCAL_ID_MASK))
 
 struct ncclTopoNode {
diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc
index f0a6224528..f5f2e1185e 100644
--- a/src/graph/tuning.cc
+++ b/src/graph/tuning.cc
@@ -31,23 +31,87 @@ static int getNthreads(const char* name, int env, int min, int max, int def) {
   return nt;
 }
 
-ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* list) {
-  int def, set;
-  if (str[0] == '^') {
-    def = 1; set = 0; str++;
-  } else {
-    def = 0; set = 1;
+// Parse a map of prefixes to a list of elements. The first prefix is
+// optional and, if not present, the list of elements will be applied
+// to all prefixes. Only the first list of elements can lack a
+// prefix. Prefixes (if present) are followed by a colon. Lists of
+// elements are comma delimited. Mappings of prefix to the lists of
+// elements are semi-colon delimited.
+//
+// For example:
+//
+//     NCCL_ALGO="ring,collnetdirect;allreduce:tree,collnetdirect;broadcast:ring"
+// Enable ring and collnetdirect for all functions, then select tree
+// and collnetdirect for allreduce and ring for broadcast.
+//
+//     NCCL_PROTO="LL,Simple;allreduce:^LL"
+// Enable LL and Simple for all functions, but everything except LL
+// for allreduce.
+//
+//     NCCL_PROTO="^LL128;allreduce:LL128"
+// Enable everything but LL128, but only LL128 for allreduce.
+//
+// list holds nprefixes rows of nelems 0/1 flags. Returns ncclInvalidUsage on a malformed string, ncclSystemError if strdup fails.
+ncclResult_t parseList(const char* str, const char* prefixElems[], int nprefixes, const char* elems[], int nelems, int* list) {
+  char* fullStr = strdup(str);
+  if (fullStr == NULL) return ncclSystemError;
+  char *tmpFullStr, *tmpSubStr, *tmpStr;
+  for (char* fullToken = strtok_r(fullStr, ";", &tmpFullStr); fullToken; fullToken = strtok_r(NULL, ";", &tmpFullStr)) {
+    char* subToken = strdup(fullToken);
+    if (subToken == NULL) { free(fullStr); return ncclSystemError; }
+    char* prefix = strtok_r(subToken, ":", &tmpSubStr);
+    char* elemList = strtok_r(NULL, ":", &tmpSubStr);
+    if (prefix == NULL) {
+      // Entry made only of ':' characters: no token at all, so elemList[0] below would dereference NULL.
+      WARN("Empty entry when parsing \"%s\"", str);
+      free(subToken); free(fullStr);
+      return ncclInvalidUsage;
+    }
+    if (elemList == NULL) {
+      if (fullToken != fullStr) {
+        // It makes no sense for any entry other than the first to not have a prefix, because
+        // then all the prefixes before the prefix-less entry would be overwritten.
+        WARN("All entries except the first must have a prefix: \"%s\"", str);
+        free(subToken); free(fullStr);
+        return ncclInvalidUsage;
+      }
+      elemList = prefix;
+      prefix = NULL;
+    }
+
+    // A leading '^' inverts the list: start from all-enabled and clear the named elements.
+    const int set = (elemList[0] == '^') ? 0 : 1;
+    const int unset = 1 - set;
+    if (set == 0) elemList++;
+
+    bool foundPrefix = false;
+    for (int p=0; p<nprefixes; p++) {
+      if (prefix && strcasecmp(prefix, prefixElems[p]) != 0) continue;
+      foundPrefix = true;
+      for (int e=0; e<nelems; e++) list[p*nelems+e] = unset;
+      // strtok_r modifies its input, so every matching prefix tokenizes a fresh copy.
+      char* tokStr = strdup(elemList);
+      if (tokStr == NULL) { free(subToken); free(fullStr); return ncclSystemError; }
+      for (char* elem = strtok_r(tokStr, ",", &tmpStr); elem; elem = strtok_r(NULL, ",", &tmpStr)) {
+        int e = 0;
+        while (e < nelems && strcasecmp(elem, elems[e]) != 0) e++;
+        if (e == nelems) {
+          WARN("Unrecognized element token \"%s\" when parsing \"%s\"", elem, str);
+          free(tokStr); free(subToken); free(fullStr);
+          return ncclInvalidUsage;
+        }
+        list[p*nelems+e] = set;
+      }
+      free(tokStr);
+    }
+    if (!foundPrefix) {
+      WARN("Unrecognized prefix token \"%s\" when parsing \"%s\"", prefix, str);
+      free(subToken); free(fullStr);
+      return ncclInvalidUsage;
+    }
+    free(subToken);
   }
-  for (int i=0; i<nelems; i++) list[i] = def;
-  char* tokStr = strdup(str);
-  char* tmpStr;
-  char* token = strtok_r(tokStr, ",", &tmpStr);
-  while (token) {
-    for (int i=0; i<nelems; i++)
-      if (strcasecmp(token, elems[i]) == 0) list[i] = set;
-    token = strtok_r(NULL, ",", &tmpStr);
-  }
-  free(tokStr);
+  free(fullStr);
   return ncclSuccess;
 }
 
@@ -144,17 +208,16 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
   if (nRanks <= 1) return ncclSuccess;
 
   int compCapIndex = minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX;
-  int cpuArch, cpuVendor, cpuModel;
-  NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
   int index2 = nNodes <= 2 ? nNodes-1 : 2;
   // LL: for single node, we look at GPU type; for multi-node, we look at CPU type
-  int index1 = nNodes == 1 ? compCapIndex : cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD ? 1 : 0;
+  int index1 = nNodes == 1 ? compCapIndex :
+               (comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD || comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_MIXED) ? 1 : 0;
   double llMaxBw = llMaxBws[index1][index2];
   double perChMaxTreeBw = perChMaxTreeBws[compCapIndex][index2];
   double perChMaxRingLL128Bw = perChMaxRingLL128Bws[compCapIndex][index2];
   double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2];
   // De-penalize Tree/Simple latency on Power systems to favor Tree than Ring
-  if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
+  if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
   float ppn = (float)nRanks / nNodes;
 
   int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
@@ -190,7 +253,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
         if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
         if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), graphs[a]->nChannels*perChMaxTreeLL128Bw);
         if (a == NCCL_ALGO_TREE && graphs[a]->pattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85;
-        if (a == NCCL_ALGO_PAT) busBw *= .85;
+        if (a == NCCL_ALGO_PAT) busBw *= .75;
         if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0;  // Not used
         if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0;  // Not used
         if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) {
@@ -226,10 +289,6 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
           busBw *= ratio;
         }
         comm->bandwidths[coll][a][p] = busBw;
-        /* Ring bandwidth backup */
-        if (a == NCCL_ALGO_RING)
-          comm->ringbdw[coll][p] = comm->bandwidths[coll][NCCL_ALGO_RING][p];
-
         comm->latencies[coll][a][p] = baseLat[a][p];
         float intraLat = hwLat[intraHw[a]][a][p];
         // With ppn=1 latencies are fully exposed, use the Tree network latency
@@ -286,41 +345,78 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
 
   // Protocols/Algorithms enable/disable, and user overrides.
   // All are enabled except ll128 which is enabled by default only in certain cases.
-  int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
-  int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1, 1 };
+  int protoEnable[NCCL_NUM_FUNCTIONS*NCCL_NUM_PROTOCOLS];
+  int algoEnable[NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS];
+  for (int f=0; f<NCCL_NUM_FUNCTIONS; f++) {
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      protoEnable[f*NCCL_NUM_PROTOCOLS+p] = p == NCCL_PROTO_LL128 ? 2 : 1;
+    }
+    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+      algoEnable[f*NCCL_NUM_ALGORITHMS+a] = 1;
+    }
+  }
 
   const char *protoStr = ncclGetEnv("NCCL_PROTO");
   if (protoStr) {
     INFO(NCCL_ENV, "NCCL_PROTO set by environment to %s", protoStr);
-    NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable));
+    NCCLCHECK(parseList(protoStr, ncclFuncStr, NCCL_NUM_FUNCTIONS, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable));
   }
   const char *algoStr = ncclGetEnv("NCCL_ALGO");
   if (algoStr) {
     INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr);
-    NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
+    NCCLCHECK(parseList(algoStr, ncclFuncStr, NCCL_NUM_FUNCTIONS, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
   }
 
-  if (comm->nNodes == 1) algoEnable[NCCL_ALGO_NVLS_TREE] = 0;
-
-  // Disable CollNet if it is not supported
-  if (comm->collNetSupport == 0) {
-    algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0;
-    algoEnable[NCCL_ALGO_COLLNET_CHAIN] = 0;
-    if (nNodes > 1) algoEnable[NCCL_ALGO_NVLS] = 0;
-    // If user has hard set NCCL_ALGO=COLLNET, ignore it
-    if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0 &&
-        algoEnable[NCCL_ALGO_NVLS] == 0 && algoEnable[NCCL_ALGO_NVLS_TREE] == 0) {
-      algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = 1;
+  if (comm->rank == 0 && (algoStr||protoStr)) {
+    constexpr int strLength = 1024;
+    char funcAlgoProtoTuningStr[strLength];
+    int offset = 0;
+    offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "\n     Function | ");
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "%8s  ", ncclProtoStr[p]);
+    }
+    offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), " | ");
+    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+      offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "%13s  ", ncclAlgoStr[a]);
+    }
+    offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "\n");
+
+    for (int f=0; f<NCCL_NUM_FUNCTIONS; f++) {
+      offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "%13s | ", ncclFuncStr[f]);
+      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+        offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "%8d  ", protoEnable[f*NCCL_NUM_PROTOCOLS+p]);
+      }
+      offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), " | ");
+      for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+        offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "%13d  ", algoEnable[f*NCCL_NUM_ALGORITHMS+a]);
+      }
+      offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "\n");
+    }
+
+    INFO(NCCL_ENV, "Enabled NCCL Func/Proto/Algo Matrix:%s", funcAlgoProtoTuningStr);
+  }
+
+  int nvsCount = 0;
+  NCCLCHECK(ncclTopoGetNvsCount(comm->topo, &nvsCount));
+
+  for (int f=0; f<NCCL_NUM_FUNCTIONS; f++) {
+    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+      int disable = 0;
+      // Disable NVLS Tree on a single node
+      if (comm->nNodes == 1 && a == NCCL_ALGO_NVLS_TREE) disable = 1;
+      // Disable Collnet+Direct, Collnet+Chain or Collnet+NVLS if collnet is not supported.
+      if (comm->collNetSupport == 0 &&
+          (a == NCCL_ALGO_COLLNET_DIRECT ||
+           a == NCCL_ALGO_COLLNET_CHAIN ||
+           (a == NCCL_ALGO_NVLS && comm->nNodes > 1))) disable = 1;
+      // Disable CollNet+Direct if not on an NVSwitch system
+      if (nvsCount == 0 && a == NCCL_ALGO_COLLNET_DIRECT) disable = 1;
+      if (disable) algoEnable[f*NCCL_NUM_ALGORITHMS+a] = 0;
     }
-  } else {
-    // Disable CollNet+Direct if not on an NVSwitch system
-    int nvsCount = 0;
-    NCCLCHECK(ncclTopoGetNvsCount(comm->topo, &nvsCount));
-    if (nvsCount == 0) algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0;
   }
 
   for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    int pEnable = protoEnable[p];
+    int pEnable = protoEnable[c*NCCL_NUM_PROTOCOLS+p];
     if (pEnable == 2 && p == NCCL_PROTO_LL128) {
       // Enable LL128 by default only on Volta/Ampere/Hopper+NVLink. Other cases are not tested and may cause silent data corruption.
       pEnable = 1;
@@ -335,66 +431,51 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
       }
     }
     if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
-    if (algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
-    if (a == NCCL_ALGO_RING && pEnable == 0) comm->ringbdw[c][p] = 0;
-  }
-
-  for (int c = 0; c < NCCL_NUM_FUNCTIONS; c++) {
-    bool available = false;
-    for (int a = 0; a < NCCL_NUM_ALGORITHMS; a++)
-      for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++)
-        if (comm->bandwidths[c][a][p] != 0) {
-          available = true;
-          goto check_avail;
-        }
-  check_avail:
-    if (available == false) {
-      /* at least set ring algo available */
-      for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++)
-        comm->bandwidths[c][NCCL_ALGO_RING][p] = comm->ringbdw[c][p];
-    }
+    if (algoEnable[c*NCCL_NUM_ALGORITHMS+a] == 0) comm->bandwidths[c][a][p] = 0;
   }
 
   if (comm->rank == 0) {
-    char line[1024];
+    constexpr int lineLen = 1024;
+    char line[lineLen];
+    int offset = 0;
     for (int block=0; block<DIVUP(NCCL_NUM_ALGORITHMS, 3); block++) {
-      sprintf(line, "  Algorithm   |");
+      offset = snprintf(line, lineLen, "  Algorithm   |");
       for (int ba=0; ba<3; ba++) {
-	int a = block*3+ba;
+        int a = block*3+ba;
         if (a >= NCCL_NUM_ALGORITHMS) continue;
-        sprintf(line+strlen(line), " %14s   %14s   %14s |", "", ncclAlgoStr[a], "");
+        offset += snprintf(line+offset, std::max(0, lineLen-offset), " %14s   %14s   %14s |", "", ncclAlgoStr[a], "");
       }
       INFO(NCCL_TUNING, "%s", line);
-      sprintf(line, "  Protocol    |");
+      offset = snprintf(line, lineLen, "  Protocol    |");
       for (int ba=0; ba<3; ba++) {
         for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-          sprintf(line+strlen(line), " %14s |", ncclProtoStr[p]);
+          offset += snprintf(line+offset, std::max(0, lineLen-offset), " %14s |", ncclProtoStr[p]);
         }
       }
       INFO(NCCL_TUNING, "%s", line);
-      sprintf(line, " Max NThreads |");
+      offset = snprintf(line, lineLen, " Max NThreads |");
       for (int ba=0; ba<3; ba++) {
-	int a = block*3+ba;
+        int a = block*3+ba;
         if (a >= NCCL_NUM_ALGORITHMS) continue;
         for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-          sprintf(line+strlen(line), " %14d |", comm->maxThreads[a][p]);
+          offset += snprintf(line+offset, std::max(0, lineLen-offset), " %14d |", comm->maxThreads[a][p]);
         }
       }
       INFO(NCCL_TUNING, "%s", line);
       for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) {
-        sprintf(line, "%13s |", ncclFuncStr[c]);
+        offset = snprintf(line, lineLen, "%13s |", ncclFuncStr[c]);
         for (int ba=0; ba<3; ba++) {
-	  int a = block*3+ba;
+          int a = block*3+ba;
           if (a >= NCCL_NUM_ALGORITHMS) continue;
           for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-            sprintf(line+strlen(line), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
+            offset += snprintf(line+offset, std::max(0, lineLen-offset), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
           }
         }
         INFO(NCCL_TUNING, "%s", line);
       }
     }
   }
-
+ 
   // Set per-thread amount of work before we increase nThreads and nChannels
   for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
     comm->threadThresholds[a][NCCL_PROTO_LL] = NCCL_LL_THREAD_THRESHOLD;
@@ -438,19 +519,10 @@ static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = {
   {  .9,  .9,  .9,  .9,  .9,  .9,  .9,  .8,  .7,  .6,  .6,  .5,  .5,  .5,  .5,  .6,  .7,  .8,  .7,  .7,  .8,  .9,  .9 }
 };
 
-ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time, bool* backup) {
+ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time) {
   float bw = comm->bandwidths[coll][algorithm][protocol];
   float lat = comm->latencies[coll][algorithm][protocol];
 
-  if (backup) {
-    *backup = false;
-    if (algorithm == NCCL_ALGO_RING && bw == 0.0f) {
-      /* try back up RING algorithm */
-      bw = comm->ringbdw[coll][protocol];
-      *backup = true;
-    }
-  }
-
   if (bw == 0) {
     *time = -1.0; return ncclSuccess;
   }
diff --git a/src/graph/xml.cc b/src/graph/xml.cc
index bb123b7980..a412893897 100644
--- a/src/graph/xml.cc
+++ b/src/graph/xml.cc
@@ -17,6 +17,9 @@
 #include <cpuid.h>
 #endif
 
+// Arbitrarily large number for constructing virtual topology string
+#define NCCL_MAX_XML_DEPTH 1024
+
 /*******************/
 /* XML File Parser */
 /*******************/
@@ -430,7 +433,7 @@ static ncclResult_t getBcmLinks(const char* busId, int* nlinks, char** peers) {
 
 ncclResult_t ncclTopoGetStrFromSys(const char* path, const char* fileName, char* strValue) {
   char filePath[PATH_MAX];
-  sprintf(filePath, "%s/%s", path, fileName);
+  snprintf(filePath, sizeof(filePath), "%s/%s", path, fileName);
   int offset = 0;
   FILE* file;
   if ((file = fopen(filePath, "r")) != NULL) {
@@ -883,7 +886,7 @@ ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct nccl
 // where sysPath/subsystem points to.
 ncclResult_t ncclTopoGetSubsystem(const char* sysPath, char* subSys) {
   char subSysPath[PATH_MAX];
-  sprintf(subSysPath, "%s/subsystem", sysPath);
+  snprintf(subSysPath, sizeof(subSysPath), "%s/subsystem", sysPath);
   char* path = realpath(subSysPath, NULL);
   if (path == NULL) {
     subSys[0] = '\0';
@@ -896,8 +899,9 @@ ncclResult_t ncclTopoGetSubsystem(const char* sysPath, char* subSys) {
   return ncclSuccess;
 }
 
-ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode) {
+ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode, struct ncclXmlNode* forceParent) {
   NCCLCHECK(xmlFindTagKv(xml, "net", netNode, "name", netName));
+
   if (*netNode != NULL) return ncclSuccess;
 
   const char* pciSysPath = pciPath;
@@ -906,13 +910,15 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha
     NCCLCHECK(ncclTopoGetSubsystem(pciSysPath, subSystem));
     // This is not a PCI device (virtual, usb, ...).
     if (strcmp(subSystem, "pci") != 0) {
-      INFO(NCCL_GRAPH, "Topology detection: network path %s is not a PCI device (%s). Attaching to first CPU", pciSysPath, subSystem);
+      INFO(NCCL_NET|NCCL_GRAPH, "Topology detection: network path %s is not a PCI device (%s). Attaching to first CPU", pciSysPath, subSystem);
       pciSysPath = NULL;
     }
   }
 
   struct ncclXmlNode* parent = NULL;
-  if (pciSysPath) {
+  if (forceParent) {
+    parent = forceParent;
+  } else if (pciSysPath) {
     int offset;
     for (offset=strlen(pciSysPath)-1; pciSysPath[offset] != '/'; offset--);
     char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
diff --git a/src/graph/xml.h b/src/graph/xml.h
index 0ee56790b7..f06c0e68b2 100644
--- a/src/graph/xml.h
+++ b/src/graph/xml.h
@@ -50,7 +50,7 @@ ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXm
 
 /* Auto-detect functions */
 ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode);
-ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode);
+ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode, struct ncclXmlNode* forceParent=NULL);
 
 /* Remove unneeded parts */
 ncclResult_t ncclTopoTrimXml(struct ncclXml* xml);
@@ -132,6 +132,13 @@ static ncclResult_t xmlGetAttrFloat(struct ncclXmlNode* node, const char* attrNa
   return ncclSuccess;
 }
 
+static ncclResult_t xmlGetAttrFloatDefault(struct ncclXmlNode* node, const char* attrName, float* value, float defaultValue) {
+  const char* text;  // attribute string obtained from xmlGetAttr; a null result selects defaultValue
+  NCCLCHECK(xmlGetAttr(node, attrName, &text));
+  *value = (text == NULL) ? defaultValue : strtof(text, NULL);
+  return ncclSuccess;
+}
+
 static ncclResult_t xmlFindTag(struct ncclXml* xml, const char* tagName, struct ncclXmlNode** node) {
   *node = NULL;
   for (int i=0; i<xml->maxIndex; i++) {
@@ -208,6 +215,24 @@ static ncclResult_t xmlSetAttr(struct ncclXmlNode* node, const char* attrName, c
   return ncclSuccess;
 }
 
+// Log node and then each of its ancestors (following ->parent) at NCCL_GRAPH level, one
+// "<name=... key=value ...>" line per node. Iterative despite the name; `name` is not used.
+static ncclResult_t xmlPrintNodeRecursive(struct ncclXmlNode* node, const char* name) {
+  for (struct ncclXmlNode* cur = node; cur != NULL; cur = cur->parent) {
+    char text[1024*8];
+    snprintf(text, sizeof(text), "<name=%s", cur->name);
+    for (int a = 0; a < cur->nAttrs; a++) {
+      size_t used = strlen(text);
+      snprintf(text + used, sizeof(text) - used, " %s=%s", cur->attrs[a].key, cur->attrs[a].value);
+    }
+    size_t end = strlen(text);
+    snprintf(text + end, sizeof(text) - end, ">");
+    INFO(NCCL_GRAPH, "%s", text);
+  }
+  return ncclSuccess;
+}
+
+
 static ncclResult_t xmlSetAttrIfUnset(struct ncclXmlNode* node, const char* attrName, const char* value) {
   int index;
   NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
diff --git a/src/group.cc b/src/group.cc
index 3d3ecb88c0..e387db70cc 100644
--- a/src/group.cc
+++ b/src/group.cc
@@ -323,7 +323,7 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
   /* reset everything */
   while (!ncclIntruQueueEmpty(asyncJobsPtr)) {
     struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsPtr);
-    if (job->comm && !job->comm->config.blocking)
+    if (!job->destroyFlag && job->comm && !job->comm->config.blocking)
       (void) ncclCommSetAsyncError(job->comm, error);
     if (job->undo) job->undo(job);
     if (job->destructor) job->destructor((void*)job);
@@ -392,7 +392,6 @@ fail:
 }
 
 static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInfo = NULL) {
-  int savedDev;
   ncclResult_t ret = ncclSuccess;
   struct ncclGroupJob *gjob = (struct ncclGroupJob*) job_;
   struct ncclComm *groupCommHeadMain = *gjob->groupCommHeadPtr;
@@ -401,8 +400,6 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
 
   bool *groupAbortFlag = gjob->abortFlagPtr;
 
-  CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail);
-
   if (!simInfo && groupCommPreconnectHeadMain != nullptr) {
     struct ncclComm* comm = groupCommPreconnectHeadMain;
     do {
@@ -454,12 +451,19 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
       }
       comm = comm->groupNext;
     } while (comm);
-
     NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail);
     while (!ncclIntruQueueEmpty(&asyncCollJobs)) {
       struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncCollJobs);
       if (job->destructor) job->destructor((void*)job);
     }
+
+    // done with all buffer allocation, start registration and enqueue
+    comm = groupCommHeadMain;
+    do {
+      CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
+      NCCLCHECKGOTO(ncclTasksRegAndEnqueue(comm), ret, fail);
+      comm = comm->groupNext;
+    } while (comm);
   }
 
   if ((!simInfo) && (groupCommHeadMain != nullptr)) {
@@ -476,6 +480,9 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
   while (groupCommHeadMain != nullptr) {
     struct ncclComm* comm = groupCommHeadMain;
     struct ncclComm* next = comm->groupNext;
+    // Poll for callbacks sent to us from other threads. Typically these free
+    // resources back to our memory pools and UB
+    NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/false), ret, fail);
     (void) ncclGroupCommLeave(comm);
     if (!comm->config.blocking) {
       (void) ncclCommSetAsyncError(comm, ret);
@@ -483,8 +490,6 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
     groupCommHeadMain = next;
   }
 
-  CUDACHECK(cudaSetDevice(savedDev));
-
 exit:
   return ret;
 fail:
@@ -563,7 +568,10 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) {
       ret = ncclInProgress;
     } else {
       /* blocking group */
+      int savedDev;
+      CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail);
       NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base, internalSimInfoPtr), ret, fail);
+      CUDACHECKGOTO(cudaSetDevice(savedDev), ret, fail);
       if (simInfo) memcpy((void*)simInfo, (void*)internalSimInfoPtr, realSize);
       groupResetJobState(ncclGroupJobMainPtr);
     }
diff --git a/src/include/collectives.h b/src/include/collectives.h
index e45d78f26f..c82ebce6fc 100644
--- a/src/include/collectives.h
+++ b/src/include/collectives.h
@@ -10,6 +10,7 @@
 #include "nccl.h"
 #include "nccl_common.h"
 #include "device.h"
+#define NCCL_MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two.
 
 // CHUNKSIZE must be a multiple of SLICESIZE
 #define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
@@ -23,6 +24,7 @@
 #define REDUCE_SLICESTEPS 1
 #define REDUCE_CHUNKSTEPS 1
 #define NCCL_MAX_SLICE_PER_CHUNK 2  // max value for CHUNKSTEPS/SLICESTEPS, must accord with above
+#define NCCL_MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two.
 
 const char* ncclFuncToString(ncclFunc_t op);
 const char* ncclDevRedOpToString(ncclDevRedOp_t op);
@@ -34,11 +36,11 @@ inline int ncclTypeSize(ncclDataType_t type) {
   switch (type) {
   case ncclInt8:
   case ncclUint8:
+  case ncclFloat8e4m3:
+  case ncclFloat8e5m2:
     return 1;
   case ncclFloat16:
-  #if defined(__CUDA_BF16_TYPES_EXIST__)
   case ncclBfloat16:
-  #endif
     return 2;
   case ncclInt32:
   case ncclUint32:
@@ -67,6 +69,319 @@ struct ncclConnFifo {
 
 #include <stdio.h>
 
+class RingAlgorithm {
+protected:
+  int refCount;        // modified only through incRefCount()/decRefCount() below
+  int nRanks;
+  int nStepsPerLoop;   // derived classes compute the loop index as curStep / nStepsPerLoop
+  int chunkSteps;
+  int sliceSteps;
+  ssize_t sliceSize;
+  ssize_t loopSize;    // derived classes advance the buffer offset by curLoop * loopSize
+  ssize_t channelSize;
+  uint8_t *sendbuff;
+  uint8_t *recvbuff;
+  void *sendMhandle;
+  void *recvMhandle;
+  void *srecvMhandle;
+public:
+  // Used by the proxy thread to look up, for the current step of the proxy args, the send or
+  // recv buffer address, its size and the matching memory handle. The derived classes (AR, AG
+  // and BC) are allocated during the enqueue stage and copied to the proxy side through shared
+  // memory. Each copy calls incRefCount(), because one ring algo object can be referenced
+  // several times for send and recv progress. Once all steps are done the count is decremented
+  // and the object is deleted only when refCount == 0.
+  virtual void getNextSendAddr(int curStep, uint8_t **sendbuffOut, size_t *sizeOut, void **mhandleOut) = 0;
+  virtual void getNextRecvAddr(int curStep, uint8_t **recvbuffOut, size_t *sizeOut, void **mhandleOut) = 0;
+  int incRefCount() {
+    return __atomic_add_fetch(&refCount, 1, __ATOMIC_RELAXED);  // returns the incremented count
+  }
+  int decRefCount() {
+    return __atomic_sub_fetch(&refCount, 1, __ATOMIC_RELEASE);  // NOTE(review): release-only; confirm the caller that deletes at 0 synchronizes (acquire) first
+  }
+  RingAlgorithm() { refCount = 0; }  // only refCount is initialized here; the other fields are left to derived constructors
+  virtual ~RingAlgorithm() {};
+};
+
+class RingARAlgorithm : public RingAlgorithm {  // "AR" ring variant (presumably AllReduce — confirm)
+private:
+  int ringIndex;
+  int elemSize;
+  ssize_t chunkSize;
+  int slicePerChunk;  // chunkSteps / sliceSteps, set in the constructor
+public:
+  void getNextSendAddr(int curStep, uint8_t **sendbuffOut, size_t *sizeOut, void **mhandleOut) {
+    int curLoop = curStep / nStepsPerLoop;
+    int curLoopStage = (curStep % nStepsPerLoop) / chunkSteps;
+    int chunkStage = curLoopStage % nRanks;
+    int sliceStage = (curStep % chunkSteps) / sliceSteps;
+    ssize_t elemOffset = curLoop * loopSize;
+    ssize_t remSize = channelSize - elemOffset;
+    ssize_t chunkOffset;
+    ssize_t sliceOffset;
+    ssize_t curSliceSize;
+    ssize_t curChunkSize;
+    ssize_t size;
+    ssize_t nelem;
+    int chunkId;
+
+    if (remSize < loopSize) {  // partial loop: divide what remains across nRanks, rounded up to the alignment
+      curChunkSize = alignUp(divUp(remSize / elemSize, nRanks), 16 / elemSize) * elemSize;
+    } else {
+      curChunkSize = chunkSize;
+    }
+    chunkId = (ringIndex + nRanks - 1 - chunkStage) % nRanks;
+    chunkOffset = chunkId * curChunkSize;
+    nelem = std::min(remSize - chunkOffset, curChunkSize);
+    curSliceSize = std::max(divUp(nelem / elemSize, 16 * slicePerChunk) * 16, sliceSize / elemSize / 32) * elemSize;
+    sliceOffset = sliceStage * curSliceSize;
+
+    if (nelem <= sliceOffset) {  // nothing left for this slice: return the base pointer; size below clamps to 0
+      *sendbuffOut = sendbuff;
+      *mhandleOut = sendMhandle;
+    } else {
+      if (curLoopStage == 0) {  // first stage of a loop reads from sendbuff, later stages from recvbuff
+        *sendbuffOut = sendbuff + elemOffset + chunkOffset + sliceOffset;
+        *mhandleOut = sendMhandle;
+      } else {
+        *sendbuffOut = recvbuff + elemOffset + chunkOffset + sliceOffset;
+        *mhandleOut = srecvMhandle;
+      }
+    }
+    size = std::min(curSliceSize, nelem - sliceOffset);
+    *sizeOut = size < 0 ? 0 : size;  // written unconditionally here; getNextRecvAddr null-checks sizeOut instead
+    return;
+  }
+
+  void getNextRecvAddr(int curStep, uint8_t **recvbuffOut, size_t *sizeOut, void **mhandleOut) {
+    int curLoop = curStep / nStepsPerLoop;
+    int curLoopStage = ((curStep + chunkSteps) % nStepsPerLoop) / chunkSteps;  // shifted by chunkSteps relative to getNextSendAddr
+    int chunkStage = curLoopStage % nRanks;
+    int sliceStage = (curStep % chunkSteps) / sliceSteps;
+    ssize_t elemOffset = curLoop * loopSize;
+    ssize_t remSize = channelSize - elemOffset;
+    ssize_t chunkOffset;
+    ssize_t sliceOffset;
+    ssize_t curSliceSize;
+    ssize_t curChunkSize;
+    ssize_t size;
+    ssize_t nelem;
+    int chunkId;
+
+    if (remSize < loopSize) {  // same partial-loop chunk sizing as getNextSendAddr
+      curChunkSize = alignUp(divUp(remSize / elemSize, nRanks), 16 / elemSize) * elemSize;
+    } else {
+      curChunkSize = chunkSize;
+    }
+
+    if (curLoopStage == 0) {  // the wrapped-around stage uses a fixed chunk id
+      chunkId = (ringIndex + 1) % nRanks;
+    } else {
+      chunkId = (ringIndex + nRanks - 1 - chunkStage) % nRanks;
+    }
+
+    chunkOffset = chunkId * curChunkSize;
+    nelem = std::min(remSize - chunkOffset, curChunkSize);
+    curSliceSize = std::max(divUp(nelem / elemSize, 16 * slicePerChunk) * 16, sliceSize / elemSize / 32) * elemSize;
+    sliceOffset = sliceStage * curSliceSize;
+    if (nelem <= sliceOffset) {  // nothing left for this slice: return the base pointer
+      *recvbuffOut = recvbuff;
+    } else {
+      *recvbuffOut = recvbuff + elemOffset + chunkOffset + sliceOffset;
+    }
+    if (sizeOut) {  // sizeOut is optional for the receive side
+      size = std::min(curSliceSize, nelem - sliceOffset);
+      *sizeOut = size < 0 ? 0 : size;
+    }
+    *mhandleOut = recvMhandle;
+    return;
+  }
+
+  RingARAlgorithm(const void *sendbuff, void *recvbuff, int nRanks, int ringIndex, int chunkSteps, int sliceSteps, size_t chunkSize, size_t sliceSize, size_t gridOffset, size_t channelSize, int elemSize, void *sendMhandle, void *recvMhandle, void *srecvMhandle) {
+    this->ringIndex = ringIndex;
+    this->nRanks = nRanks;
+    this->nStepsPerLoop = 2 * (nRanks - 1) * chunkSteps;
+    this->chunkSteps = chunkSteps;
+    this->sliceSteps = sliceSteps;
+    this->chunkSize = chunkSize;
+    this->sliceSize = sliceSize;
+    this->loopSize = nRanks * chunkSize;
+    this->sendbuff = (uint8_t*)sendbuff + gridOffset;  // both buffers are stored pre-offset by gridOffset
+    this->recvbuff = (uint8_t*)recvbuff + gridOffset;
+    this->channelSize = channelSize;
+    this->elemSize = elemSize;
+    this->sendMhandle = sendMhandle;
+    this->recvMhandle = recvMhandle;
+    this->srecvMhandle = srecvMhandle;
+    this->slicePerChunk = chunkSteps / sliceSteps;
+  }
+  ~RingARAlgorithm() {}
+};
+
+class RingAGAlgorithm : public RingAlgorithm { // Maps a step index to send/recv addresses, sizes and memory handles for a ring (presumably all-gather, per the AG name); base class is outside this excerpt.
+private:
+  int *ringRanks; // caller-provided rank array indexed by ring position; the pointer is stored, not copied
+  int elemSize; // divisor/multiplier used when sizing slices below
+  ssize_t sendSize; // multiplied by a rank index to form the per-rank offset into recvbuff
+  int slicePerChunk; // chunkSteps / sliceSteps, set in the constructor
+public:
+  void getNextSendAddr(int curStep, uint8_t **sendbuffOut, size_t *sizeOut, void **mhandleOut) { // Outputs the source pointer, size and memory handle for step curStep; all three out-pointers are written unconditionally.
+    int curLoop = curStep / nStepsPerLoop;
+    int chunkStage = (curStep % nStepsPerLoop) / chunkSteps; // in [0, nRanks-2] for non-negative curStep
+    int sliceStage = (curStep % chunkSteps) / sliceSteps;
+    ssize_t sliceOffset;
+    ssize_t curSliceSize;
+    ssize_t offset;
+    ssize_t elemOffset = curLoop * loopSize;
+    ssize_t chunkSize = std::min(loopSize, channelSize - elemOffset); // local value: remaining data capped at one loop
+    ssize_t size;
+    int rankDest;
+    uint8_t *buff;
+    void *mhandle;
+
+    curSliceSize = std::max(divUp(chunkSize / elemSize, 16 * slicePerChunk) * 16, sliceSize / elemSize / 32) * elemSize; // larger of (chunk elements split over slicePerChunk in units of 16; divUp presumably rounds up) and sliceSize/elemSize/32, scaled back by elemSize
+    sliceOffset = sliceStage * curSliceSize;
+    if (chunkStage == 0) { // first stage reads from sendbuff
+      rankDest = ringRanks[0]; // assigned but not used in this branch
+      offset = elemOffset + sliceOffset;
+      buff = sendbuff + offset;
+      mhandle = sendMhandle;
+    } else { // later stages read from recvbuff at the slot of ringRanks[nRanks - chunkStage]
+      rankDest = ringRanks[nRanks - chunkStage];
+      offset = elemOffset + rankDest * sendSize + sliceOffset;
+      buff = recvbuff + offset;
+      mhandle = srecvMhandle;
+    }
+    *sendbuffOut = buff;
+    size = std::min(curSliceSize, channelSize - elemOffset - sliceOffset);
+    *sizeOut = size < 0 ? 0 : size; // clamp to 0 when the slice starts past channelSize
+    *mhandleOut = mhandle;
+    return;
+  }
+
+  void getNextRecvAddr(int curStep, uint8_t **recvbuffOut, size_t *sizeOut, void **mhandleOut) { // Outputs the destination pointer, optional size and memory handle for step curStep; sizeOut may be NULL.
+    int curLoop = curStep / nStepsPerLoop;
+    int chunkStage = ((curStep + chunkSteps) % nStepsPerLoop) / chunkSteps; // shifted by one chunk stage relative to the send side
+    int sliceStage = (curStep % chunkSteps) / sliceSteps;
+    ssize_t sliceOffset;
+    ssize_t curSliceSize;
+    ssize_t offset;
+    ssize_t elemOffset = curLoop * loopSize;
+    ssize_t chunkSize = std::min(loopSize, channelSize - elemOffset);
+    ssize_t size;
+    int rankDest;
+
+    curSliceSize = std::max(divUp(chunkSize / elemSize, 16 * slicePerChunk) * 16, sliceSize / elemSize / 32) * elemSize; // same slice sizing as getNextSendAddr
+    sliceOffset = sliceStage * curSliceSize;
+    if (chunkStage == 0) { // wrapped stage: destination slot is that of ringRanks[1]
+      rankDest = ringRanks[1];
+    } else {
+      rankDest = ringRanks[nRanks - chunkStage];
+    }
+    offset = elemOffset + rankDest * sendSize + sliceOffset;
+    *recvbuffOut = recvbuff + offset;
+    if (sizeOut) {
+      size = std::min(sliceSize, channelSize - elemOffset - sliceOffset); // NOTE(review): the send side clamps with curSliceSize, this uses sliceSize — confirm this is intended
+      *sizeOut = size < 0 ? 0 : size;
+    }
+    *mhandleOut = recvMhandle;
+  }
+
+  RingAGAlgorithm(const void *sendbuff, void *recvbuff, int nRanks, int *ringRanks, int chunkSteps, int sliceSteps, size_t chunkSize, size_t sliceSize, size_t gridOffset, size_t channelSize, int elemSize, size_t sendSize, void *sendMhandle, void *recvMhandle, void *srecvMhandle) { // Stores the ring parameters; the chunkSize argument is kept as loopSize.
+    this->ringRanks = ringRanks;
+    this->nRanks = nRanks;
+    this->nStepsPerLoop = (nRanks - 1) * chunkSteps; // nRanks-1 chunk stages of chunkSteps steps each
+    this->chunkSteps = chunkSteps;
+    this->sliceSteps = sliceSteps;
+    this->elemSize = elemSize;
+    this->sliceSize = sliceSize;
+    this->loopSize = chunkSize;
+    this->sendSize = sendSize;
+    this->channelSize = channelSize;
+    this->sendbuff = (uint8_t*)sendbuff + gridOffset; // gridOffset is applied as a byte offset
+    this->recvbuff = (uint8_t*)recvbuff + gridOffset;
+    this->sendMhandle = sendMhandle;
+    this->recvMhandle = recvMhandle;
+    this->srecvMhandle = srecvMhandle;
+    this->slicePerChunk = chunkSteps / sliceSteps; // integer division; any remainder is dropped
+  }
+  ~RingAGAlgorithm() {}
+};
+
+class RingBCAlgorithm : public RingAlgorithm { // Maps a step index to send/recv addresses, sizes and memory handles for a ring (presumably broadcast, per the BC name); base class is outside this excerpt.
+private:
+  int root; // compared against rank to pick the send source buffer
+  int rank;
+  int nextRank; // set from ringRanks[1] in the constructor; not read by the methods of this class
+public:
+  void getNextSendAddr(int curStep, uint8_t **sendbuffOut, size_t *sizeOut, void **mhandleOut) { // Outputs the source pointer, size and memory handle for step curStep; all three out-pointers are written unconditionally.
+    int curLoop = curStep / nStepsPerLoop;
+    int sliceStage = (curStep % chunkSteps) / sliceSteps;
+    ssize_t sliceOffset = sliceStage * sliceSize;
+    ssize_t offset;
+    ssize_t elemOffset = curLoop * loopSize;
+    ssize_t size;
+    uint8_t *buff;
+    void *mhandle;
+
+    offset = elemOffset + sliceOffset;
+    if (offset >= channelSize) { // past the end of the channel: use the base pointer; the size computed below clamps to 0
+      buff = sendbuff;
+      mhandle = sendMhandle;
+    } else if (rank == root) { // the root sends from sendbuff
+      buff = sendbuff + offset;
+      mhandle = sendMhandle;
+    } else { // non-root ranks send from recvbuff
+      buff = recvbuff + offset;
+      mhandle = srecvMhandle;
+    }
+    *sendbuffOut = buff;
+    size = std::min(sliceSize, channelSize - offset);
+    *sizeOut = size < 0 ? 0 : size; // negative remaining size is reported as 0
+    *mhandleOut = mhandle;
+    return;
+  }
+
+  void getNextRecvAddr(int curStep, uint8_t **recvbuffOut, size_t *sizeOut, void **mhandleOut) { // Outputs the destination pointer, optional size and memory handle for step curStep; sizeOut may be NULL.
+    int curLoop = curStep / nStepsPerLoop;
+    int sliceStage = (curStep % chunkSteps) / sliceSteps;
+    ssize_t sliceOffset = sliceStage * sliceSize;
+    ssize_t offset;
+    ssize_t elemOffset = curLoop * loopSize;
+    ssize_t size;
+    offset = elemOffset + sliceOffset;
+    if (offset >= channelSize) { // past the end of the channel: use the base pointer
+      *recvbuffOut = recvbuff;
+    } else {
+      *recvbuffOut = recvbuff + offset;
+    }
+    if (sizeOut) {
+      size = std::min(sliceSize, channelSize - offset);
+      *sizeOut = size < 0 ? 0 : size; // negative remaining size is reported as 0
+    }
+    *mhandleOut = recvMhandle;
+    return;
+  }
+
+  RingBCAlgorithm(const void* sendbuff, void* recvbuff, int rank, int root, int nRanks, int *ringRanks, int chunkSteps, int sliceSteps, size_t chunkSize, size_t sliceSize, size_t gridOffset, size_t channelSize, void *sendMhandle, void *recvMhandle, void *srecvMhandle) { // Stores the ring parameters; nRanks is accepted but not stored by this constructor.
+    this->root = root;
+    this->rank = rank;
+    this->nextRank = ringRanks[1]; // ringRanks must have at least two entries
+    this->nStepsPerLoop = chunkSteps; // one chunk (chunkSteps steps) per loop
+    this->chunkSteps = chunkSteps;
+    this->sliceSteps = sliceSteps;
+    this->sliceSize = sliceSize;
+    this->loopSize = chunkSize;
+    this->channelSize = channelSize;
+    this->sendbuff = (uint8_t*)sendbuff + gridOffset; // gridOffset is applied as a byte offset
+    this->recvbuff = (uint8_t*)recvbuff + gridOffset;
+    this->sendMhandle = sendMhandle;
+    this->recvMhandle = recvMhandle;
+    this->srecvMhandle = srecvMhandle;
+  }
+  ~RingBCAlgorithm() {}
+};
+
 template<typename T>
 class PatRSAlgorithm{
   size_t offset;
@@ -532,10 +847,10 @@ restart:
       int sendDataRank = (rank + nranks + s) % nranks;
       outIx = sendDataRank * count + offset;
       recvDim = s ? firstBitSet(s, nrPow2) : -1;
-      s -= (1<<recvDim);
       if (recvDim == -1) {
         recvOffset = -1;
       } else {
+        s -= (1<<recvDim);
         int foffset = (a*2*scale*aggDelta) >> (recvDim+1);
         recvOffset = (foffset%postFreq)*nelem;
         recvStepOffset = foffset / postFreq;
diff --git a/src/include/comm.h b/src/include/comm.h
index 9d102dfed2..c3f4eb49f6 100644
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -197,12 +197,15 @@ struct ncclTaskColl {
   int32_t algorithm:8, protocol:8;
   uint32_t isCollnet:1, isNvls:1;
   uint32_t devFuncId:30;
-  enum ncclRegBufferType regBufType;
+  int regBufType;
   // number of elements in planner->ipcMemQueue associated with this collective
   int nCleanupQueueElts;
 
   void* sendMhandle;
   void* recvMhandle;
+  void** sendNetHandles;
+  void** recvNetHandles;
+  void** srecvNetHandles;
   // index for IPC record lookup
   uintptr_t sendbuffOffset;
   uintptr_t recvbuffOffset;
@@ -236,6 +239,7 @@ struct ncclKernelPlan {
   struct ncclKernelPlan* next;
 
   bool persistent; // aka captured in a graph
+  bool isHostCbEnq;
   enum ncclDevWorkStorageType workStorageType;
   bool kernelSpecialized;
   void *kernelFn;
@@ -365,6 +369,7 @@ struct ncclKernelPlanner {
 
   struct ncclIntruQueue<struct ncclTaskColl, &ncclTaskColl::next> collTaskQueue;
   struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> collWorkQueue;
+  struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> tmpCollWorkQueue;
   struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next> collCleanupQueue;
 
   //////////////////////////////////////////////////////////////////////////////
@@ -463,6 +468,8 @@ struct ncclComm {
 
   // Counter for tracking CUDA launches (P2P and collectives included)
   uint64_t opCount;
+  // Collective operation counter
+  uint64_t collOpCount;
 
   // Channels for collectives
   int nChannels; // connection nChannels
@@ -486,7 +493,6 @@ struct ncclComm {
   ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
   float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
   float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
-  float ringbdw[NCCL_NUM_FUNCTIONS][NCCL_NUM_PROTOCOLS];
   int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
 
   /* This attribute can indicate the states of communicators and return code of
@@ -532,7 +538,7 @@ struct ncclComm {
   int proxyRefCountOld; /* store proxy post-atomic-sub refcount */
   // Whether this communicator uses collNet
   int collNetSupport;
-  bool collNetRegSupport;
+  bool isOneRPN;
   uint8_t collNetSupportMatrix[4/*sum,prod,max,min*/][ncclNumTypes];
   bool intraNodeP2pSupport;
   int* collNetHeads;
@@ -560,6 +566,7 @@ struct ncclComm {
   // Subset of those in groupNext list. Holds 0x1 if not needing preconnect.
   struct ncclComm* preconnectNext;
   int persistentRefs; // number of persistent plan-lists capturing this comm
+  int noncapturedRefs; // number of non-captured hostStreamPlanCallback on the stream
   struct P2pSchedulePair { int sendRank; int recvRank; } *p2pSchedule;
 
   struct ncclKernelPlanner planner;
@@ -599,9 +606,16 @@ struct ncclComm {
 
   // buffer registration cache
   struct ncclRegCache regCache;
+  int isAllNvlink;
+  bool useNetPXN;
+  bool useGdr;
+  int splitCount;
   uint64_t endMagic;
 };
 
+static_assert(offsetof(struct ncclComm, startMagic) == 0, "startMagic must be the first field of ncclComm");
+static_assert(offsetof(struct ncclComm, endMagic) == sizeof(struct ncclComm) - sizeof(uint64_t), "endMagic must be the last field of ncclComm");
+
 enum ncclLaunchMode {
   ncclLaunchModeInvalid=0,
   ncclLaunchModeParallel,
@@ -644,7 +658,7 @@ inline ncclResult_t ncclCommPollEventCallbacks(struct ncclComm *comm) {
     }
   }
 finish:
-  cudaThreadExchangeStreamCaptureMode(&mode);
+  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
   return ncclSuccess;
 }
 
diff --git a/src/include/debug.h b/src/include/debug.h
index 491ac3e123..4e50cbf5a7 100644
--- a/src/include/debug.h
+++ b/src/include/debug.h
@@ -38,4 +38,6 @@ extern char ncclLastError[];
 
 void ncclSetThreadName(pthread_t thread, const char *fmt, ...);
 
+void ncclResetDebugInit();
+
 #endif
diff --git a/src/include/device.h b/src/include/device.h
index 153b5ae36c..0c861f5952 100644
--- a/src/include/device.h
+++ b/src/include/device.h
@@ -88,24 +88,18 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
 #define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8
 #define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
 
-#define NCCL_DIRECT_WRITE 0x01
-#define NCCL_DIRECT_READ  0x02
+#define NCCL_P2P_WRITE 0x01
+#define NCCL_P2P_READ  0x02
 #define NCCL_DIRECT_NIC   0x04
-#define NCCL_IPC_WRITE    0x08
-#define NCCL_IPC_READ     0x10
-#define NCCL_NVLS_MIN_POLL 0x20
+#define NCCL_NVLS_MIN_POLL 0x80
 
 // Number of named barriers supported by CUDA
 #define NCCL_MAX_GROUPS 16
 
-#define NCCL_MAX_COLLNET_SIZE (1L << 29)
-
-enum ncclRegBufferType {
-  NCCL_REGULAR_BUFFER = 0,
-  NCCL_IPC_REG_BUFFER = 1,
-  NCCL_NVLS_REG_BUFFER = 2,
-  NCCL_COLLNET_REG_BUFFER = 3
-};
+#define NCCL_REGULAR_BUFFER 0x00
+#define NCCL_IPC_REG_BUFFER 0x01
+#define NCCL_NVLS_REG_BUFFER 0x02
+#define NCCL_NET_REG_BUFFER 0x04
 
 struct ncclConnInfo {
   // Regular comm mechanism
@@ -143,8 +137,6 @@ struct ncclConnector {
   struct ncclTransportComm* transportComm;
   void* transportResources;
   struct ncclConnInfo conn;
-  int sendMemSameProcess;
-  int recvMemSameProcess;
 };
 
 struct ncclRing {
@@ -228,7 +220,7 @@ struct alignas(16) ncclDevWorkP2p {
   uint8_t sendChunkSize_u32fp8, recvChunkSize_u32fp8;
 
   uint8_t sendProtoLL:1, recvProtoLL:1;
-  uint8_t sendRegistered:1, recvRegistered:1;
+  uint8_t sendNetReg:1, recvNetReg:1;
   uint8_t sendIpcReg:1, recvIpcReg:1;
 };
 
@@ -267,7 +259,7 @@ struct alignas(16) ncclDevWorkColl {
   //   nChannels == (channelHi - channelLo) + 1
   uint32_t channelLo:8, channelHi:8;
   uint32_t nWarps:8;
-  uint32_t redOpArgIsPtr:1, regUsed:2, oneNode:1, direct:4;
+  uint32_t redOpArgIsPtr:1, regUsed:1, netRegUsed:1, oneNode:1, direct:2, isOneRPN:1;
   uint32_t root;
   void* recvbuff;
   void* sendbuff;
@@ -393,7 +385,7 @@ struct ncclDevComm {
   int nNodes;
   int buffSizes[NCCL_NUM_PROTOCOLS];
   int p2pChunkSize;
-  int isNvlink;
+  int isAllNvlink;
 
   // Work fifo return credits
   uint32_t* workConsumed/*[MAXCHANNELS]*/;
@@ -525,9 +517,7 @@ inline bool ncclNvlsSupported(int devRedOp, int type) {
   case ncclInt64:
   case ncclUint64:
   case ncclFloat16:
-  #if defined(__CUDA_BF16_TYPES_EXIST__)
   case ncclBfloat16:
-  #endif
     return devRedOp == ncclDevSum || devRedOp == ncclDevMinMax;
   case ncclFloat:
   case ncclDouble:
diff --git a/src/include/enqueue.h b/src/include/enqueue.h
index 1bb5a604f5..3eb6c07435 100644
--- a/src/include/enqueue.h
+++ b/src/include/enqueue.h
@@ -25,5 +25,16 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
 ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
 ncclResult_t ncclLaunchFinish(struct ncclComm* comm);
 ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo);
+ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm);
+
+static inline size_t ncclFuncSendCount(ncclFunc_t func, int nRanks, size_t count) { // Returns nRanks*count for ncclFuncReduceScatter, otherwise count.
+  return func == ncclFuncReduceScatter ? nRanks*count : count;
+}
+static inline size_t ncclFuncRecvCount(ncclFunc_t func, int nRanks, size_t count) { // Returns nRanks*count for ncclFuncAllGather, otherwise count.
+  return func == ncclFuncAllGather ? nRanks*count : count;
+}
+static inline size_t ncclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count) { // Larger of the send and recv counts: nRanks*count for AllGather or ReduceScatter, otherwise count.
+  return func == ncclFuncAllGather || func == ncclFuncReduceScatter ? nRanks*count : count;
+}
 
 #endif // End include guard
diff --git a/src/include/graph.h b/src/include/graph.h
index b6d86b398e..602cc8cd9a 100644
--- a/src/include/graph.h
+++ b/src/include/graph.h
@@ -19,7 +19,7 @@ ncclResult_t ncclTopoCudaPath(int cudaDev, char** path);
 
 struct ncclTopoSystem;
 // Build the topology
-ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system);
+ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system, const char* dumpXmlFile=NULL);
 ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system);
 ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system);
 
@@ -33,10 +33,11 @@ ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm);
 
 // Query topology
 ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank);
-ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank);
+ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank);
 ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret);
-ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int64_t netId, int read, int* useGdr);
-ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush);
+ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int rank, int64_t netId, int read, int* useGdr);
+ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush);
+ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail);
 ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net);
 int ncclPxnDisable(struct ncclComm* comm);
 ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
@@ -118,6 +119,6 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
     struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent);
 
 ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs);
-ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time, bool* backup=nullptr);
+ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time);
 
 #endif
diff --git a/src/include/ibvwrap.h b/src/include/ibvwrap.h
index c3709584c3..3a4c42bb21 100644
--- a/src/include/ibvwrap.h
+++ b/src/include/ibvwrap.h
@@ -12,6 +12,8 @@
 #ifndef NCCL_IBVWRAP_H_
 #define NCCL_IBVWRAP_H_
 
+#include <arpa/inet.h>
+#include <netinet/in.h>
 #ifdef NCCL_BUILD_RDMA_CORE
 #include <infiniband/verbs.h>
 #else
@@ -89,4 +91,14 @@ static inline ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv
 
 ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event);
 
+// Converts a GID into a readable string. On success, returns a non-null pointer to gidStr.
+// NULL is returned if there was an error, with errno set to indicate the error.
+// errno = ENOSPC if the converted string would exceed strLen.
+static inline const char* ibvGetGidStr(union ibv_gid* gid, char* gidStr, size_t strLen) {
+  // A GID is a 16B handle; to convert it to a readable form, we use inet_ntop.
+  // sizeof(ibv_gid) == sizeof(struct in6_addr), so AF_INET6 is used and the output is IPv6 text notation.
+  static_assert(sizeof(union ibv_gid) == sizeof(struct in6_addr), "the sizeof struct ibv_gid must be the size of struct in6_addr");
+  return inet_ntop(AF_INET6, gid->raw, gidStr, strLen); // gid must be non-NULL: it is dereferenced unconditionally
+}
+
 #endif //End include guard
diff --git a/src/include/nccl_common.h b/src/include/nccl_common.h
index 26851b17e3..fcf2251fe9 100644
--- a/src/include/nccl_common.h
+++ b/src/include/nccl_common.h
@@ -32,6 +32,7 @@ typedef enum {
   NCCL_BOOTSTRAP = 0x1000,
   NCCL_REG = 0x2000,
   NCCL_PROFILE = 0x4000,
+  NCCL_RAS = 0x8000,
   NCCL_ALL = ~0
 } ncclDebugLogSubSys;
 
diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h
index 467d9fdb89..f165aa1bf0 100644
--- a/src/include/nccl_net.h
+++ b/src/include/nccl_net.h
@@ -13,6 +13,9 @@
 #include <stdint.h>
 
 #define NCCL_NET_HANDLE_MAXSIZE 128
+// Maximum value (1 TiB) that NCCL can accept for the maxP2pBytes and maxCollBytes net properties
+#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L)
+#define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1
 
 #define NCCL_PTR_HOST 0x1
 #define NCCL_PTR_CUDA 0x2
@@ -21,6 +24,161 @@
 // Maximum number of requests per comm object
 #define NCCL_NET_MAX_REQUESTS 32
 
+// Max number of ncclNet objects which can live in the same process
+#define NCCL_NET_MAX_PLUGINS 3
+
+#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4
+#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V9
+
+typedef struct { // Properties passed to makeVDevice() and embedded in ncclNetProperties_v9_t as vProps.
+  int ndevs; // presumably the number of valid entries in devs[] — confirm
+  int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; // device indices; capacity is NCCL_NET_MAX_DEVS_PER_NIC_V9
+} ncclNetVDeviceProps_v9_t;
+typedef ncclNetVDeviceProps_v9_t ncclNetVDeviceProps_t; // unversioned alias for the v9 layout
+
+typedef struct { // Device properties filled in by getProperties() (v9 layout).
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int regIsGlobal;                 // regMr is not tied to a particular comm
+  int forceFlush;                  // Force a flush on receives
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+  ncclNetVDeviceProps_v9_t vProps; // Virtual device properties (ndevs/devs), same type as taken by makeVDevice()
+  size_t maxP2pBytes;              // Max transfer size for point-to-point operations
+  size_t maxCollBytes;             // Max transfer size for collective operations
+} ncclNetProperties_v9_t;
+typedef ncclNetProperties_v9_t ncclNetProperties_t; // unversioned alias for the v9 layout
+
+typedef struct { // Table of function pointers making up the v9 point-to-point network plugin interface.
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); // note: sizes is int* here, while irecv takes size_t*
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+
+  // Create a virtual NIC given the specified properties, which can be accessed at device index d
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props);
+} ncclNet_v9_t;
+
+typedef ncclNet_v9_t ncclNet_t;
+
+#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v9
+
+typedef struct { // Element type of the recvParts/sendParts arrays taken by ncclCollNet_v9_t iallgather/ireducescatter.
+  void* mhandle; // presumably a handle obtained from regMr/regMrDmaBuf for this region — confirm
+  void* address; // presumably the start address of the region — confirm
+  size_t size; // presumably the region length in bytes — confirm
+} ncclNetSGE_v9_t;
+
+typedef struct { // Table of function pointers making up the v9 collective network plugin interface.
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. handles have been created
+  // using listen() above. rank indicates caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Returns whether a reduction operation on a data type is supported.
+  // 1 for supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Performs an asynchronous allreduce operation on the collective group.
+  // May return request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+  ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts, // NOTE(review): undocumented here; presumably an asynchronous allgather analogous to iallreduce — confirm
+                             size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+                             void* sendMhandle, void** request);
+  ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData, // NOTE(review): undocumented here; presumably an asynchronous reduce-scatter analogous to iallreduce — confirm
+                                 size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+                                 ncclDataType_t dataType, ncclRedOp_t redOp,
+                                 void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free collective comm objects
+  ncclResult_t (*closeColl)(void* collComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Create a virtual NIC given the specified properties, which can be accessed at device index d
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props);
+} ncclCollNet_v9_t;
+
+typedef ncclCollNet_v9_t ncclCollNet_t;
+
+#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v9
+
 typedef struct {
   char* name;                      // Used mostly for logging.
   char* pciPath;                   // Path to the PCI device in /sys.
@@ -37,8 +195,6 @@ typedef struct {
   int netDeviceVersion;            // Version number for network offload
 } ncclNetProperties_v8_t;
 
-typedef ncclNetProperties_v8_t ncclNetProperties_t;
-
 typedef struct {
   // Name of the network (mainly for logs)
   const char* name;
@@ -94,10 +250,6 @@ typedef struct {
   ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
 } ncclNet_v8_t;
 
-typedef ncclNet_v8_t ncclNet_t;
-
-#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v8
-
 typedef struct {
   void* mhandle;
   void* address;
@@ -151,10 +303,6 @@ typedef struct {
   ncclResult_t (*closeListen)(void* listenComm);
 } ncclCollNet_v8_t;
 
-typedef ncclCollNet_v8_t ncclCollNet_t;
-
-#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v8
-
 typedef struct {
   char* name;                      // Used mostly for logging.
   char* pciPath;                   // Path to the PCI device in /sys.
diff --git a/src/include/nccl_profiler.h b/src/include/nccl_profiler.h
index 556a0f6e45..a8164d075e 100644
--- a/src/include/nccl_profiler.h
+++ b/src/include/nccl_profiler.h
@@ -16,9 +16,133 @@ enum {
   ncclProfileProxyOp   = (1 << 3),  // proxy operation event type
   ncclProfileProxyStep = (1 << 4),  // proxy step event type
   ncclProfileProxyCtrl = (1 << 5),  // proxy control event type
-  ncclProfileNumEvents = (     6),
 };
 
+typedef struct { // Event descriptor passed to ncclProfiler_v2_t::startEvent.
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
+  int rank;                     // originating rank
+  union { // anonymous union; presumably the valid member is selected by type — confirm
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      size_t trafficBytes;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+    } coll; // collective event attributes
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+    } p2p; // point-to-point event attributes
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;               // presumably non-zero for a send and zero for a recv — confirm
+    } proxyOp;
+
+    struct {
+      int step; // presumably the index of the step within its proxy operation — confirm
+    } proxyStep;
+  };
+} ncclProfilerEventDescr_v2_t;
+
+typedef enum { // State values passed to recordEventState; enumerators take consecutive values starting at 0, so their order matters.
+  ncclProfilerProxyOpSendPosted,
+  ncclProfilerProxyOpSendRemFifoWait,
+  ncclProfilerProxyOpSendTransmitted,
+  ncclProfilerProxyOpSendDone,
+  ncclProfilerProxyOpRecvPosted,
+  ncclProfilerProxyOpRecvReceived,
+  ncclProfilerProxyOpRecvTransmitted,
+  ncclProfilerProxyOpRecvDone,
+
+  /* Legacy proxy profiler states */
+  ncclProfilerProxyStepSendGPUWait,
+  ncclProfilerProxyStepSendWait,
+  ncclProfilerProxyStepRecvWait,
+  ncclProfilerProxyStepRecvFlushWait,
+  ncclProfilerProxyStepRecvGPUWait,
+
+  /* Legacy proxy control states */
+  ncclProfilerProxyCtrlIdle,
+  ncclProfilerProxyCtrlActive,
+  ncclProfilerProxyCtrlSleep,
+  ncclProfilerProxyCtrlWakeup,
+  ncclProfilerProxyCtrlAppend,
+  ncclProfilerProxyCtrlAppendEnd,
+} ncclProfilerEventState_v2_t;
+
+typedef union { // Optional argument to recordEventState carrying attribute updates for a state transition.
+  struct {
+    size_t transSize; // presumably the size transferred so far — confirm
+    int steps; // presumably the number of steps completed — confirm
+  } proxyOp;
+
+  struct {
+    int appendedProxyOps; // presumably the number of proxy ops appended — confirm
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v2_t;
+
+typedef struct { // Table of function pointers making up the v2 profiler plugin interface.
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v2_t;
+
+typedef ncclProfilerEventDescr_v2_t ncclProfilerEventDescr_t;
+typedef ncclProfilerEventState_v2_t ncclProfilerEventState_t;
+typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_t;
+typedef ncclProfiler_v2_t ncclProfiler_t;
+
 typedef struct {
   uint8_t type;                 // event type descriptor: ncclProfileColl, ...
   void* parentObj;              // pointer to the profiler parent object (for coll is the group)
@@ -69,42 +193,8 @@ typedef struct {
   };
 } ncclProfilerEventDescr_v1_t;
 
-typedef enum {
-  ncclProfilerProxyOpSendPosted,
-  ncclProfilerProxyOpSendRemFifoWait,
-  ncclProfilerProxyOpSendTransmitted,
-  ncclProfilerProxyOpSendDone,
-  ncclProfilerProxyOpRecvPosted,
-  ncclProfilerProxyOpRecvReceived,
-  ncclProfilerProxyOpRecvTransmitted,
-  ncclProfilerProxyOpRecvDone,
-
-  /* Legacy proxy profiler states */
-  ncclProfilerProxyStepSendGPUWait,
-  ncclProfilerProxyStepSendWait,
-  ncclProfilerProxyStepRecvWait,
-  ncclProfilerProxyStepRecvFlushWait,
-  ncclProfilerProxyStepRecvGPUWait,
-
-  /* Legacy proxy control states */
-  ncclProfilerProxyCtrlIdle,
-  ncclProfilerProxyCtrlActive,
-  ncclProfilerProxyCtrlSleep,
-  ncclProfilerProxyCtrlWakeup,
-  ncclProfilerProxyCtrlAppend,
-  ncclProfilerProxyCtrlAppendEnd,
-} ncclProfilerEventState_v1_t;
-
-typedef union {
-  struct {
-    size_t transSize;
-    int steps;
-  } proxyOp;
-
-  struct {
-    int appendedProxyOps;
-  } proxyCtrl;
-} ncclProfilerEventStateArgs_v1_t;
+typedef ncclProfilerEventState_v2_t ncclProfilerEventState_v1_t;
+typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_v1_t;
 
 typedef struct {
   const char* name;
@@ -142,9 +232,4 @@ typedef struct {
   ncclResult_t (*finalize)(void* context);
 } ncclProfiler_v1_t;
 
-typedef ncclProfilerEventDescr_v1_t ncclProfilerEventDescr_t;
-typedef ncclProfilerEventState_v1_t ncclProfilerEventState_t;
-typedef ncclProfilerEventStateArgs_v1_t ncclProfilerEventStateArgs_t;
-typedef ncclProfiler_v1_t ncclProfiler_t;
-
 #endif
diff --git a/src/include/nccl_tuner.h b/src/include/nccl_tuner.h
index 5cd02149f9..6e61118b9c 100644
--- a/src/include/nccl_tuner.h
+++ b/src/include/nccl_tuner.h
@@ -11,6 +11,55 @@
 #include "nccl.h"
 #include "nccl_common.h"
 
+// API to be implemented by external tuner
+typedef struct {
+  // Name of the tuner
+  const char* name;
+
+  // Initializes tuner states.
+  // Inputs:
+  //   - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
+  //   - nNodes: number of nodes in current communicator.
+  //   - logFunction: logging callback; useful to integrate plugin logging with NCCL core.
+  // Outputs:
+  //   - context: tuner context object
+  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
+
+  // Gets info (algo, protocol, number of ctas and threads) for a given collective.
+  // Inputs:
+  //   - context: tuner context object
+  //   - collType: collective type, e.g., allreduce, allgather, ...
+  //   - nBytes: collective size in bytes
+  //   - numPipeOps: number of operations in the group
+  //   - numAlgo: number of algorithms in collCostTable
+  //   - numProto: number of protocols in collCostTable
+  //   - regBuff: can register user buffer
+  //
+  // Outputs:
+  //   - nChannels: number of channels (hence SMs) to be used.
+  //
+  // InOut:
+  //   - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
+  //                    NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
+  //
+  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+  // default tuning for the given collective.
+  // Also, the plugin is allowed to not set any output, or set only the
+  // algorithm and protocol, but not only the algorithm or only the protocol.
+  // Unset fields will be set automatically by NCCL.
+  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
+                              int numPipeOps, float** collCostTable, int numAlgo, int numProto,
+                              int regBuff, int* nChannels);
+
+  // Terminates the plugin and cleans up any resources that the plugin allocated.
+  // context: tuner context object
+  ncclResult_t (*destroy)(void* context);
+} ncclTuner_v4_t;
+
+typedef ncclTuner_v4_t ncclTuner_t;
+
+#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4"
+
 // API to be implemented by external tuner
 typedef struct {
   // Name of the tuner
@@ -55,10 +104,6 @@ typedef struct {
   ncclResult_t (*destroy)(void* context);
 } ncclTuner_v3_t;
 
-typedef ncclTuner_v3_t ncclTuner_t;
-
-#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3"
-
 // API to be implemented by external tuner
 typedef struct {
   // Name of the tuner
diff --git a/src/include/net_device.h b/src/include/net_device.h
index 7bb2968c05..5fae9b5424 100644
--- a/src/include/net_device.h
+++ b/src/include/net_device.h
@@ -25,6 +25,7 @@ typedef struct {
 } ncclNetDeviceHandle_v7_t;
 
 typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
-typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_t;
+typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
+typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t;
 
 #endif
diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h
index 7dee7d4aef..72fbf9ce2a 100644
--- a/src/include/nvmlwrap.h
+++ b/src/include/nvmlwrap.h
@@ -302,7 +302,7 @@ extern ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMa
 
 struct ncclNvmlCCStatus {
     bool CCEnabled;
-    bool multiGpuCCEnabled;
+    bool multiGpuProtectedPCIE;
 };
 
 // All ncclNvmlFoo() functions call ncclNvmlEnsureInitialized() implicitly.
diff --git a/src/include/profiler.h b/src/include/profiler.h
index 36774dc848..2b7efe0f69 100644
--- a/src/include/profiler.h
+++ b/src/include/profiler.h
@@ -36,9 +36,9 @@ ncclResult_t ncclProfilerStartRecvProxyOpEvent(int sub, struct ncclProxyArgs* ar
 ncclResult_t ncclProfilerStopProxyOpEvent(int sub, struct ncclProxyArgs* args);
 
 // Proxy Step Start/Stop Event Wrappers
-ncclResult_t ncclProfilerStartSendProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi);
-ncclResult_t ncclProfilerStartRecvProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi);
-ncclResult_t ncclProfilerStopProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi);
+ncclResult_t ncclProfilerStartSendProxyStepEvent(int sub, struct ncclProxyArgs* args, int stepId);
+ncclResult_t ncclProfilerStartRecvProxyStepEvent(int sub, struct ncclProxyArgs* args, int stepId);
+ncclResult_t ncclProfilerStopProxyStepEvent(int sub, struct ncclProxyArgs* args, int stepId);
 
 // Proxy Control Start/Stop Events Wrappers
 ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle);
@@ -46,7 +46,7 @@ ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle);
 
 // Record Event Wrappers
 ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState);
-ncclResult_t ncclProfilerRecordProxyStepEventStates(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi, ncclProfilerEventState_t eState);
+ncclResult_t ncclProfilerRecordProxyStepEventState(int sub, struct ncclProxyArgs* args, int stepId, ncclProfilerEventState_t eState);
 ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, ncclProfilerEventState_t eState);
 
 // Profiler utility functions
diff --git a/src/include/proxy.h b/src/include/proxy.h
index a1c44d6b1f..b6ef0fa9db 100644
--- a/src/include/proxy.h
+++ b/src/include/proxy.h
@@ -15,6 +15,7 @@
 #include <pthread.h>
 #include "shmutils.h"
 #include "p2p.h"
+#include "collectives.h"
 
 typedef enum : uint8_t {
   ncclPatternRing,
@@ -56,7 +57,11 @@ struct ncclProxyOp {
   int root;
   int next;
   int nsteps;
-  int chunkSize;
+  size_t chunkSize;
+  size_t sliceSize;
+  size_t loopSize;
+  size_t loopOffset;
+  size_t channelSize;
   uint8_t sliceSteps;
   uint8_t chunkSteps;
   uint8_t channelId;
@@ -65,13 +70,15 @@ struct ncclProxyOp {
   uint8_t /*ncclFunc_t*/ coll;
   uint8_t /*ncclPattern_t*/ pattern;
   uint8_t protocol;
+  uint8_t algorithm;
   uint8_t reg;
-  // collnet buffer reg handles
+  // collnet/p2p/coll buffer reg handles
   void* sendMhandle;
   void* recvMhandle;
   uint8_t* sendbuff;
   uint8_t* recvbuff;
-
+  int isOneRPN;
+  RingAlgorithm *ringAlgo;
   union ncclProxyOpSpecifics specifics;
 
   // Profiler plugin
@@ -93,19 +100,21 @@ struct ncclProxyOp {
 struct ncclProxySubArgs {
   struct ncclProxyConnection* connection;
   int reg;
-  // p2p mhandle
-  void* mhandle;
   // collnet handles
   void* sendMhandle;
   void* recvMhandle;
   uint8_t* sendbuff;
   uint8_t* recvbuff;
   size_t offset;
+  ssize_t loopSize;
+  ssize_t loopOffset;
   int channelId;
   int nsteps;
   ssize_t nbytes;
+  ssize_t chunkSize;
   int peer;
-
+  int isOneRPN;
+  RingAlgorithm *ringAlgo;
   int groupSize; // Number of consecutive sub operations sharing the same recvComm
   uint64_t base;
   uint64_t posted;
@@ -114,11 +123,14 @@ struct ncclProxySubArgs {
   uint64_t transmitted;
   uint64_t done;
   uint64_t end;
+  int regBufferReady;
   void* requests[NCCL_STEPS];
 
   // Profiler plugin
   int eActivationMask;
   int rank;
+  pid_t pid;
+  void* profilerContext;
   void* taskEventHandle;
   void* opEventHandle;
   void* stepEventHandles[NCCL_STEPS];
@@ -133,10 +145,11 @@ struct ncclProxyArgs {
   proxyProgressFunc_t progress;
   int nsubs;
   int done;
+  int onePPN;
   uint64_t opCount;
   int sliceSteps;
   int chunkSteps;
-  int chunkSize;
+  size_t chunkSize;
   size_t totalSendSize;
   size_t totalRecvSize;
   size_t sendSizePerRound;
@@ -146,16 +159,13 @@ struct ncclProxyArgs {
   uint8_t /*ncclPattern_t*/ pattern;
   uint8_t /*ncclFunc_t*/ coll;
   uint8_t protocol;
+  uint8_t algorithm;
   int state;
   char* sharedBuff[NCCL_STEPS];
   int sharedSize[NCCL_STEPS];
 
   int idle;
 
-  // Profiler plugin
-  pid_t pid;
-  void* profilerContext;
-
   // Element linking
   struct ncclProxyArgs* next;
   struct ncclProxyArgs* nextPeer;
diff --git a/src/include/ras.h b/src/include/ras.h
new file mode 100644
index 0000000000..7909b3dc89
--- /dev/null
+++ b/src/include/ras.h
@@ -0,0 +1,24 @@
+/*************************************************************************
+ * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_RAS_H_
+#define NCCL_RAS_H_
+
+#include "socket.h"
+
+// Structure used to communicate data about NCCL ranks from NCCL threads to RAS.
+struct rasRankInit {
+  union ncclSocketAddress addr;
+  pid_t pid;
+  int cudaDev;
+  int nvmlDev;
+};
+
+ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank);
+ncclResult_t ncclRasCommFini(const struct ncclComm* comm);
+ncclResult_t ncclRasAddRanks(struct rasRankInit* ranks, int nranks);
+
+#endif // !NCCL_RAS_H_
diff --git a/src/include/register.h b/src/include/register.h
index 7c60535d9a..740a645f43 100644
--- a/src/include/register.h
+++ b/src/include/register.h
@@ -6,6 +6,9 @@
 #include <cuda.h>
 #include <stdint.h>
 
+int64_t ncclParamLocalRegister();
+int64_t ncclParamGraphRegister();
+
 enum {
   NET_REG_COMPLETE = 0x01,
   NVLS_REG_COMPLETE = 0x02,
@@ -20,16 +23,21 @@ struct ncclPeerRegIpcAddr {
   uintptr_t* hostPeerRmtAddrs;
 };
 
+struct ncclRegNetHandles {
+  void* handle;
+  struct ncclProxyConnector* proxyConn;
+  struct ncclRegNetHandles* next;
+};
+
 struct ncclReg {
   // common attributes
   size_t pages;
-  int refs;
+  int localRefs;
+  int graphRefs;
   uintptr_t addr;
   uint32_t state;
   // net reg
-  int nDevs;
-  int devs[MAXCHANNELS];
-  void** handles;
+  struct ncclRegNetHandles* netHandleHead;
   // nvls reg
   uintptr_t baseAddr;
   size_t baseSize;
@@ -50,11 +58,12 @@ struct ncclRegCache {
   struct ncclReg **slots;
   int capacity, population;
   uintptr_t pageSize;
-  void* sComms[MAXCHANNELS];
-  void* rComms[MAXCHANNELS];
 };
 
 ncclResult_t ncclRegCleanup(struct ncclComm* comm);
 ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg);
+ncclResult_t ncclCommGraphRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
+ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *handle);
+ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid);
 
 #endif
diff --git a/src/include/shmutils.h b/src/include/shmutils.h
index 43e8afb79a..097b4c6577 100644
--- a/src/include/shmutils.h
+++ b/src/include/shmutils.h
@@ -10,7 +10,7 @@
 #include "nccl.h"
 
 typedef void* ncclShmHandle_t;
-ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle);
+ncclResult_t ncclShmOpen(char* shmPath, size_t shmPathSize, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle);
 ncclResult_t ncclShmClose(ncclShmHandle_t handle);
 ncclResult_t ncclShmUnlink(ncclShmHandle_t handle);
 
diff --git a/src/include/socket.h b/src/include/socket.h
index 60a4138752..f0a3237cee 100644
--- a/src/include/socket.h
+++ b/src/include/socket.h
@@ -17,9 +17,6 @@
 
 #define MAX_IFS 16
 #define MAX_IF_NAME_SIZE 16
-#define SLEEP_INT            1000 // connection retry sleep interval in usec
-#define RETRY_REFUSED_TIMES   2e4 // connection refused retry times before reporting a timeout (20 sec)
-#define RETRY_TIMEDOUT_TIMES    3 // connection timed out retry times (each one can take 20s)
 #define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV)
 #define NCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL
 
@@ -39,9 +36,10 @@ enum ncclSocketState {
   ncclSocketStateConnectPolling = 5,
   ncclSocketStateConnected = 6,
   ncclSocketStateReady = 7,
-  ncclSocketStateClosed = 8,
-  ncclSocketStateError = 9,
-  ncclSocketStateNum = 10
+  ncclSocketStateTerminating = 8,
+  ncclSocketStateClosed = 9,
+  ncclSocketStateError = 10,
+  ncclSocketStateNum = 11
 };
 
 enum ncclSocketType {
@@ -49,14 +47,14 @@ enum ncclSocketType {
   ncclSocketTypeBootstrap = 1,
   ncclSocketTypeProxy = 2,
   ncclSocketTypeNetSocket = 3,
-  ncclSocketTypeNetIb = 4
+  ncclSocketTypeNetIb = 4,
+  ncclSocketTypeRasNetwork = 5
 };
 
 struct ncclSocket {
   int fd;
   int acceptFd;
-  int timedOutRetries;
-  int refusedRetries;
+  int errorRetries;
   union ncclSocketAddress addr;
   volatile uint32_t* abortFlag;
   int asyncFlag;
@@ -64,15 +62,18 @@ struct ncclSocket {
   int salen;
   uint64_t magic;
   enum ncclSocketType type;
+  int customRetry;
+  int finalizeCounter; // Used to keep track of initial handshake for async sockets.
+  char finalizeBuffer[sizeof(uint64_t)]; // Used to keep track of initial handshake for async sockets.
 };
 
-const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1);
+const char *ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1);
 ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair);
 int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs);
 int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs);
 
 // Initialize a socket
-ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0);
+ncclResult_t ncclSocketInit(struct ncclSocket* sock, const union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0, int customRetry = 0);
 // Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call
 ncclResult_t ncclSocketListen(struct ncclSocket* sock);
 ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr);
@@ -88,11 +89,12 @@ ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock);
 #define NCCL_SOCKET_SEND 0
 #define NCCL_SOCKET_RECV 1
 
-ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset);
+ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int* closed = NULL);
 ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset);
 ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size);
 ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size);
 ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int sendSize, struct ncclSocket* recvSock, void* recvPtr, int recvSize);
 ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking);
+ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how);
 ncclResult_t ncclSocketClose(struct ncclSocket* sock);
 #endif
diff --git a/src/include/transport.h b/src/include/transport.h
index cbeb613ca5..37187f69ea 100644
--- a/src/include/transport.h
+++ b/src/include/transport.h
@@ -28,7 +28,6 @@ extern struct ncclTransport netTransport;
 extern struct ncclTransport collNetTransport;
 
 extern struct ncclTransport* ncclTransports[];
-
 // Forward declarations
 struct ncclRing;
 struct ncclConnector;
@@ -115,16 +114,16 @@ struct ncclTransport {
 };
 
 ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
-ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL);
+ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex);
 ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2pSupport, bool* directMode);
 
 ncclResult_t ncclNvlsInit(struct ncclComm* comm);
 ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent);
 ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm);
 ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm);
-ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, int* nCleanupQueueElts);
-ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv);
-ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size);
+ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, int* nCleanupQueueElts);
+ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv);
+ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size);
 ncclResult_t ncclNvlsFree(struct ncclComm* comm);
 
 enum { collNetRecv=0, collNetSend=1 };
@@ -143,4 +142,13 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop
 ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm);
 ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm);
 
+ncclResult_t ncclNetDeregBuffer(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* handle);
+ncclResult_t ncclNetLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, struct ncclConnector** peerConns, int nPeers, int* outRegBufFlag, void** outHandle);
+ncclResult_t ncclNetGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, struct ncclConnector** peerConns, int nPeers, int* outRegBufFlag, void** outHandle, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, int* nCleanupQueueElts);
+
+ncclResult_t ncclRegisterP2pIpcBuffer(struct ncclComm* comm, void* userbuff, size_t size, int peerRank, int* regFlag, void** regAddr, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue);
+ncclResult_t ncclRegisterP2pNetBuffer(struct ncclComm* comm, void* userbuff, size_t size, struct ncclConnector* conn, int* regFlag, void** handle, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue);
+ncclResult_t ncclRegisterCollBuffers(struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, bool* regNeedConnect);
+ncclResult_t ncclRegisterCollNvlsBuffers(struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, bool* regNeedConnect);
+
 #endif
diff --git a/src/include/utils.h b/src/include/utils.h
index 5a1b749a76..383f678c87 100644
--- a/src/include/utils.h
+++ b/src/include/utils.h
@@ -49,8 +49,7 @@ inline uint64_t clockNano() {
   return uint64_t(ts.tv_sec)*1000*1000*1000 + ts.tv_nsec;
 }
 
-/* get any bytes of random data from /dev/urandom, return 0 if it succeeds; else
- * return -1 */
+/* get any bytes of random data from /dev/urandom, return ncclSuccess (0) if it succeeds. */
 inline ncclResult_t getRandomData(void* buffer, size_t bytes) {
   ncclResult_t ret = ncclSuccess;
   if (bytes > 0) {
diff --git a/src/init.cc b/src/init.cc
index 94c2fb10ee..5caaaae09b 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -17,6 +17,7 @@
 #include "graph.h"
 #include "argcheck.h"
 #include "tuner.h"
+#include "ras.h"
 #include <fcntl.h>
 #include <string.h>
 #include <errno.h>
@@ -182,6 +183,8 @@ static ncclResult_t commFree(ncclComm_t comm) {
   if (comm == NULL)
     return ncclSuccess;
 
+  NCCLCHECK(ncclRasCommFini(comm));
+
   /* in commReclaim, we have guaranteed only last rank which calls ncclCommDestroy() will
    * free all intra-process communicators; therefore, we only need to focus on local
    * resource cleanup in commFree(). */
@@ -193,7 +196,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
     }
   }
 
-  CUDACHECK(cudaMemPoolDestroy(comm->memPool));
+  if (comm->memPool) CUDACHECK(cudaMemPoolDestroy(comm->memPool));
 
   delete[] comm->userRedOps;
 
@@ -421,11 +424,6 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
 
   ncclIntruQueueConstruct(&comm->eventCallbackQueue);
 
-  //  setup intraComm0 and intraRanks 0 to default values to ensure proper cleanup of the communicator
-  comm->intraComm0 = comm;
-  comm->intraRank = 0;
-  comm->intraRanks = 1;
-
   return ncclSuccess;
 }
 
@@ -435,6 +433,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
   struct ncclDevCommAndChannels tmpCommAndChans;
   struct ncclDevCommAndChannels *devCommAndChans = NULL;
   struct ncclNvmlCCStatus ccStatus;
+  bool ccEnable;
 
   NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), ret, fail);
   NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail);
@@ -448,7 +447,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
   tmpCommAndChans.comm.node = comm->node;
   tmpCommAndChans.comm.nNodes = comm->nNodes;
   tmpCommAndChans.comm.abortFlag = comm->abortFlagDev;
-  tmpCommAndChans.comm.isNvlink = ncclTopoPathAllNVLink(comm->topo);
+  tmpCommAndChans.comm.isAllNvlink = comm->isAllNvlink;
   for (int p=0; p < NCCL_NUM_PROTOCOLS; p++) {
     tmpCommAndChans.comm.buffSizes[p] = comm->buffSizes[p];
   }
@@ -458,11 +457,9 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
   comm->workArgsBytes = std::min<size_t>(ncclParamWorkArgsBytes(), ncclMaxKernelArgsSize(comm->cudaArch));
 
   memset(&ccStatus, 0, sizeof(ccStatus));
-  if (ncclNvmlGetCCStatus(&ccStatus) == ncclSuccess && ccStatus.CCEnabled) {
+  ccEnable = (ncclSuccess == ncclNvmlGetCCStatus(&ccStatus)) && (ccStatus.CCEnabled || ccStatus.multiGpuProtectedPCIE);
+  if (ccEnable) {
     comm->workFifoBytes = 0;
-    if (ccStatus.multiGpuCCEnabled == false && comm->rank == 0) {
-      WARN("CC On, Multi-GPU CC Off (No inter-GPU communication protection)");
-    }
   } else {
     comm->workFifoBytes = ncclParamWorkFifoBytes();
     if (0 != (comm->workFifoBytes & (comm->workFifoBytes-1))) {
@@ -473,7 +470,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
   }
 
   if (comm->rank == 0) {
-    INFO(NCCL_INIT, "CC %s, Multi-GPU CC %s, workFifoBytes %d", ccStatus.CCEnabled ? "On" : "Off", ccStatus.multiGpuCCEnabled ? "On" : "Off", comm->workFifoBytes);
+    INFO(NCCL_INIT, "CC %s, workFifoBytes %d", ccEnable ? "On" : "Off", comm->workFifoBytes);
   }
 
   if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) {
@@ -608,9 +605,6 @@ NCCL_PARAM(P2pPciChunkSize, "P2P_PCI_CHUNKSIZE", (1 << 17)); /* 128 kB */
 NCCL_PARAM(P2pNvlChunkSize, "P2P_NVL_CHUNKSIZE", (1 << 19)); /* 512 kB */
 
 static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
-  int cpuArch, cpuVendor, cpuModel;
-  NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
-
   int64_t envs[NCCL_NUM_PROTOCOLS] = { ncclParamLlBuffSize(), ncclParamLl128BuffSize(), ncclParamBuffSize() };
   int defaults[NCCL_NUM_PROTOCOLS] = { DEFAULT_LL_BUFFSIZE, DEFAULT_LL128_BUFFSIZE, DEFAULT_BUFFSIZE };
 
@@ -619,7 +613,7 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
   }
 
   if (comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize();
-  else if (ncclTopoPathAllNVLink(comm->topo)) comm->p2pChunkSize = ncclParamP2pNvlChunkSize();
+  else if (comm->isAllNvlink) comm->p2pChunkSize = ncclParamP2pNvlChunkSize();
   else comm->p2pChunkSize = ncclParamP2pPciChunkSize();
 
   // Make sure P2P chunksize is not larger than coll chunksize.
@@ -850,6 +844,14 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
   } while(0);
 
   timers[TIMER_INIT_TOPO] = clockNano();
+
+  // Dump XML if requested by user
+  const char* dumpXmlFile;
+  dumpXmlFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
+  if (dumpXmlFile) {
+    NCCLCHECKGOTO(ncclTopoGetSystem(comm, NULL, dumpXmlFile), ret, fail);
+  }
+
   // Topo detection / System graph creation
   NCCLCHECKGOTO(ncclTopoGetSystem(comm, &comm->topo), ret, fail);
   // Compute paths between GPUs and NICs
@@ -1076,9 +1078,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
       INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
       comm->collNetSupport = 0;
     }
-    // As long as there is more than 1 rank on any node, we need to disable collnet reg
-    comm->collNetRegSupport = (comm->maxLocalRanks == 1);
   }
+  comm->isAllNvlink = ncclTopoPathAllNVLink(comm->topo);
+  comm->isOneRPN = (comm->maxLocalRanks == 1);
 
   NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
   NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, parent), ret, fail);
@@ -1293,7 +1295,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
   // launch NCCL kernels before all cuda mem allocation is complete. That could cause a deadlock.
   NCCLCHECKGOTO(devCommSetup(comm), ret, fail);
   timers[TIMER_INIT_CONNECT] = clockNano() -  timers[TIMER_INIT_CONNECT];
-
   /* Local intra-node barrier */
   NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
 
@@ -1338,6 +1339,7 @@ struct ncclCommInitRankAsyncJob {
   // for ncclCommSplit
   struct ncclComm* parent;
   int color, key;
+  int splitCount;
   // name of the function calling
   char funcName[NCCL_COMMINIT_FUNCNAME_LEN];
 };
@@ -1432,13 +1434,14 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
     timers[TIMER_INIT_ALLOC] = clockNano();
     NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail);
     timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC];
-    // obtain a unique hash for the comm, re-using part of the parent's hash, commHash is a 64bit struct (=16 hex), add the color
+    // obtain a unique hash for the comm, re-using part of the parent's hash, commHash is a 64bit struct (=16 hex),
+    // add unique split counter and the color
     ncclUniqueId tmpId;
     memset(&tmpId,0,sizeof(ncclUniqueId));// must set 0 here to avoid undefined bits
-    snprintf((char*)&tmpId, NCCL_UNIQUE_ID_BYTES, "%016lx-%d", job->parent->commHash, job->color);
+    snprintf((char*)&tmpId, NCCL_UNIQUE_ID_BYTES, "%016lx-%d-%d", job->parent->commHash, job->splitCount, job->color);
     comm->commHash = getHash(tmpId.internal, NCCL_UNIQUE_ID_BYTES);
-    INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d- Init START", job->funcName,
-         comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key);
+    INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p splitCount %d color %d key %d- Init START", job->funcName,
+         comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key);
     timers[TIMER_INIT_BOOTSTRAP] = clockNano();
     NCCLCHECKGOTO(bootstrapSplit(comm->commHash, comm, job->parent, job->color, job->key, parentRanks), res, fail);
     timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP];
@@ -1474,8 +1477,8 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
     /* unlink child abort flag. */
     __atomic_store_n(&job->parent->childAbortFlag, NULL, __ATOMIC_RELEASE);
     TRACE_CALL("ncclCommSplit(%p, %d, %d, %p, %d, %d)", job->parent, job->color, job->key, comm, comm->rank, comm->nRanks);
-    INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d - Init COMPLETE", job->funcName,
-         comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key);
+    INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p splitCount %d color %d key %d - Init COMPLETE", job->funcName,
+         comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key);
   } else {
     // the name for the replay tool is ncclCommInitRank for all the variations
     TRACE_CALL("ncclCommInitRank(%p, %d, 0x%llx, %d, %d)", comm, comm->nRanks, commIdHash, comm->rank, comm->cudaDev);
@@ -1716,8 +1719,8 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId
   comm->startMagic = comm->endMagic = NCCL_MAGIC; // Used to detect comm corruption.
   *comm->abortFlagRefCount = 1;
   NCCLCHECKGOTO(parseCommConfig(comm, config), res, fail);
-  /* start with ncclInternalError and will be changed to ncclSuccess if init succeeds. */
-  comm->initState = ncclInternalError;
+  /* start with ncclInProgress and will be changed to ncclSuccess if init succeeds. */
+  comm->initState = ncclInProgress;
   *newcomm = comm;
 
   NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
@@ -1749,6 +1752,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId
 exit:
   return ncclGroupErrCheck(res);
 fail:
+  if (job) ncclCommInitJobFree(job);
   if (comm) {
     free(comm->abortFlag);
     if (comm->abortFlagDev) (void)ncclCudaHostFree((void*)comm->abortFlagDev);
@@ -1846,7 +1850,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
   NCCLCHECKGOTO(ncclGroupEnd(), ret, fail);
 
 exit:
-  cudaSetDevice(oldDev);
+  (void)cudaSetDevice(oldDev);
   free(gpuFlags);
   return ret;
 fail:
@@ -1926,14 +1930,9 @@ fail:
 static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) {
   struct ncclCommFinalizeAsyncJob* job = (struct ncclCommFinalizeAsyncJob*) job_;
   ncclComm_t comm = job->comm;
-  int savedDevice;
-  int commDevice = comm->cudaDev;
   ncclResult_t ret = ncclSuccess;
 
-  CUDACHECKGOTO(cudaGetDevice(&savedDevice), ret, fail);
-  if (savedDevice != commDevice) {
-    CUDACHECKGOTO(cudaSetDevice(commDevice), ret, fail);
-  }
+  CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
 
   TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d asyncResult %d", comm, comm->rank, *comm->abortFlag, comm->asyncResult);
 
@@ -1963,10 +1962,6 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) {
     WARN("ncclProxyStop: comm %p (rank = %d) destroys proxy resource error %d", comm, comm->rank, ret);
   }
 
-  if (savedDevice != commDevice) {
-    CUDACHECKGOTO(cudaSetDevice(savedDevice), ret, fail);
-  }
-
 exit:
   return ret;
 fail:
@@ -1974,25 +1969,12 @@ fail:
 }
 
 static ncclResult_t commCleanup(ncclComm_t comm) {
-  int savedDevice;
-  int commDevice = comm->cudaDev;
-
-  CUDACHECK(cudaGetDevice(&savedDevice));
-  if (savedDevice != commDevice) {
-    CUDACHECK(cudaSetDevice(commDevice));
-  }
-
+  CUDACHECK(cudaSetDevice(comm->cudaDev));
   if (comm->tuner != NULL) {
     NCCLCHECK(comm->tuner->destroy(comm->tunerContext));
     NCCLCHECK(ncclTunerPluginUnload(comm));
   }
-
   NCCLCHECK(commFree(comm));
-
-  if (savedDevice != commDevice) {
-    CUDACHECK(cudaSetDevice(savedDevice));
-  }
-
   return ncclSuccess;
 }
 
@@ -2099,6 +2081,7 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
   NVTX3_FUNC_WITH_PARAMS(CommDestroy, CommInitRankSchema, payload)
 
   TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId);
+  NCCLCHECK(ncclGroupStartInternal());
   // Try and prevent a double free of the comm struct (user error)
   if (comm->rank == -1 || comm->nRanks == -1 || comm->cudaDev == -1 || comm->busId == -1) {
     WARN("comm %p has already been destroyed", comm);
@@ -2113,6 +2096,8 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
   NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail);
 
 exit:
+  ncclGroupErrCheck(res);
+  NCCLCHECK(ncclGroupEndInternal());
   return res;
 fail:
   goto exit;
@@ -2124,7 +2109,7 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
     NVTX3_FUNC_RANGE_IN(nccl_domain);
     return ncclSuccess;
   }
-
+  NCCLCHECK(ncclGroupStartInternal());
   // Ask anything that might still be running on the device to quit
   if (comm->childAbortFlag != nullptr) {
     __atomic_store_n(comm->childAbortFlag, 1, __ATOMIC_RELEASE);
@@ -2152,6 +2137,8 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
   NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail);
 
 exit:
+  ncclGroupErrCheck(res);
+  NCCLCHECK(ncclGroupEndInternal());
   return ncclSuccess;
 fail:
   goto exit;
@@ -2218,14 +2205,15 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc
       NCCLCHECKGOTO(parseCommConfig(childComm, config), res, fail);
     }
 
-    /* start with ncclInternalError and will be changed to ncclSuccess if init succeeds. */
-    childComm->initState = ncclInternalError;
+    /* start with ncclInProgress and will be changed to ncclSuccess if init succeeds. */
+    childComm->initState = ncclInProgress;
   }
 
   NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
   job->comm = childComm;
   job->newcomm = newcomm;
   job->parent = comm;
+  job->splitCount = ++comm->splitCount;
   job->color = color;
   job->key = key;
   job->cudaDev = comm->cudaDev;
@@ -2233,13 +2221,13 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc
   NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, NULL, free, comm), res, fail);
 
 exit:
-  cudaSetDevice(oldDev);
+  (void)cudaSetDevice(oldDev);
   (void)ncclGroupErrCheck(res);
   NCCLCHECK(ncclGroupEndInternal());
   return res;
 fail:
   if (childComm) {
-    if (comm && !comm->config.splitShare) {
+    if (!comm->config.splitShare) {
       free(childComm->abortFlag);
       if (childComm->abortFlagDev) ncclCudaHostFree(childComm->abortFlagDev);
       free(childComm->abortFlagRefCount);
@@ -2347,14 +2335,12 @@ ncclResult_t  ncclMemAlloc(void **ptr, size_t size) {
 
   CUDACHECK(cudaGetDevice(&cudaDev));
   CUCHECK(cuDeviceGet(&currentDev, cudaDev));
-  if (CUPFN(cuMulticastCreate) != NULL)
-    CUCHECK(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, currentDev));
 
-  if (mcSupport) {
+  if (ncclCuMemEnable()) {
     int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
     // Query device to see if FABRIC handle support is available
     flag = 0;
-    (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));;
+    (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));
     if (flag) requestedHandleTypes |= CU_MEM_HANDLE_TYPE_FABRIC;
     memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
     memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
@@ -2365,18 +2351,24 @@ ncclResult_t  ncclMemAlloc(void **ptr, size_t size) {
     CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev));
     if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1;
     CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
-
-    /* mc property */
     CUDACHECK(cudaGetDeviceCount(&dcnt));
-    mcprop.size = size;
-    /* device cnt is a dummy value right now, it might affect mc granularity in the future. */
-    mcprop.numDevices = dcnt;
-    mcprop.handleTypes = requestedHandleTypes;
-    mcprop.flags = 0;
-    CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
 
-    /* only size needs to be aligned to mcGran */
-    ALIGN_SIZE(size, mcGran);
+    if (CUPFN(cuMulticastCreate) != NULL) CUCHECK(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, currentDev));
+    if (mcSupport) {
+      /* mc property */
+      mcprop.size = size;
+      /* device cnt is a dummy value right now, it might affect mc granularity in the future. */
+      mcprop.numDevices = dcnt;
+      mcprop.handleTypes = requestedHandleTypes;
+      mcprop.flags = 0;
+      CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
+
+      /* only size needs to be aligned to mcGran */
+      ALIGN_SIZE(size, mcGran);
+    } else {
+      ALIGN_SIZE(size, memGran);
+    }
+
     if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC) {
       /* First try cuMemCreate() with FABRIC handle support and then remove if it fails */
       CUresult err = CUPFN(cuMemCreate(&handle, size, &memprop, 0));
@@ -2403,6 +2395,7 @@ ncclResult_t  ncclMemAlloc(void **ptr, size_t size) {
         accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
         CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
       }
+      if (0 == p2p && i != cudaDev) INFO(NCCL_ALLOC, "P2P not supported between GPU%d and GPU%d", cudaDev, i);
     }
     goto exit;
   }
@@ -2429,18 +2422,13 @@ ncclResult_t  ncclMemFree(void *ptr) {
   CUDACHECK(cudaGetDevice(&saveDevice));
 #if CUDART_VERSION >= 12010
   CUdevice ptrDev = 0;
-  int mcSupport = 0;
 
   if (ptr == NULL) goto fallback;
-
   if (ncclCudaLibraryInit() != ncclSuccess) goto fallback;
 
   CUCHECKGOTO(cuPointerGetAttribute((void*)&ptrDev, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)ptr), ret, fail);
-  if (CUPFN(cuMulticastCreate) != NULL)
-    CUCHECKGOTO(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, ptrDev), ret, fail);
-
   CUDACHECKGOTO(cudaSetDevice((int)ptrDev), ret, fail);
-  if (mcSupport) {
+  if (ncclCuMemEnable()) {
     NCCLCHECKGOTO(ncclCuMemFree(ptr), ret, fail);
     goto exit;
   }
diff --git a/src/misc/cudawrap.cc b/src/misc/cudawrap.cc
index 03e3bde992..e5fec1e46c 100644
--- a/src/misc/cudawrap.cc
+++ b/src/misc/cudawrap.cc
@@ -11,7 +11,7 @@
 
 // This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage
 NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", -2);
-NCCL_PARAM(CuMemHostEnable, "CUMEM_HOST_ENABLE", 0);
+NCCL_PARAM(CuMemHostEnable, "CUMEM_HOST_ENABLE", -1);
 // Handle type used for cuMemCreate()
 CUmemAllocationHandleType ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
 
@@ -35,9 +35,6 @@ int ncclIsCuMemSupported() {
   // Query device to see if CUMEM VMM support is available
   CUCHECKGOTO(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, currentDev), ret, error);
   if (!flag) return 0;
-  // Query device to see if CUMEM RDMA support is available
-  CUCHECKGOTO(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev), ret, error);
-  if (!flag) return 0;
 error:
   return (ret == ncclSuccess);
 #endif
@@ -49,11 +46,31 @@ int ncclCuMemEnable() {
   return  param >= 0 ? param : (param == -2 && ncclCuMemSupported);
 }
 
+static int ncclCumemHostEnable = -1;
 int ncclCuMemHostEnable() {
+  if (ncclCumemHostEnable != -1)
+    return ncclCumemHostEnable;
 #if CUDART_VERSION < 12020
-  return 0;
+  ncclCumemHostEnable = 0;
+  return ncclCumemHostEnable;
 #else
-  return ncclParamCuMemHostEnable();
+  ncclResult_t ret = ncclSuccess;
+  int cudaDriverVersion;
+  int paramValue = -1;
+  CUDACHECKGOTO(cudaDriverGetVersion(&cudaDriverVersion), ret, error);
+  if (cudaDriverVersion < 12020) {
+    ncclCumemHostEnable = 0;
+  }
+  else {
+    paramValue = ncclParamCuMemHostEnable();
+    if (paramValue != -1)
+      ncclCumemHostEnable = paramValue;
+    else
+      ncclCumemHostEnable = (cudaDriverVersion >= 12060) ? 1 : 0;
+  }
+  return ncclCumemHostEnable;
+error:
+  return (ret == ncclSuccess);
 #endif
 }
 
@@ -218,10 +235,9 @@ static void initOnceFunc() {
   // Determine whether we support the cuMem APIs or not
   ncclCuMemSupported = ncclIsCuMemSupported();
 
-#if 12020 <= CUDART_VERSION && CUDART_VERSION <= 12030
-  /* To use cuMem* for host memory allocation, we need to create context on each
-   * visible device. This is workaround needed in CUDA 12.3 which is fixed in 12.4. */
-  if (ncclCuMemSupported && ncclCuMemHostEnable()) {
+  /* To use cuMem* for host memory allocation, we need to create context on each visible device.
+   * This is a workaround needed in CUDA 12.2 and CUDA 12.3 which is fixed in 12.4. */
+  if (ncclCuMemSupported && ncclCuMemHostEnable() && 12020 <= driverVersion && driverVersion <= 12030) {
     int deviceCnt, saveDevice;
     cudaGetDevice(&saveDevice);
     cudaGetDeviceCount(&deviceCnt);
@@ -231,7 +247,6 @@ static void initOnceFunc() {
     }
     cudaSetDevice(saveDevice);
   }
-#endif
   initResult = ret;
   return;
 error:
diff --git a/src/misc/ibvwrap.cc b/src/misc/ibvwrap.cc
index eb4e52b606..698465ca48 100644
--- a/src/misc/ibvwrap.cc
+++ b/src/misc/ibvwrap.cc
@@ -8,6 +8,7 @@
 #include <sys/types.h>
 #include <unistd.h>
 
+#include "ibvcore.h"
 #include "ibvsymbols.h"
 
 static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
@@ -53,7 +54,7 @@ ncclResult_t wrap_ibv_symbols(void) {
   } \
   int ret = container.call; \
   if (ret == ENOTSUP || ret == EOPNOTSUPP) { \
-    INFO(NCCL_NET, "Call to " name " failed with error %s errno %d", strerror(ret), ret); \
+    INFO(NCCL_NET, "Call to " name " not supported"); \
     *supported = 0; \
     return ncclSuccess; \
   } else if (ret != success_retval) { \
@@ -87,6 +88,14 @@ ncclResult_t wrap_ibv_symbols(void) {
   container.call; \
   return ncclSuccess;
 
+NCCL_PARAM(IbMQpRetryAll, "IB_MQP_RETRY_ALL", 0);
+NCCL_PARAM(IbMQpRetryCnt, "IB_MQP_RETRY_CNT", 34);
+NCCL_PARAM(IbMQpRetryTimeout, "IB_MQP_RETRY_SLEEP_MSEC", 100); // in milliseconds
+
+#define IBV_ERR_EQ(e, code)        (e == code || e == (-code))
+#define IBV_MQP_RETRY_ERRNO(e)     (IBV_ERR_EQ(e, ETIMEDOUT))
+#define IBV_MQP_RETRY_ERRNO_ALL(e) (ncclParamIbMQpRetryAll() ? (e != 0) : IBV_MQP_RETRY_ERRNO(e))
+
 ncclResult_t wrap_ibv_fork_init() {
   IBV_INT_CHECK(ibvSymbols, ibv_internal_fork_init, ibv_internal_fork_init(), -1, "ibv_fork_init");
 }
@@ -202,8 +211,87 @@ ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct i
   IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_create_qp, ibv_internal_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp");
 }
 
-ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
-  IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_modify_qp, ibv_internal_modify_qp(qp, attr, attr_mask), 0, "ibv_modify_qp");
+static void ibvQpStateName(enum ibv_qp_state state, char* msg, const size_t len) { // writes a printable name for `state` into msg (bounded by len via snprintf)
+  switch (state) {
+  case (IBV_QPS_RESET): snprintf(msg, len, "RESET"); break;
+  case (IBV_QPS_INIT): snprintf(msg, len, "INIT"); break;
+  case (IBV_QPS_RTR): snprintf(msg, len, "RTR"); break;
+  case (IBV_QPS_RTS): snprintf(msg, len, "RTS"); break;
+  case (IBV_QPS_SQD): snprintf(msg, len, "SQD"); break;
+  case (IBV_QPS_SQE): snprintf(msg, len, "SQE"); break;
+  case (IBV_QPS_ERR): snprintf(msg, len, "ERR"); break;
+  case (IBV_QPS_UNKNOWN): snprintf(msg, len, "UNKNOWN"); break;
+  default: snprintf(msg, len, "NOT RECOGNIZED (%d)", state); break; // values outside the listed states are printed numerically
+  }
+}
+
+#define QP_ATTR(attr, userAttr, userFlag, mask) ((userFlag & mask) ? (userAttr) : (attr))
+
+static void ibvModifyQpLog(struct ibv_qp* qp, enum ibv_qp_state qpState, struct ibv_qp_attr* userAttr, int userFlag, char* msg, size_t msgLen) { // formats a one-line description of a QP transition (device, port, states, GIDs) into msg
+  ncclResult_t res;
+  int portNum = -1, gidIndex = -1; // -1 is what gets printed when the value could not be determined
+  char localGidName[INET6_ADDRSTRLEN], remoteGidName[INET6_ADDRSTRLEN];
+  const char *localGidRes = NULL, *remoteGidRes = NULL; // stay NULL (printed as "N/A") unless the matching GID string was produced
+
+  char nextState[32], currState[32];
+  ibvQpStateName(qp->state, currState, sizeof(currState)); // current state comes from the QP itself
+  ibvQpStateName(qpState, nextState, sizeof(nextState)); // next state is the one requested by the caller
+  char devName[IBV_SYSFS_NAME_MAX] = "";
+  snprintf(devName, sizeof(devName), "%s", (qp->pd->context) ? wrap_ibv_get_device_name(qp->pd->context->device) : "N/A");
+
+  struct ibv_qp_attr attr;
+  struct ibv_qp_init_attr init_attr;
+  int attr_mask = IBV_QP_PORT | IBV_QP_AV;
+  res = wrap_ibv_query_qp(qp, &attr, attr_mask, &init_attr);
+  struct ibv_qp_attr *qpAttr = (res == ncclSuccess) ? &attr : NULL; // queried attributes are only trusted when the query succeeded
+
+  // port info, portAttr can be NULL if not given by the user and query_qp failed
+  struct ibv_qp_attr *portAttr = QP_ATTR(qpAttr, userAttr, userFlag, IBV_QP_PORT); // user-supplied attr wins when its flag is set in userFlag
+  portNum = portAttr ? portAttr->port_num : -1;
+
+  // address info, avAttr can be NULL if not given by the user and query_qp failed
+  struct ibv_qp_attr *avAttr = QP_ATTR(qpAttr, userAttr, userFlag, IBV_QP_AV);
+  if (avAttr && avAttr->ah_attr.is_global) { // GIDs are only read when the address handle is marked global
+    union ibv_gid *remoteGid = &avAttr->ah_attr.grh.dgid;
+    remoteGidRes = ibvGetGidStr(remoteGid, remoteGidName, sizeof(remoteGidName));
+    // we need pd->context to retrieve local GID, skip if not there
+    if (!qp->pd->context) goto print;
+    gidIndex =  avAttr->ah_attr.grh.sgid_index;
+    union ibv_gid localGid;
+    NCCLCHECKGOTO(wrap_ibv_query_gid(qp->pd->context, portNum, gidIndex, &localGid), res, print); // on failure the message is still emitted, with local GID "N/A"
+    localGidRes = ibvGetGidStr(&localGid, localGidName, sizeof(localGidName));
+  }
+
+print:
+  snprintf(msg, msgLen, "on dev %s:%d, curr state %s, next state %s, local GID index %d, local GID %s, remote GID %s",
+           devName, portNum, currState, nextState, gidIndex, localGidRes ? localGidName : "N/A", remoteGidRes ? remoteGidName : "N/A");
+  return;
+}
+
+ncclResult_t wrap_ibv_modify_qp(struct ibv_qp* qp, struct ibv_qp_attr* attr, int attr_mask) { // modifies a QP, retrying retryable failures; returns ncclSuccess or ncclSystemError
+  char qpMsg[1024];
+  int ret = 0, attempts = 0;
+  int maxCnt = (int)ncclParamIbMQpRetryCnt() + 1; // number of attempts = number of retry + 1
+  int timeOut = (int)ncclParamIbMQpRetryTimeout();
+  CHECK_NOT_NULL(ibvSymbols, ibv_internal_modify_qp);
+  do { // ret may be a negated errno (IBV_ERR_EQ accepts both signs), so the sign is normalized before strerror() below
+    if (attempts > 0) {
+      unsigned int sleepTime = timeOut * attempts; // sleep grows linearly with the attempt number
+      ibvModifyQpLog(qp, attr->qp_state, attr, attr_mask, qpMsg, sizeof(qpMsg));
+      INFO(NCCL_NET, "Call to ibv_modify_qp failed with %d %s, %s, retrying %d/%d after %u msec of sleep", ret, strerror(ret < 0 ? -ret : ret), qpMsg, attempts, maxCnt, sleepTime);
+      // sleep before retrying
+      struct timespec tv = {.tv_sec = sleepTime / 1000, .tv_nsec = (sleepTime % 1000) * ((long)1e6)};
+      nanosleep(&tv, NULL);
+    }
+    ret = ibvSymbols.ibv_internal_modify_qp(qp, attr, attr_mask);
+    attempts++;
+  } while (IBV_MQP_RETRY_ERRNO_ALL(ret) && attempts < maxCnt);
+  if (ret != 0) { // retries exhausted or the error is not retryable
+    ibvModifyQpLog(qp, attr->qp_state, attr, attr_mask, qpMsg, sizeof(qpMsg));
+    WARN("Call to ibv_modify_qp failed with %d %s, %s", ret, strerror(ret < 0 ? -ret : ret), qpMsg);
+    return ncclSystemError;
+  }
+  return ncclSuccess;
 }
 
 ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
diff --git a/src/misc/ipcsocket.cc b/src/misc/ipcsocket.cc
index 2d17f47e69..23746b3c5c 100644
--- a/src/misc/ipcsocket.cc
+++ b/src/misc/ipcsocket.cc
@@ -189,14 +189,16 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen,
 
   TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d fd %d to UDS socket %s", hdr, hdrLen, sendFd, temp);
 
-  msg.msg_control = control_un.control;
-  msg.msg_controllen = sizeof(control_un.control);
+  if (sendFd != -1) {
+    msg.msg_control = control_un.control;
+    msg.msg_controllen = sizeof(control_un.control);
 
-  cmptr = CMSG_FIRSTHDR(&msg);
-  cmptr->cmsg_len = CMSG_LEN(sizeof(int));
-  cmptr->cmsg_level = SOL_SOCKET;
-  cmptr->cmsg_type = SCM_RIGHTS;
-  memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
+    cmptr = CMSG_FIRSTHDR(&msg);
+    cmptr->cmsg_len = CMSG_LEN(sizeof(int));
+    cmptr->cmsg_level = SOL_SOCKET;
+    cmptr->cmsg_type = SCM_RIGHTS;
+    memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
+  }
 
   msg.msg_name = (void *)&cliaddr;
   msg.msg_namelen = sizeof(struct sockaddr_un);
diff --git a/src/misc/nvmlwrap.cc b/src/misc/nvmlwrap.cc
index f441af80b1..66ba2d4c85 100644
--- a/src/misc/nvmlwrap.cc
+++ b/src/misc/nvmlwrap.cc
@@ -311,19 +311,19 @@ ncclResult_t ncclNvmlGetCCStatus(struct ncclNvmlCCStatus *status) {
       status->CCEnabled = false;
 
     if (ccInfo.settingV12040.multiGpuMode == NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE)
-      status->multiGpuCCEnabled = true;
+      status->multiGpuProtectedPCIE = true;
     else
-      status->multiGpuCCEnabled = false;
+      status->multiGpuProtectedPCIE = false;
   } else if (pfn_nvmlSystemGetConfComputeState != NULL) {
     NVMLTRY(nvmlSystemGetConfComputeState, &ccInfo.settingV12020);
     if (ccInfo.settingV12020.ccFeature == NVML_CC_SYSTEM_FEATURE_ENABLED)
       status->CCEnabled = true;
     else
       status->CCEnabled = false;
-    status->multiGpuCCEnabled = false;
+    status->multiGpuProtectedPCIE = false;
   } else {
     status->CCEnabled = false;
-    status->multiGpuCCEnabled = false;
+    status->multiGpuProtectedPCIE = false;
   }
   return ncclSuccess;
 }
diff --git a/src/misc/profiler.cc b/src/misc/profiler.cc
index 9a4adf5795..c9fb2a869f 100644
--- a/src/misc/profiler.cc
+++ b/src/misc/profiler.cc
@@ -16,9 +16,110 @@ static pthread_mutex_t profilerLock = PTHREAD_MUTEX_INITIALIZER;
 static int profilerPluginRefCount;
 static void* profilerPluginLib;
 static ncclProfiler_t* ncclProfiler;
+static ncclProfiler_v2_t ncclProfiler_v1_as_v2;
+static ncclProfiler_v1_t* ncclProfiler_v1;
+
+static uint8_t ncclStringToFunc(const char* func) { // maps a function name string to the matching ncclFunc* value (exact, case-sensitive match)
+  if (0 == strcmp(func, "AllGather")) return ncclFuncAllGather;
+  if (0 == strcmp(func, "AllReduce")) return ncclFuncAllReduce;
+  if (0 == strcmp(func, "Broadcast")) return ncclFuncBroadcast;
+  if (0 == strcmp(func, "Recv")) return ncclFuncRecv;
+  if (0 == strcmp(func, "Reduce")) return ncclFuncReduce;
+  if (0 == strcmp(func, "ReduceScatter")) return ncclFuncReduceScatter;
+  if (0 == strcmp(func, "SendRecv")) return ncclFuncSendRecv;
+  return ncclFuncSend; // fallback for every other name; unmatched strings are not reported as errors
+}
+
+static uint8_t ncclStringToAlgo(const char* algo) { // maps an algorithm name string to the matching NCCL_ALGO_* value (exact match)
+  if (0 == strcmp(algo, "TREE")) return NCCL_ALGO_TREE;
+  if (0 == strcmp(algo, "RING")) return NCCL_ALGO_RING;
+  if (0 == strcmp(algo, "COLLNET_DIRECT")) return NCCL_ALGO_COLLNET_DIRECT;
+  if (0 == strcmp(algo, "COLLNET_CHAIN")) return NCCL_ALGO_COLLNET_CHAIN;
+  if (0 == strcmp(algo, "NVLS")) return NCCL_ALGO_NVLS;
+  if (0 == strcmp(algo, "NVLS_TREE")) return NCCL_ALGO_NVLS_TREE;
+  return NCCL_ALGO_PAT; // fallback for every other name, including unrecognized ones
+}
+
+static uint8_t ncclStringToProto(const char* proto) { // maps a protocol name string to the matching NCCL_PROTO_* value (exact match)
+  if (0 == strcmp(proto, "LL")) return NCCL_PROTO_LL;
+  if (0 == strcmp(proto, "LL128")) return NCCL_PROTO_LL128;
+  return NCCL_PROTO_SIMPLE; // fallback for every other name, including unrecognized ones
+}
+
+static uint8_t ncclStringToDatatype(const char* dt) { // maps a datatype name string to the matching ncclDataType_t enumerator (exact match)
+  if (0 == strcmp(dt, "ncclInt8")) return ncclInt8;
+  if (0 == strcmp(dt, "ncclInt32")) return ncclInt32;
+  if (0 == strcmp(dt, "ncclUint32")) return ncclUint32;
+  if (0 == strcmp(dt, "ncclInt64")) return ncclInt64;
+  if (0 == strcmp(dt, "ncclUint64")) return ncclUint64;
+  if (0 == strcmp(dt, "ncclFloat16")) return ncclFloat16;
+  if (0 == strcmp(dt, "ncclFloat32")) return ncclFloat32;
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+  if (0 == strcmp(dt, "ncclBfloat16")) return ncclBfloat16;
+#endif
+  return ncclFloat64; // NOTE(review): any unmatched name lands here; there is no "ncclUint8" branch - confirm v1 plugins never need it
+}
+
+static ncclResult_t ncclProfiler_v1_as_v2_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr) { // translates a v2 event descriptor into a v1 one and forwards it to the v1 plugin
+  ncclProfilerEventDescr_v1_t eDescr_v1 = { 0 }; // fields not copied below stay zero
+  eDescr_v1.type = eDescr->type;
+  eDescr_v1.parentObj = eDescr->parentObj;
+  eDescr_v1.rank = eDescr->rank;
+  switch(eDescr->type) {
+    case ncclProfileGroup: break; // only the common fields above are forwarded
+    case ncclProfileColl: {
+      eDescr_v1.coll.name = eDescr->coll.name;
+      eDescr_v1.coll.commHash = eDescr->coll.commHash;
+      eDescr_v1.coll.seqNumber = eDescr->coll.seqNumber;
+      eDescr_v1.coll.func = ncclStringToFunc(eDescr->coll.func); // v2 passes a string here; v1 receives the numeric value
+      eDescr_v1.coll.sendBuff = eDescr->coll.sendBuff;
+      eDescr_v1.coll.recvBuff = eDescr->coll.recvBuff;
+      eDescr_v1.coll.count = eDescr->coll.count;
+      eDescr_v1.coll.root = eDescr->coll.root;
+      eDescr_v1.coll.datatype = ncclStringToDatatype(eDescr->coll.datatype);
+      eDescr_v1.coll.op = 0; // removed in v2
+      eDescr_v1.coll.trafficBytes = eDescr->coll.trafficBytes;
+      eDescr_v1.coll.nMaxChannels = eDescr->coll.nMaxChannels;
+      eDescr_v1.coll.nWarps = eDescr->coll.nWarps;
+      eDescr_v1.coll.algo = ncclStringToAlgo(eDescr->coll.algo);
+      eDescr_v1.coll.proto = ncclStringToProto(eDescr->coll.proto);
+    } break;
+    case ncclProfileP2p: {
+      eDescr_v1.p2p.name = eDescr->p2p.name;
+      eDescr_v1.p2p.commHash = eDescr->p2p.commHash;
+      eDescr_v1.p2p.func = ncclStringToFunc(eDescr->p2p.func);
+      eDescr_v1.p2p.buff = eDescr->p2p.buff;
+      eDescr_v1.p2p.count = eDescr->p2p.count;
+      eDescr_v1.p2p.datatype = ncclStringToDatatype(eDescr->p2p.datatype);
+      eDescr_v1.p2p.peer = eDescr->p2p.peer;
+    } break;
+    case ncclProfileProxyOp: {
+      eDescr_v1.proxyOp.pid = eDescr->proxyOp.pid;
+      eDescr_v1.proxyOp.channelId = eDescr->proxyOp.channelId;
+      eDescr_v1.proxyOp.peer = eDescr->proxyOp.peer;
+      eDescr_v1.proxyOp.nSteps = eDescr->proxyOp.nSteps;
+      eDescr_v1.proxyOp.chunkSize = eDescr->proxyOp.chunkSize;
+      eDescr_v1.proxyOp.isSend = eDescr->proxyOp.isSend;
+    } break;
+    case ncclProfileProxyStep: {
+      eDescr_v1.proxyStep.step = eDescr->proxyStep.step;
+    } break;
+    case ncclProfileProxyCtrl: break;
+    default:; // unknown event types are forwarded with only the common fields set
+  }
+  return ncclProfiler_v1->startEvent(context, eHandle, &eDescr_v1); // the v1 plugin's status is returned as-is
+}
+
+static ncclResult_t ncclProfiler_v1_as_v2_init(void** context, int* eActivationMask) { // v2-shaped init for a v1 plugin: runs the v1 init and wires up the adapter's callbacks
+  ncclResult_t ret = ncclProfiler_v1->init(context, eActivationMask); // keep the plugin's status instead of discarding it
+  ncclProfiler_v1_as_v2.startEvent = ncclProfiler_v1_as_v2_startEvent; // startEvent goes through the v2-to-v1 descriptor translation
+  ncclProfiler_v1_as_v2.stopEvent = ncclProfiler_v1->stopEvent; // the remaining callbacks are forwarded to the v1 plugin unchanged
+  ncclProfiler_v1_as_v2.recordEventState = ncclProfiler_v1->recordEventState;
+  ncclProfiler_v1_as_v2.finalize = ncclProfiler_v1->finalize;
+  return ret; // a failing v1 init is reported to the caller rather than masked as ncclSuccess
+}
 
 #define MAX_STR_LEN 256
-#define NCCL_PROFILER_PLUGIN_SYMBOL "ncclProfiler_v1"
 
 static void* tryOpenLib(char* name, int *err, char* errStr) {
   if (nullptr == name || strlen(name) == 0) {
@@ -33,7 +134,7 @@ static void* tryOpenLib(char* name, int *err, char* errStr) {
   if (nullptr == handle) {
     strncpy(errStr, dlerror(), MAX_STR_LEN);
     errStr[MAX_STR_LEN] = 0;
-    if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) {
+    if (name && strstr(errStr, name) && strstr(errStr, "No such file or directory")) {
       *err = ENOENT;
     }
   }
@@ -116,10 +217,21 @@ static ncclResult_t ncclProfilerPluginLoad(void) {
     goto fail;
   }
 
-  ncclProfiler = (ncclProfiler_t*)dlsym(profilerPluginLib, NCCL_PROFILER_PLUGIN_SYMBOL);
+  ncclProfiler = (ncclProfiler_v2_t*)dlsym(profilerPluginLib, "ncclProfiler_v2");
   if (ncclProfiler == nullptr) {
-    INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find " NCCL_PROFILER_PLUGIN_SYMBOL ".");
-    goto fail;
+    INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v2.");
+    ncclProfiler_v1 = (ncclProfiler_v1_t*)dlsym(profilerPluginLib, "ncclProfiler_v1");
+    if (ncclProfiler_v1 == nullptr) {
+      INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v1.");
+      goto fail;
+    } else {
+      ncclProfiler = &ncclProfiler_v1_as_v2;
+      ncclProfiler_v1_as_v2.name = ncclProfiler_v1->name;
+      ncclProfiler_v1_as_v2.init = ncclProfiler_v1_as_v2_init;
+      INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded ncclProfiler_v1.");
+    }
+  } else {
+    INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded ncclProfiler_v2.");
   }
 
   ++profilerPluginRefCount;
@@ -247,7 +359,7 @@ ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan) {
   eActivationMaskGroup = __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED);
   if (__builtin_expect(ncclProfiler != NULL, 0)) {
     if (eActivationMaskGroup & (ncclProfileColl | ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep)) {
-      ncclProfilerEventDescr_v1_t eDescr = { 0 };
+      ncclProfilerEventDescr_t eDescr = { 0 };
       eDescr.type = ncclProfileGroup;
       ncclProfiler->startEvent(plan->comm->profilerContext, &plan->groupEventHandle, &eDescr);
     }
@@ -279,20 +391,17 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) {
         eDescr.coll.name = plan->comm->commName;
         eDescr.coll.commHash = plan->comm->commHash;
         eDescr.coll.seqNumber = plan->comm->seqNumber[ct->func]++;
-        eDescr.coll.func = ct->func;
+        eDescr.coll.func = ncclFuncToString(ct->func);
         eDescr.coll.sendBuff = ct->sendbuff;
         eDescr.coll.recvBuff = ct->recvbuff;
         eDescr.coll.count = ct->count;
         eDescr.coll.root = ct->root;
-        eDescr.coll.datatype = ct->datatype;
-        eDescr.coll.op = ct->opHost;
+        eDescr.coll.datatype = ncclDatatypeToString(ct->datatype);
         eDescr.coll.trafficBytes = ct->trafficBytes;
         eDescr.coll.nMaxChannels = ct->nMaxChannels;
         eDescr.coll.nWarps = ct->nWarps;
-        eDescr.coll.algo = ct->algorithm;
-        eDescr.coll.proto = ct->protocol;
-        eDescr.coll.isCollnet = ct->isCollnet;
-        eDescr.coll.isNvls = ct->isNvls;
+        eDescr.coll.algo = ncclAlgoToString(ct->algorithm);
+        eDescr.coll.proto = ncclProtoToString(ct->protocol);
         ncclProfiler->startEvent(plan->comm->profilerContext, &ct->eventHandle, &eDescr);
 
         // update collective task with group event activation mask
@@ -307,10 +416,10 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) {
         eDescr.rank = plan->comm->rank;
         eDescr.p2p.name = plan->comm->commName;
         eDescr.p2p.commHash = plan->comm->commHash;
-        eDescr.p2p.func = pt->func;
+        eDescr.p2p.func = ncclFuncToString(pt->func);
         eDescr.p2p.buff = pt->buff;
         eDescr.p2p.count = pt->count;
-        eDescr.p2p.datatype = pt->datatype;
+        eDescr.p2p.datatype = ncclDatatypeToString(pt->datatype);
         eDescr.p2p.peer = pt->root;
         ncclProfiler->startEvent(plan->comm->profilerContext, &pt->eventHandle, &eDescr);
 
@@ -345,6 +454,11 @@ ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan) {
   return ncclSuccess;
 }
 
+// Below we set the proxy descriptor step number to DIVUP(step, args->sliceSteps).
+// The reason is that for some ncclOp (e.g. AllReduce) one network transfer is
+// made of sliceSteps steps rather than one step. In the profiler we are still
+// interested in whole network transfers though, so we account for this when
+// computing the actual network step number.
 ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args) {
   TIME_START_EVENT(proxyOpStart);
   struct ncclProxySubArgs* sub = &args->subs[s];
@@ -354,13 +468,13 @@ ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args
       eDescr.type = ncclProfileProxyOp;
       eDescr.parentObj = sub->taskEventHandle;
       eDescr.rank = sub->rank;
-      eDescr.proxyOp.pid = args->pid;
+      eDescr.proxyOp.pid = sub->pid;
       eDescr.proxyOp.channelId = sub->channelId;
       eDescr.proxyOp.peer = sub->peer;
-      eDescr.proxyOp.nSteps = sub->nsteps;
-      eDescr.proxyOp.chunkSize = args->chunkSize;
+      eDescr.proxyOp.nSteps = DIVUP(sub->nsteps, args->sliceSteps);
+      eDescr.proxyOp.chunkSize = args->chunkSize * args->sliceSteps;
       eDescr.proxyOp.isSend = 1;
-      ncclProfiler->startEvent(args->profilerContext, &sub->opEventHandle, &eDescr);
+      ncclProfiler->startEvent(sub->profilerContext, &sub->opEventHandle, &eDescr);
     }
   }
   TIME_STOP_EVENT(proxyOpStart);
@@ -376,13 +490,13 @@ ncclResult_t ncclProfilerStartRecvProxyOpEvent(int s, struct ncclProxyArgs* args
       eDescr.type = ncclProfileProxyOp;
       eDescr.parentObj = sub->taskEventHandle;
       eDescr.rank = sub->rank;
-      eDescr.proxyOp.pid = args->pid;
+      eDescr.proxyOp.pid = sub->pid;
       eDescr.proxyOp.channelId = sub->channelId;
       eDescr.proxyOp.peer = sub->peer;
-      eDescr.proxyOp.nSteps = sub->nsteps;
-      eDescr.proxyOp.chunkSize = args->chunkSize;
+      eDescr.proxyOp.nSteps = DIVUP(sub->nsteps, args->sliceSteps);
+      eDescr.proxyOp.chunkSize = args->chunkSize * args->sliceSteps;
       eDescr.proxyOp.isSend = 0;
-      ncclProfiler->startEvent(args->profilerContext, &sub->opEventHandle, &eDescr);
+      ncclProfiler->startEvent(sub->profilerContext, &sub->opEventHandle, &eDescr);
     }
   }
   TIME_STOP_EVENT(proxyOpStart);
@@ -400,53 +514,50 @@ ncclResult_t ncclProfilerStopProxyOpEvent(int s, struct ncclProxyArgs* args) {
   return ncclSuccess;
 }
 
-ncclResult_t ncclProfilerStartSendProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) {
+ncclResult_t ncclProfilerStartSendProxyStepEvent(int s, struct ncclProxyArgs* args, int stepId) { // start one proxy-step profiler event for sub-op args->subs[s]
   TIME_START_EVENT(proxyStepStart);
   struct ncclProxySubArgs* sub = &args->subs[s];
   if (__builtin_expect(ncclProfiler != NULL, 0)) {
     if (sub->opEventHandle && (sub->eActivationMask & ncclProfileProxyStep)) {
-      for (uint64_t step = stepLo; step < stepHi; step++) {
-        ncclProfilerEventDescr_t eDescr = { 0 };
-        eDescr.type = ncclProfileProxyStep;
-        eDescr.parentObj = sub->opEventHandle;
-        eDescr.rank = sub->rank;
-        eDescr.proxyStep.step = step;
-        ncclProfiler->startEvent(args->profilerContext, &sub->stepEventHandles[step%NCCL_STEPS], &eDescr);
-      }
+      int step_ = DIVUP(stepId, args->sliceSteps); // stepId expressed in units of sliceSteps, rounded up
+      ncclProfilerEventDescr_t eDescr = { 0 };
+      eDescr.type = ncclProfileProxyStep;
+      eDescr.parentObj = sub->opEventHandle; // the step event is a child of this sub-op's proxy-op event
+      eDescr.rank = sub->rank;
+      eDescr.proxyStep.step = step_;
+      ncclProfiler->startEvent(sub->profilerContext, &sub->stepEventHandles[step_%NCCL_STEPS], &eDescr); // handle is stored in slot step_ % NCCL_STEPS
     }
   }
   TIME_STOP_EVENT(proxyStepStart);
   return ncclSuccess;
 }
 
-ncclResult_t ncclProfilerStartRecvProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) {
+ncclResult_t ncclProfilerStartRecvProxyStepEvent(int s, struct ncclProxyArgs* args, int stepId) { // start one proxy-step profiler event for sub-op args->subs[s]
   TIME_START_EVENT(proxyStepStart);
   struct ncclProxySubArgs* sub = &args->subs[s];
   if (__builtin_expect(ncclProfiler != NULL, 0)) {
     if (sub->opEventHandle && (sub->eActivationMask & ncclProfileProxyStep)) {
-      for (uint64_t step = stepLo; step < stepHi; step++) {
-        ncclProfilerEventDescr_t eDescr = { 0 };
-        eDescr.type = ncclProfileProxyStep;
-        eDescr.parentObj = sub->opEventHandle;
-        eDescr.rank = sub->rank;
-        eDescr.proxyStep.step = step;
-        ncclProfiler->startEvent(args->profilerContext, &sub->stepEventHandles[step%NCCL_STEPS], &eDescr);
-      }
+      int step_ = DIVUP(stepId, args->sliceSteps); // stepId expressed in units of sliceSteps, rounded up
+      ncclProfilerEventDescr_t eDescr = { 0 };
+      eDescr.type = ncclProfileProxyStep;
+      eDescr.parentObj = sub->opEventHandle; // the step event is a child of this sub-op's proxy-op event
+      eDescr.rank = sub->rank;
+      eDescr.proxyStep.step = step_;
+      ncclProfiler->startEvent(sub->profilerContext, &sub->stepEventHandles[step_%NCCL_STEPS], &eDescr); // handle is stored in slot step_ % NCCL_STEPS
     }
   }
   TIME_STOP_EVENT(proxyStepStart);
   return ncclSuccess;
 }
 
-ncclResult_t ncclProfilerStopProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) {
+ncclResult_t ncclProfilerStopProxyStepEvent(int s, struct ncclProxyArgs* args, int stepId) {
   TIME_START_EVENT(proxyStepStop);
   struct ncclProxySubArgs* sub = &args->subs[s];
   if (__builtin_expect(ncclProfiler != NULL, 0)) {
-    for (uint64_t step = stepLo; step < stepHi; step++) {
-      if (sub->stepEventHandles[step%NCCL_STEPS]) {
-        ncclProfiler->stopEvent(sub->stepEventHandles[step%NCCL_STEPS]);
-        sub->stepEventHandles[step%NCCL_STEPS] = NULL;
-      }
+    int step_ = DIVUP(stepId, args->sliceSteps);
+    if (sub->stepEventHandles[step_%NCCL_STEPS]) {
+      ncclProfiler->stopEvent(sub->stepEventHandles[step_%NCCL_STEPS]);
+      sub->stepEventHandles[step_%NCCL_STEPS] = NULL;
     }
   }
   TIME_STOP_EVENT(proxyStepStop);
@@ -484,8 +595,8 @@ ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* ar
   TIME_START_EVENT(proxyOpRecord);
   struct ncclProxySubArgs* sub = &args->subs[s];
   if (__builtin_expect(ncclProfiler != NULL, 0) && sub->opEventHandle) {
-    ncclProfilerEventStateArgs_t a = { 0 };
-    a.proxyOp.steps = steps;
+    ncclProfilerEventStateArgs_t a = { };
+    a.proxyOp.steps = DIVUP(steps, args->sliceSteps);
     a.proxyOp.transSize = transSize;
     ncclProfiler->recordEventState(sub->opEventHandle, eState, &a);
   }
@@ -493,14 +604,13 @@ ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* ar
   return ncclSuccess;
 }
 
-ncclResult_t ncclProfilerRecordProxyStepEventStates(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi, ncclProfilerEventState_t eState) {
+ncclResult_t ncclProfilerRecordProxyStepEventState(int s, struct ncclProxyArgs* args, int stepId, ncclProfilerEventState_t eState) {
   TIME_START_EVENT(proxyStepRecord);
   struct ncclProxySubArgs* sub = &args->subs[s];
   if (__builtin_expect(ncclProfiler != NULL, 0) && sub->opEventHandle) {
-    for (uint64_t step = stepLo; step < stepHi; step++) {
-      if (sub->stepEventHandles[step%NCCL_STEPS]) {
-        ncclProfiler->recordEventState(sub->stepEventHandles[step%NCCL_STEPS], eState, 0);
-      }
+    int step_ = DIVUP(stepId, args->sliceSteps);
+    if (sub->stepEventHandles[step_%NCCL_STEPS]) {
+      ncclProfiler->recordEventState(sub->stepEventHandles[step_%NCCL_STEPS], eState, 0);
     }
   }
   TIME_STOP_EVENT(proxyStepRecord);
@@ -510,7 +620,7 @@ ncclResult_t ncclProfilerRecordProxyStepEventStates(int s, struct ncclProxyArgs*
 ncclResult_t ncclProfilerRecordProxyCtrlEventState(void* eHandle, int appended, ncclProfilerEventState_t eState) {
   TIME_START_EVENT(proxyCtrlRecord);
   if (__builtin_expect(ncclProfiler != NULL, 0) && eHandle && __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED) & ncclProfileProxyCtrl) {
-    ncclProfilerEventStateArgs_t args = { 0 };
+    ncclProfilerEventStateArgs_t args = { };
     args.proxyCtrl.appendedProxyOps = appended;
     ncclProfiler->recordEventState(eHandle, eState, &args);
   }
diff --git a/src/misc/shmutils.cc b/src/misc/shmutils.cc
index daf3b338db..eb9cd10156 100644
--- a/src/misc/shmutils.cc
+++ b/src/misc/shmutils.cc
@@ -45,7 +45,7 @@ static void shmHandleInit(int fd, char* shmPath, size_t shmSize, size_t realShmS
   return;
 }
 
-ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle) {
+ncclResult_t ncclShmOpen(char* shmPath, size_t shmPathSize, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle) {
   int fd = -1;
   char* hptr = NULL;
   void* dptr = NULL;
@@ -62,7 +62,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
      * refcount references; when the peer attaches, it should pass -1 to reduce one reference count. When it
      * goes down to 0, unlink should be called in order to delete shared memory file. */
     if (shmPath[0] == '\0') {
-      sprintf(shmPath, "/dev/shm/nccl-XXXXXX");
+      snprintf(shmPath, shmPathSize, "/dev/shm/nccl-XXXXXX");
     retry_mkstemp:
       fd = mkstemp(shmPath);
       if (fd < 0) {
@@ -70,7 +70,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
           INFO(NCCL_ALL, "mkstemp: Failed to create %s, error: %s (%d) - retrying", shmPath, strerror(errno), errno);
           goto retry_mkstemp;
         }
-        WARN("Error: failed to create shared memory file %p, error %s (%d)", shmPath, strerror(errno), errno);
+        WARN("Error: failed to create shared memory file %s, error %s (%d)", shmPath, strerror(errno), errno);
         ret = ncclSystemError;
         goto fail;
       }
diff --git a/src/misc/socket.cc b/src/misc/socket.cc
index 93e577e05d..dfb4e6888a 100644
--- a/src/misc/socket.cc
+++ b/src/misc/socket.cc
@@ -12,6 +12,18 @@
 #include <ifaddrs.h>
 #include <net/if.h>
 #include "param.h"
+#include <time.h>
+
+NCCL_PARAM(RetryCnt, "SOCKET_RETRY_CNT", 34); // retry limit for retryable accept()/connect() errors (default 34)
+NCCL_PARAM(RetryTimeOut, "SOCKET_RETRY_SLEEP_MSEC", 100); // base sleep between connect retries, in msec; multiplied by the retry number
+static void msleep(unsigned int time_msec) { // sleep for time_msec milliseconds using nanosleep()
+  const long c_1e6 = 1e6; // nanoseconds per millisecond
+  struct timespec tv = (struct timespec){
+      .tv_sec = time_msec / 1000, // whole seconds
+      .tv_nsec = (time_msec % 1000) * c_1e6, // leftover milliseconds converted to nanoseconds
+  };
+  nanosleep(&tv, NULL); // return value ignored and no remainder requested: an interrupted sleep is not resumed
+}
 
 static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) {
   int bytes = 0;
@@ -26,8 +38,13 @@ static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr
       return ncclSuccess;
     }
     if (bytes == -1) {
+      if ((op == NCCL_SOCKET_SEND && errno == EPIPE) || (op == NCCL_SOCKET_RECV && errno == ECONNRESET)) {
+        *closed = 1;
+        return ncclSuccess;
+      }
       if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
-        WARN("socketProgressOpt: Call to recv from %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno));
+        WARN("socketProgressOpt: Call to %s %s failed : %s", (op == NCCL_SOCKET_RECV ? "recv from" : "send to"),
+             ncclSocketToString(&sock->addr, line), strerror(errno));
         return ncclRemoteError;
       } else {
         bytes = 0;
@@ -38,17 +55,22 @@ static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr
       INFO(NCCL_NET, "socketProgressOpt: abort called");
       return ncclInternalError;
     }
-  } while (bytes > 0 && (*offset) < size);
+  } while (sock->asyncFlag == 0 && bytes > 0 && (*offset) < size);
   return ncclSuccess;
 }
 
-static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
+static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int* pclosed = NULL) { // one non-blocking progress attempt; optional pclosed reports a closed peer instead of failing
   int closed;
   NCCLCHECK(socketProgressOpt(op, sock, ptr, size, offset, 0 /*block*/, &closed));
   if (closed) {
-    char line[SOCKET_NAME_MAXLEN+1];
-    WARN("socketProgress: Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0));
-    return ncclRemoteError;
+    if (pclosed) { // caller asked to be told about the closed connection
+      *pclosed = closed; // NOTE: only written on this path; *pclosed is left untouched when the peer did not close
+      return ncclSuccess;
+    } else { // no out-parameter: a closed connection is reported as a remote error
+      char line[SOCKET_NAME_MAXLEN+1];
+      WARN("socketProgress: Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0));
+      return ncclRemoteError;
+    }
   }
   return ncclSuccess;
 }
@@ -63,9 +85,9 @@ static ncclResult_t socketWait(int op, struct ncclSocket* sock, void* ptr, int s
  *
  * Output: "IPv4/IPv6 address<port>"
  */
-const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) {
+const char *ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) {
   if (buf == NULL || addr == NULL) return NULL;
-  struct sockaddr *saddr = &addr->sa;
+  const struct sockaddr *saddr = &addr->sa;
   if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; }
   char host[NI_MAXHOST], service[NI_MAXSERV];
   /* NI_NUMERICHOST: If set, then the numeric form of the hostname is returned.
@@ -370,10 +392,9 @@ ncclResult_t ncclSocketListen(struct ncclSocket* sock) {
   if (socketToPort(&sock->addr)) {
     // Port is forced by env. Make sure we get the port.
     int opt = 1;
-#if defined(SO_REUSEPORT)
-    SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
-#else
     SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt");
+#if defined(SO_REUSEPORT)
+    SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
 #endif
   }
 
@@ -412,6 +433,15 @@ static ncclResult_t socketTryAccept(struct ncclSocket* sock) {
   sock->fd = accept(sock->acceptFd, (struct sockaddr*)&sock->addr, &socklen);
   if (sock->fd != -1) {
     sock->state = ncclSocketStateAccepted;
+  } else if (errno == ENETDOWN || errno == EPROTO || errno == ENOPROTOOPT || errno == EHOSTDOWN ||
+             errno == ENONET || errno == EHOSTUNREACH || errno == EOPNOTSUPP || errno == ENETUNREACH) {
+    /* Per accept(2), on Linux these errors may be network errors already pending on the new socket
+     * and should be treated like EAGAIN. A retry count bounds the loop in case they persist. */
+    if (++sock->errorRetries == ncclParamRetryCnt()) {
+      WARN("socketTryAccept: exceeded error retry count (%d), %s", sock->errorRetries, strerror(errno));
+      return ncclSystemError;
+    }
+    INFO(NCCL_ALL, "Call to accept returned %s, retrying", strerror(errno));
   } else if (errno != EAGAIN && errno != EWOULDBLOCK) {
     WARN("socketTryAccept: Accept failed: %s", strerror(errno));
     return ncclSystemError;
@@ -419,72 +449,118 @@ static ncclResult_t socketTryAccept(struct ncclSocket* sock) {
   return ncclSuccess;
 }
 
+static ncclResult_t socketSetFlags(struct ncclSocket* sock) { // apply O_NONBLOCK (when needed) and TCP_NODELAY to sock->fd
+  const int one = 1; // option value used to enable TCP_NODELAY
+  /* Set socket as non-blocking if async or if we need to be able to abort */
+  if ((sock->asyncFlag || sock->abortFlag) && sock->fd >= 0) {
+    int flags;
+    SYSCHECK(flags = fcntl(sock->fd, F_GETFL), "fcntl"); // read the current flags so only O_NONBLOCK is added
+    SYSCHECK(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
+  }
+  SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt"); // NOTE(review): unlike the fcntl calls above, not guarded by sock->fd >= 0
+  return ncclSuccess;
+}
+
 static ncclResult_t socketFinalizeAccept(struct ncclSocket* sock) {
   uint64_t magic;
   enum ncclSocketType type;
-  int received = 0;
-  const int one = 1;
-  SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
+  int received; // bytes received so far for the field being read (magic first, then type)
+  // once accepted, linux sockets do NOT inherit file status flags such as O_NONBLOCK (BSD ones do)
+  NCCLCHECK(socketSetFlags(sock));
 
-  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received));
-  if (received == 0) return ncclSuccess;
-  NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received));
-  if (magic != sock->magic) {
-    WARN("socketFinalizeAccept: wrong magic %lx != %lx", magic, sock->magic);
-    close(sock->fd);
-    sock->fd = -1;
-    // Ignore spurious connection and accept again
-    sock->state = ncclSocketStateAccepting;
-    return ncclSuccess;
-  } else {
-    received = 0;
-    NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &type, sizeof(type), &received));
-    if (type != sock->type) {
-      WARN("socketFinalizeAccept: wrong type %d != %d", type, sock->type);
-      sock->state = ncclSocketStateError;
+  if (sock->asyncFlag == 0 || sock->finalizeCounter < sizeof(magic)) { // magic not fully read yet (always true for blocking sockets)
+    if (sock->asyncFlag == 0) {
+      received = 0;
+      NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received));
+    } else {
+      received = sock->finalizeCounter; // resume a partial read left by a previous call
+      NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, sock, sock->finalizeBuffer, sizeof(magic), &received));
+      sock->finalizeCounter = received;
+      if (received < sizeof(magic)) return ncclSuccess; // incomplete: return with the state unchanged
+      memcpy(&magic, sock->finalizeBuffer, sizeof(magic));
+    }
+    if (magic != sock->magic) {
+      WARN("socketFinalizeAccept: wrong magic %lx != %lx", magic, sock->magic);
       close(sock->fd);
       sock->fd = -1;
-      return ncclInternalError;
-    } else {
-      sock->state = ncclSocketStateReady;
+      // Ignore spurious connection and accept again
+      sock->state = ncclSocketStateAccepting; sock->finalizeCounter = 0; // reset so the next connection's magic is read and checked from byte 0
+      return ncclSuccess;
     }
   }
+  if (sock->asyncFlag == 0) {
+    received = 0;
+    NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &type, sizeof(type), &received));
+  } else {
+    received = sock->finalizeCounter - sizeof(magic); // finalizeCounter counts magic bytes plus type bytes
+    NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, sock, sock->finalizeBuffer, sizeof(type), &received));
+    sock->finalizeCounter = received + sizeof(magic);
+    if (received < sizeof(type)) return ncclSuccess; // incomplete: return with the state unchanged
+    memcpy(&type, sock->finalizeBuffer, sizeof(type));
+  }
+  if (type != sock->type) {
+    WARN("socketFinalizeAccept: wrong type %d != %d", type, sock->type);
+    sock->state = ncclSocketStateError;
+    close(sock->fd);
+    sock->fd = -1;
+    return ncclInternalError;
+  } else {
+    sock->state = ncclSocketStateReady;
+  }
   return ncclSuccess;
 }
 
-static ncclResult_t socketStartConnect(struct ncclSocket* sock) {
-  /* blocking/non-blocking connect() is determined by asyncFlag. */
-  int ret = connect(sock->fd, &sock->addr.sa, sock->salen);
-
-  if (ret == 0) {
+static ncclResult_t socketResetFd(struct ncclSocket* sock) { // give sock a fresh TCP socket, keeping the same fd number if it already had one
+  ncclResult_t ret = ncclSuccess;
+  int fd = -1; // the newly created socket; -1 until socket() succeeds
+  SYSCHECKGOTO(fd = socket(sock->addr.sa.sa_family, SOCK_STREAM, 0), "socket", ret, cleanup);
+  // if sock->fd is valid, close it and reuse its number
+  if (sock->fd != -1) {
+    SYSCHECKGOTO(dup2(fd, sock->fd), "dup2", ret, cleanup); // dup2 closes the old sock->fd and points that number at the new socket
+    SYSCHECKGOTO(close(fd), "close", ret, cleanup); // NOTE(review): presumably jumps to cleanup on failure, which would close fd a second time
+  } else {
+    sock->fd = fd;
+  }
+  NCCLCHECKGOTO(socketSetFlags(sock), ret, exit); // re-apply O_NONBLOCK / TCP_NODELAY on the new socket
+exit:
+  return ret;
+cleanup:
+  // cleanup fd, leave sock->fd untouched
+  if (fd != -1) {
+    (void)close(fd);
+  }
+  goto exit;
+}
+static ncclResult_t socketConnectCheck(struct ncclSocket* sock, int errCode, const char funcName[]) { // map a connect() result (0 or an errno value) to the next socket state; funcName prefixes log messages
+  if (errCode == 0) {
     sock->state = ncclSocketStateConnected;
-    return ncclSuccess;
-  } else if (errno == EINPROGRESS) {
+  } else if (errCode == EINPROGRESS) {
     sock->state = ncclSocketStateConnectPolling;
-    return ncclSuccess;
-  } else if (errno == ECONNREFUSED) {
-    if (++sock->refusedRetries == RETRY_REFUSED_TIMES) {
-      sock->state = ncclSocketStateError;
-      WARN("socketStartConnect: exceeded retries (%d)", sock->refusedRetries);
-      return ncclRemoteError;
+  } else if (errCode == ETIMEDOUT || errCode == EHOSTUNREACH || errCode == ECONNREFUSED) { // retryable errors
+    if (sock->customRetry == 0) { // built-in retry accounting and sleep are skipped when customRetry is set
+      if (sock->errorRetries++ == ncclParamRetryCnt()) { // post-increment: gives up on failure number RetryCnt+1
+        sock->state = ncclSocketStateError;
+        WARN("%s: connect returned %s, exceeded error retry count (%d)", funcName, strerror(errCode), sock->errorRetries);
+        return ncclRemoteError;
+      }
+      unsigned int sleepTime = sock->errorRetries * ncclParamRetryTimeOut(); // linear back-off: grows with each failed attempt
+      INFO(NCCL_ALL, "%s: connect returned %s, retrying (%d/%ld) after sleep for %u msec", funcName, strerror(errCode), sock->errorRetries, ncclParamRetryCnt(), sleepTime);
+      msleep(sleepTime);
     }
-    usleep(SLEEP_INT);
-    if (sock->refusedRetries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno));
-    return ncclSuccess;
-  } else if (errno == ETIMEDOUT) {
-    if (++sock->timedOutRetries == RETRY_TIMEDOUT_TIMES) {
-      sock->state = ncclSocketStateError;
-      WARN("socketStartConnect: exceeded timeouts (%d)", sock->timedOutRetries);
-      return ncclRemoteError;
-    }
-    usleep(SLEEP_INT);
-    return ncclSuccess;
+    NCCLCHECK(socketResetFd(sock)); /* in case of failure in connect, socket state is unspecified */
+    sock->state = ncclSocketStateConnecting; // back to the connecting state with the fresh socket
   } else {
     char line[SOCKET_NAME_MAXLEN+1];
     sock->state = ncclSocketStateError;
-    WARN("socketStartConnect: Connect to %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno));
+    WARN("%s: Connect to %s failed : %s", funcName, ncclSocketToString(&sock->addr, line), strerror(errCode));
     return ncclSystemError;
   }
+  return ncclSuccess;
+}
+static ncclResult_t socketStartConnect(struct ncclSocket* sock) { // issue connect() and let socketConnectCheck pick the next state
+  /* blocking/non-blocking connect() is determined by asyncFlag. */
+  int ret = connect(sock->fd, &sock->addr.sa, sock->salen);
+  return socketConnectCheck(sock, (ret == -1) ? errno : 0, __func__); // errno is read right after connect(), before any other call can overwrite it
 }
 
 static ncclResult_t socketPollConnect(struct ncclSocket* sock) {
@@ -509,33 +585,7 @@ static ncclResult_t socketPollConnect(struct ncclSocket* sock) {
 
   /* check socket status */
   SYSCHECK(getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void*)&ret, &rlen), "getsockopt");
-
-  if (ret == 0) {
-    sock->state = ncclSocketStateConnected;
-  } else if (ret == ECONNREFUSED) {
-    if (++sock->refusedRetries == RETRY_REFUSED_TIMES) {
-      sock->state = ncclSocketStateError;
-      WARN("socketPollConnect: exceeded retries (%d)", sock->refusedRetries);
-      return ncclRemoteError;
-    }
-    if (sock->refusedRetries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno));
-    usleep(SLEEP_INT);
-    sock->state = ncclSocketStateConnecting;
-  } else if (ret == ETIMEDOUT) {
-    if (++sock->timedOutRetries == RETRY_TIMEDOUT_TIMES) {
-      sock->state = ncclSocketStateError;
-      WARN("socketPollConnect: exceeded timeouts (%d)", sock->timedOutRetries);
-      return ncclRemoteError;
-    }
-    usleep(SLEEP_INT);
-    sock->state = ncclSocketStateConnecting;
-  } else if (ret != EINPROGRESS) {
-    sock->state = ncclSocketStateError;
-    char line[SOCKET_NAME_MAXLEN+1];
-    WARN("socketPollConnect: Connect to %s returned %d(%s) errno %d(%s)", ncclSocketToString(&sock->addr, line), ret, strerror(ret), errno, strerror(errno));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
+  return socketConnectCheck(sock, ret, __func__);
 }
 
 ncclResult_t ncclSocketPollConnect(struct ncclSocket* sock) {
@@ -548,12 +598,24 @@ ncclResult_t ncclSocketPollConnect(struct ncclSocket* sock) {
 }
 
 static ncclResult_t socketFinalizeConnect(struct ncclSocket* sock) {
-  int sent = 0;
-  NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent));
-  if (sent == 0) return ncclSuccess;
-  NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent));
-  sent = 0;
-  NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent));
+  int sent; // bytes sent so far for the field being written (magic first, then type)
+  if (sock->asyncFlag == 0) { // blocking socket: send both fields completely in this call
+    sent = 0;
+    NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent));
+    sent = 0;
+    NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent));
+  } else { // async socket: make partial progress, finalizeCounter remembers how far we got
+    if (sock->finalizeCounter < sizeof(sock->magic)) {
+      sent = sock->finalizeCounter; // resume a partial send left by a previous call
+      NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent));
+      sock->finalizeCounter = sent;
+      if (sent < sizeof(sock->magic)) return ncclSuccess; // incomplete: return with the state unchanged
+    }
+    sent = sock->finalizeCounter - sizeof(sock->magic); // finalizeCounter counts magic bytes plus type bytes
+    NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent));
+    sock->finalizeCounter = sent + sizeof(sock->magic);
+    if (sent < sizeof(sock->type)) return ncclSuccess; // incomplete: return with the state unchanged
+  }
   sock->state = ncclSocketStateReady;
   return ncclSuccess;
 }
@@ -598,7 +660,6 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock) {
 #ifdef ENABLE_TRACE
   char line[SOCKET_NAME_MAXLEN+1];
 #endif
-  const int one = 1;
 
   if (sock == NULL) {
     WARN("ncclSocketConnect: pass NULL socket");
@@ -616,9 +677,8 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock) {
   }
   TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", ncclSocketToString(&sock->addr, line));
 
-  SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
-
   sock->state = ncclSocketStateConnecting;
+  sock->finalizeCounter = 0;
   do {
     NCCLCHECK(socketProgressState(sock));
   } while (sock->asyncFlag == 0 &&
@@ -664,6 +724,7 @@ ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listen
     memcpy(sock, listenSock, sizeof(struct ncclSocket));
     sock->acceptFd = listenSock->fd;
     sock->state = ncclSocketStateAccepting;
+    sock->finalizeCounter = 0;
   }
 
   do {
@@ -694,12 +755,11 @@ exit:
   return ret;
 }
 
-ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr, uint64_t magic, enum ncclSocketType type, volatile uint32_t* abortFlag, int asyncFlag) {
+ncclResult_t ncclSocketInit(struct ncclSocket* sock, const union ncclSocketAddress* addr, uint64_t magic, enum ncclSocketType type, volatile uint32_t* abortFlag, int asyncFlag, int customRetry) {
   ncclResult_t ret = ncclSuccess;
 
   if (sock == NULL) goto exit;
-  sock->timedOutRetries = 0;
-  sock->refusedRetries = 0;
+  sock->errorRetries = 0;
   sock->abortFlag = abortFlag;
   sock->asyncFlag = asyncFlag;
   sock->state = ncclSocketStateInitialized;
@@ -707,6 +767,7 @@ ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* ad
   sock->type = type;
   sock->fd = -1;
   sock->acceptFd = -1;
+  sock->customRetry = customRetry;
 
   if (addr) {
     /* IPv4/IPv6 support */
@@ -718,28 +779,14 @@ ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* ad
       WARN("ncclSocketInit: connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)",
           ncclSocketToString(&sock->addr, line), family, AF_INET, AF_INET6);
       ret = ncclInternalError;
-      goto fail;
+      goto exit;
     }
     sock->salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
-
-    /* Connect to a hostname / port */
-    sock->fd = socket(family, SOCK_STREAM, 0);
-    if (sock->fd == -1) {
-      WARN("ncclSocketInit: Socket creation failed : %s", strerror(errno));
-      ret = ncclSystemError;
-      goto fail;
-    }
+    // in case of error, we close the fd before returning as it's unclear if the caller has to use ncclSocketClose for cleanup
+    NCCLCHECKGOTO(socketResetFd(sock), ret, fail);
   } else {
     memset(&sock->addr, 0, sizeof(union ncclSocketAddress));
   }
-
-  /* Set socket as non-blocking if async or if we need to be able to abort */
-  if ((sock->asyncFlag || sock->abortFlag) && sock->fd >= 0) {
-    int flags;
-    SYSCHECKGOTO(flags = fcntl(sock->fd, F_GETFL), "fcntl", ret, fail);
-    SYSCHECKGOTO(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), "fcntl", ret, fail);
-  }
-
 exit:
   return ret;
 fail:
@@ -750,12 +797,12 @@ fail:
   goto exit;
 }
 
-ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
+ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int* closed) { // NULL-checked public wrapper around socketProgress
   if (sock == NULL) {
     WARN("ncclSocketProgress: pass NULL socket");
     return ncclInvalidArgument;
   }
-  NCCLCHECK(socketProgress(op, sock, ptr, size, offset));
+  NCCLCHECK(socketProgress(op, sock, ptr, size, offset, closed)); // closed is forwarded as socketProgress's optional pclosed out-parameter
   return ncclSuccess;
 }
 
@@ -788,7 +835,7 @@ ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size) {
     WARN("ncclSocketRecv: pass NULL socket");
     return ncclInvalidArgument;
   }
-  if (sock->state != ncclSocketStateReady) {
+  if (sock->state != ncclSocketStateReady && sock->state != ncclSocketStateTerminating) {
     WARN("ncclSocketRecv: socket state (%d) is not ready", sock->state);
     return ncclInternalError;
   }
@@ -802,7 +849,8 @@ ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int
     WARN("ncclSocketSendRecv: invalid socket %p/%p", sendSock, recvSock);
     return ncclInternalError;
   }
-  if (sendSock->state != ncclSocketStateReady || recvSock->state != ncclSocketStateReady) {
+  if (sendSock->state != ncclSocketStateReady ||
+      (recvSock->state != ncclSocketStateReady && recvSock->state != ncclSocketStateTerminating)) {
     WARN("ncclSocketSendRecv: socket state (%d/%d) is not ready", sendSock->state, recvSock->state);
     return ncclInternalError;
   }
@@ -846,9 +894,20 @@ ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int
   return ncclSuccess;
 }
 
-ncclResult_t ncclSocketClose(struct ncclSocket* sock) {
+// Shut down one or both directions of a socket (how is passed straight to shutdown()) and mark it as terminating.
+ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how) {
   if (sock != NULL) {
     if (sock->fd >= 0) {
+      shutdown(sock->fd, how); // return value is ignored
+    }
+    sock->state = ncclSocketStateTerminating; // set even when there was no valid fd to shut down
+  }
+  return ncclSuccess; // always succeeds, including for a NULL sock
+}
+
+ncclResult_t ncclSocketClose(struct ncclSocket* sock) {
+  if (sock != NULL) {
+    if (sock->state > ncclSocketStateNone && sock->state < ncclSocketStateNum && sock->fd >= 0) {
       /* shutdown() is needed to send FIN packet to proxy thread; shutdown() is not affected
        * by refcount of fd, but close() is. close() won't close a fd and send FIN packet if
        * the fd is duplicated (e.g. fork()). So shutdown() guarantees the correct and graceful
diff --git a/src/misc/tuner.cc b/src/misc/tuner.cc
index f1a9756f13..267e12a032 100644
--- a/src/misc/tuner.cc
+++ b/src/misc/tuner.cc
@@ -16,9 +16,11 @@
 pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER;
 static int tunerPluginRefCount;
 static void* tunerPluginLib = nullptr;
-static ncclTuner_v3_t* tunerSymbol = nullptr;
+static ncclTuner_v4_t* tunerSymbol = nullptr; // v4 interface in use: the plugin's own v4 symbol or one of the wrappers below
+static ncclTuner_v3_t* ncclTuner_v3 = nullptr; // v3 plugin symbol, looked up when no v4 symbol is found
 static ncclTuner_v2_t* ncclTuner_v2 = nullptr;
-static ncclTuner_v3_t ncclTuner_v2_as_v3;
+static ncclTuner_v4_t ncclTuner_v2_as_v4; // v4-shaped wrapper around a v2 plugin
+static ncclTuner_v4_t ncclTuner_v3_as_v4; // v4-shaped wrapper around a v3 plugin
 
 static int hasNvlsSupport(float** collCostTable) {
   // Requirements for support of different algorithms:
@@ -39,7 +41,20 @@ static int hasCollNetSupport(float** collCostTable) {
   return (table[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] == NCCL_ALGO_PROTO_IGNORE) ? 0 : 1;
 }
 
-static ncclResult_t ncclTuner_v2_as_v3_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo __attribute__((unused)), int numProto __attribute__((unused)), int* nChannels) {
+static ncclResult_t ncclTuner_v3_as_v4_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, int regBuff __attribute__((unused)), int* nChannels) { // v4-signature adapter for a v3 plugin
+  NCCLCHECK(ncclTuner_v3->getCollInfo(context, collType, nBytes, numPipeOps, collCostTable, numAlgo, numProto,  nChannels)); // regBuff is dropped: the v3 call does not take it
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclTuner_v3_as_v4_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) { // init the v3 plugin, then fill in the v4 wrapper
+  NCCLCHECK(ncclTuner_v3->init(nRanks, nNodes, logFunction, context)); // the wrapper is only filled in if the plugin's init succeeds
+  ncclTuner_v3_as_v4.name = ncclTuner_v3->name;
+  ncclTuner_v3_as_v4.getCollInfo = ncclTuner_v3_as_v4_getCollInfo; // route through the adapter that drops regBuff
+  ncclTuner_v3_as_v4.destroy = ncclTuner_v3->destroy; // destroy is taken from the v3 plugin as-is
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclTuner_v2_as_v4_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo __attribute__((unused)), int numProto __attribute__((unused)), int regBuff __attribute__((unused)), int* nChannels) {
   int algorithm = NCCL_ALGO_UNDEF;
   int protocol = NCCL_PROTO_UNDEF;
   int nvlsSupport = hasNvlsSupport(collCostTable);
@@ -53,11 +68,11 @@ static ncclResult_t ncclTuner_v2_as_v3_getCollInfo(void* context, ncclFunc_t col
   return ncclSuccess;
 }
 
-static ncclResult_t ncclTuner_v2_as_v3_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) {
+static ncclResult_t ncclTuner_v2_as_v4_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) { // init the v2 plugin, then fill in the v4 wrapper
   NCCLCHECK(ncclTuner_v2->init(nRanks, nNodes, logFunction, context));
-  ncclTuner_v2_as_v3.name = ncclTuner_v2->name;
-  ncclTuner_v2_as_v3.getCollInfo = ncclTuner_v2_as_v3_getCollInfo;
-  ncclTuner_v2_as_v3.destroy = ncclTuner_v2->destroy;
+  ncclTuner_v2_as_v4.name = ncclTuner_v2->name;
+  ncclTuner_v2_as_v4.getCollInfo = ncclTuner_v2_as_v4_getCollInfo; // route through the v2-to-v4 adapter
+  ncclTuner_v2_as_v4.destroy = ncclTuner_v2->destroy; // destroy is taken from the v2 plugin as-is
   return ncclSuccess;
 }
 
@@ -198,18 +213,26 @@ ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) {
     goto fail;
   }
 
-  tunerSymbol = (ncclTuner_v3_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v3");
+  tunerSymbol = (ncclTuner_v4_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v4");
   if (tunerSymbol == nullptr) {
-    INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.");
-    ncclTuner_v2 = (ncclTuner_v2_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v2");
-    if (ncclTuner_v2 == nullptr) {
-      INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.");
-      dlclose(tunerPluginLib);
-      goto fail;
+    INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.");
+    ncclTuner_v3 = (ncclTuner_v3_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v3");
+    if (ncclTuner_v3 == nullptr) {
+      INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.");
+      ncclTuner_v2 = (ncclTuner_v2_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v2");
+      if (ncclTuner_v2 == nullptr) {
+        INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.");
+        dlclose(tunerPluginLib);
+        goto fail;
+      } else {
+        ncclTuner_v2_as_v4.init = ncclTuner_v2_as_v4_init;
+        ncclTuner_v2_as_v4.name = ncclTuner_v2->name;
+        tunerSymbol = &ncclTuner_v2_as_v4;
+      }
     } else {
-      ncclTuner_v2_as_v3.init = ncclTuner_v2_as_v3_init;
-      ncclTuner_v2_as_v3.name = ncclTuner_v2->name;
-      tunerSymbol = &ncclTuner_v2_as_v3;
+      ncclTuner_v3_as_v4.init = ncclTuner_v3_as_v4_init;
+      ncclTuner_v3_as_v4.name = ncclTuner_v3->name;
+      tunerSymbol = &ncclTuner_v3_as_v4;
     }
   }
 
diff --git a/src/nccl.h.in b/src/nccl.h.in
index 431ecb5546..8a6f94e24c 100644
--- a/src/nccl.h.in
+++ b/src/nccl.h.in
@@ -12,6 +12,9 @@
 #if CUDART_VERSION >= 11000
 #include <cuda_bf16.h>
 #endif
+#if CUDART_VERSION >= 11080
+#include <cuda_fp8.h>
+#endif
 
 #define NCCL_MAJOR ${nccl:Major}
 #define NCCL_MINOR ${nccl:Minor}
@@ -183,6 +186,10 @@ const char* pncclGetErrorString(ncclResult_t result);
 const char*  ncclGetLastError(ncclComm_t comm);
 const char* pncclGetLastError(ncclComm_t comm);
 
+/* Reload environment variables that determine logging. */
+void  ncclResetDebugInit();
+void pncclResetDebugInit();
+
 /* Checks whether the comm has encountered any asynchronous errors */
 ncclResult_t  ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
 ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
@@ -236,12 +243,10 @@ typedef enum { ncclInt8       = 0, ncclChar       = 0,
                ncclFloat16    = 6, ncclHalf       = 6,
                ncclFloat32    = 7, ncclFloat      = 7,
                ncclFloat64    = 8, ncclDouble     = 8,
-#if defined(__CUDA_BF16_TYPES_EXIST__)
                ncclBfloat16   = 9,
-               ncclNumTypes   = 10
-#else
-               ncclNumTypes   = 9
-#endif
+               ncclFloat8e4m3 = 10,
+               ncclFloat8e5m2 = 11,
+               ncclNumTypes   = 12
 } ncclDataType_t;
 
 /* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
diff --git a/src/net.cc b/src/net.cc
index 97a8c73816..13e8c2b517 100644
--- a/src/net.cc
+++ b/src/net.cc
@@ -15,20 +15,95 @@
 //#include <sys/stat.h>
 //#include <unistd.h>
 
-static ncclNet_v8_t ncclNet_v5_as_v8;
-static ncclNet_v8_t ncclNet_v6_as_v8;
-static ncclNet_v8_t ncclNet_v7_as_v8;
+static ncclNet_v9_t ncclNet_v5_as_v9;
+static ncclNet_v9_t ncclNet_v6_as_v9;
+static ncclNet_v9_t ncclNet_v7_as_v9;
+static ncclNet_v9_t ncclNet_v8_as_v9;
 static ncclNet_v5_t *ncclNet_v5;
 static ncclNet_v6_t *ncclNet_v6;
 static ncclNet_v7_t *ncclNet_v7;
-static ncclCollNet_v8_t ncclCollNet_v5_as_v8;
-static ncclCollNet_v8_t ncclCollNet_v6_as_v8;
-static ncclCollNet_v8_t ncclCollNet_v7_as_v8;
+static ncclNet_v8_t *ncclNet_v8;
+static ncclCollNet_v9_t ncclCollNet_v5_as_v9;
+static ncclCollNet_v9_t ncclCollNet_v6_as_v9;
+static ncclCollNet_v9_t ncclCollNet_v7_as_v9;
+static ncclCollNet_v9_t ncclCollNet_v8_as_v9;
 static ncclCollNet_v5_t *ncclCollNet_v5;
 static ncclCollNet_v6_t *ncclCollNet_v6;
 static ncclCollNet_v7_t *ncclCollNet_v7;
+static ncclCollNet_v8_t *ncclCollNet_v8;
 
-static ncclResult_t ncclNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
+#define MAX_NET_SIZE (1024*1024*1024L) // 1 GiB: rather than use INT_MAX (2G-1), use a power of two. Upper bound for sizes narrowed to int by the adapters below.
+#define MAX_COLLNET_SIZE (512*1024*1024L) // 512 MiB: reported as maxCollBytes for older collnet plugins, for which the size was not dynamically queried.
+
+static ncclResult_t ncclNet_v8_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { // Builds v9 properties for dev from the v8 plugin's answer.
+  ncclNetProperties_v8_t p8;
+  ncclResult_t ans = ncclNet_v8->getProperties(dev, &p8);
+  if (ans != ncclSuccess) return ans; // props is not written when the plugin call fails
+  props->name = p8.name;
+  props->pciPath = p8.pciPath;
+  props->guid = p8.guid;
+  props->ptrSupport = p8.ptrSupport;
+  props->regIsGlobal = p8.regIsGlobal;
+  props->forceFlush = 0; // not read from p8; fixed at 0 for v8 plugins
+  props->speed = p8.speed;
+  props->port = p8.port;
+  props->maxComms = p8.maxComms;
+  props->maxRecvs = p8.maxRecvs;
+  props->latency = p8.latency;
+  props->netDeviceType = p8.netDeviceType;
+  props->netDeviceVersion = p8.netDeviceVersion;
+  props->vProps.ndevs = 1; // vProps lists exactly one device...
+  props->vProps.devs[0] = dev; // ...which is dev itself
+  props->maxP2pBytes = MAX_NET_SIZE; // same bound the isend/irecv adapters check before narrowing sizes to int
+  props->maxCollBytes = MAX_COLLNET_SIZE;
+  return ncclSuccess;
+}
+
+// v9 isend entry point served by a v8 plugin: the size_t length is narrowed to int for the v8 call.
+static ncclResult_t ncclNet_v8_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) {
+  // Lengths above MAX_NET_SIZE are refused, so the cast below never loses bits.
+  if (size > MAX_NET_SIZE) return ncclInternalError;
+  const int len = (int)size;
+  return ncclNet_v8->isend(sendComm, data, len, tag, mhandle, request);
+}
+
+// v9 irecv entry point served by a v8 plugin: each size_t size is narrowed to int for the v8 call.
+static ncclResult_t ncclNet_v8_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) {
+  // Swap the optional-receive-completion marker for NULL before the v8 call (presumably v8 plugins do not interpret it; confirm).
+  if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL;
+  int narrowed[NCCL_PROXY_MAX_SUBS];
+  for (int idx = 0; idx < n; ++idx) {
+    if (sizes[idx] > MAX_NET_SIZE) return ncclInternalError;
+    narrowed[idx] = (int)sizes[idx];
+  }
+  return ncclNet_v8->irecv(recvComm, n, data, narrowed, tags, mhandles, request);
+}
+
+static ncclResult_t ncclNet_v8_as_v9_init(ncclDebugLogger_t logfn) { // Invokes the v8 plugin's init, then fills in the v9 table ncclNet_v8_as_v9.
+  NCCLCHECK(ncclNet_v8->init(logfn));
+  ncclNet_v8_as_v9.name = ncclNet_v8->name;
+  ncclNet_v8_as_v9.devices = ncclNet_v8->devices;
+  ncclNet_v8_as_v9.getProperties = ncclNet_v8_as_v9_getProperties; // adapter: converts v8 properties into the v9 struct
+  ncclNet_v8_as_v9.listen = ncclNet_v8->listen;
+  ncclNet_v8_as_v9.connect = ncclNet_v8->connect;
+  ncclNet_v8_as_v9.accept =  ncclNet_v8->accept;
+  ncclNet_v8_as_v9.regMr = ncclNet_v8->regMr;
+  ncclNet_v8_as_v9.regMrDmaBuf = ncclNet_v8->regMrDmaBuf;
+  ncclNet_v8_as_v9.deregMr = ncclNet_v8->deregMr;
+  ncclNet_v8_as_v9.isend = ncclNet_v8_as_v9_isend; // adapter: size_t size narrowed to int
+  ncclNet_v8_as_v9.irecv = ncclNet_v8_as_v9_irecv; // adapter: size_t sizes narrowed to int
+  ncclNet_v8_as_v9.iflush = ncclNet_v8->iflush;
+  ncclNet_v8_as_v9.test = ncclNet_v8->test;
+  ncclNet_v8_as_v9.closeSend = ncclNet_v8->closeSend;
+  ncclNet_v8_as_v9.closeRecv = ncclNet_v8->closeRecv;
+  ncclNet_v8_as_v9.closeListen = ncclNet_v8->closeListen;
+  ncclNet_v8_as_v9.getDeviceMr = ncclNet_v8->getDeviceMr;
+  ncclNet_v8_as_v9.irecvConsumed = ncclNet_v8->irecvConsumed;
+  ncclNet_v8_as_v9.makeVDevice   = NULL; // nothing from the v8 plugin is assigned to this slot
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclNet_v7_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) {
   ncclNetProperties_v7_t p7;
   ncclResult_t ans = ncclNet_v7->getProperties(dev, &p7);
   if (ans != ncclSuccess) return ans;
@@ -37,6 +112,7 @@ static ncclResult_t ncclNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8
   props->guid = p7.guid;
   props->ptrSupport = p7.ptrSupport;
   props->regIsGlobal = 0;
+  props->forceFlush = 0;
   props->speed = p7.speed;
   props->port = p7.port;
   props->maxComms = p7.maxComms;
@@ -44,38 +120,63 @@ static ncclResult_t ncclNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8
   props->latency = p7.latency;
   props->netDeviceType = p7.netDeviceType;
   props->netDeviceVersion = p7.netDeviceVersion;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
   return ncclSuccess;
 }
 
-static ncclResult_t ncclNet_v7_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+static ncclResult_t ncclNet_v7_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { // v7 regMr is called with an int size; 2^31 and above is refused first
   if (size >= 1UL<<31) return ncclInternalError;
   return ncclNet_v7->regMr(comm, data, (int) size, type, mhandle);
 }
 
-static ncclResult_t ncclNet_v7_as_v8_init(ncclDebugLogger_t logfn) {
+// v9 isend entry point served by a v7 plugin: the size_t length is narrowed to int for the v7 call.
+static ncclResult_t ncclNet_v7_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) {
+  // Lengths above MAX_NET_SIZE are refused, so the cast below never loses bits.
+  if (size > MAX_NET_SIZE) return ncclInternalError;
+  const int len = (int)size;
+  return ncclNet_v7->isend(sendComm, data, len, tag, mhandle, request);
+}
+
+// v9 irecv entry point served by a v7 plugin: each size_t size is narrowed to int for the v7 call.
+static ncclResult_t ncclNet_v7_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) {
+  // Swap the optional-receive-completion marker for NULL before the v7 call (presumably v7 plugins do not interpret it; confirm).
+  if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL;
+  int narrowed[NCCL_PROXY_MAX_SUBS];
+  for (int idx = 0; idx < n; ++idx) {
+    if (sizes[idx] > MAX_NET_SIZE) return ncclInternalError;
+    narrowed[idx] = (int)sizes[idx];
+  }
+  return ncclNet_v7->irecv(recvComm, n, data, narrowed, tags, mhandles, request);
+}
+
+static ncclResult_t ncclNet_v7_as_v9_init(ncclDebugLogger_t logfn) { // Invokes the v7 plugin's init, then fills in the v9 table ncclNet_v7_as_v9.
   NCCLCHECK(ncclNet_v7->init(logfn));
-  ncclNet_v7_as_v8.name = ncclNet_v7->name;
-  ncclNet_v7_as_v8.devices = ncclNet_v7->devices;
-  ncclNet_v7_as_v8.getProperties = ncclNet_v7_as_v8_getProperties; // ncclNet_v5->getProperties;
-  ncclNet_v7_as_v8.listen = ncclNet_v7->listen;
-  ncclNet_v7_as_v8.connect = ncclNet_v7->connect;
-  ncclNet_v7_as_v8.accept =  ncclNet_v7->accept;
-  ncclNet_v7_as_v8.regMr = ncclNet_v7_as_v8_regMr;
-  ncclNet_v7_as_v8.regMrDmaBuf = ncclNet_v7->regMrDmaBuf;
-  ncclNet_v7_as_v8.deregMr = ncclNet_v7->deregMr;
-  ncclNet_v7_as_v8.isend = ncclNet_v7->isend;
-  ncclNet_v7_as_v8.irecv = ncclNet_v7->irecv;
-  ncclNet_v7_as_v8.iflush = ncclNet_v7->iflush;
-  ncclNet_v7_as_v8.test = ncclNet_v7->test;
-  ncclNet_v7_as_v8.closeSend = ncclNet_v7->closeSend;
-  ncclNet_v7_as_v8.closeRecv = ncclNet_v7->closeRecv;
-  ncclNet_v7_as_v8.closeListen = ncclNet_v7->closeListen;
-  ncclNet_v7_as_v8.getDeviceMr = ncclNet_v7->getDeviceMr;
-  ncclNet_v7_as_v8.irecvConsumed = ncclNet_v7->irecvConsumed;
+  ncclNet_v7_as_v9.name = ncclNet_v7->name;
+  ncclNet_v7_as_v9.devices = ncclNet_v7->devices;
+  ncclNet_v7_as_v9.getProperties = ncclNet_v7_as_v9_getProperties; // adapter: converts v7 properties into the v9 struct
+  ncclNet_v7_as_v9.listen = ncclNet_v7->listen;
+  ncclNet_v7_as_v9.connect = ncclNet_v7->connect;
+  ncclNet_v7_as_v9.accept =  ncclNet_v7->accept;
+  ncclNet_v7_as_v9.regMr = ncclNet_v7_as_v9_regMr; // adapter: size_t size narrowed to int
+  ncclNet_v7_as_v9.regMrDmaBuf = ncclNet_v7->regMrDmaBuf;
+  ncclNet_v7_as_v9.deregMr = ncclNet_v7->deregMr;
+  ncclNet_v7_as_v9.isend = ncclNet_v7_as_v9_isend; // adapter: size_t size narrowed to int
+  ncclNet_v7_as_v9.irecv = ncclNet_v7_as_v9_irecv; // adapter: size_t sizes narrowed to int
+  ncclNet_v7_as_v9.iflush = ncclNet_v7->iflush;
+  ncclNet_v7_as_v9.test = ncclNet_v7->test;
+  ncclNet_v7_as_v9.closeSend = ncclNet_v7->closeSend;
+  ncclNet_v7_as_v9.closeRecv = ncclNet_v7->closeRecv;
+  ncclNet_v7_as_v9.closeListen = ncclNet_v7->closeListen;
+  ncclNet_v7_as_v9.getDeviceMr = ncclNet_v7->getDeviceMr;
+  ncclNet_v7_as_v9.irecvConsumed = ncclNet_v7->irecvConsumed;
+  ncclNet_v7_as_v9.makeVDevice  = NULL; // nothing from the v7 plugin is assigned to this slot
   return ncclSuccess;
 }
 
-static ncclResult_t ncclNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
+static ncclResult_t ncclNet_v6_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) {
   ncclNetProperties_v6_t p6;
   ncclResult_t ans = ncclNet_v6->getProperties(dev, &p6);
   if (ans != ncclSuccess) return ans;
@@ -84,6 +185,7 @@ static ncclResult_t ncclNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8
   props->guid = p6.guid;
   props->ptrSupport = p6.ptrSupport;
   props->regIsGlobal = 0;
+  props->forceFlush = 0;
   props->speed = p6.speed;
   props->port = p6.port;
   props->maxComms = p6.maxComms;
@@ -91,46 +193,71 @@ static ncclResult_t ncclNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8
   props->latency = p6.latency;
   props->netDeviceType = NCCL_NET_DEVICE_HOST;
   props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
   return ncclSuccess;
 }
 
-static ncclResult_t ncclNet_v6_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+static ncclResult_t ncclNet_v6_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { // v6 regMr is called with an int size; 2^31 and above is refused first
   if (size >= 1UL<<31) return ncclInternalError;
   return ncclNet_v6->regMr(comm, data, (int) size, type, mhandle);
 }
 
-static ncclResult_t ncclNet_v6_as_v8_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
+static ncclResult_t ncclNet_v6_as_v9_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { // the device-handle argument is ignored: the v6 connect call takes only three arguments
   return ncclNet_v6->connect(dev, handle, sendComm);
 }
 
-static ncclResult_t ncclNet_v6_as_v8_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
+static ncclResult_t ncclNet_v6_as_v9_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { // the device-handle argument is ignored: the v6 accept call takes only two arguments
   return ncclNet_v6->accept(listenComm, recvComm);
 }
 
-static ncclResult_t ncclNet_v6_as_v8_init(ncclDebugLogger_t logfn) {
+// v9 isend entry point served by a v6 plugin: the size_t length is narrowed to int for the v6 call.
+static ncclResult_t ncclNet_v6_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) {
+  // Lengths above MAX_NET_SIZE are refused, so the cast below never loses bits.
+  if (size > MAX_NET_SIZE) return ncclInternalError;
+  const int len = (int)size;
+  return ncclNet_v6->isend(sendComm, data, len, tag, mhandle, request);
+}
+
+// v9 irecv entry point served by a v6 plugin: each size_t size is narrowed to int for the v6 call.
+static ncclResult_t ncclNet_v6_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) {
+  // Swap the optional-receive-completion marker for NULL before the v6 call (presumably v6 plugins do not interpret it; confirm).
+  if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL;
+  int narrowed[NCCL_PROXY_MAX_SUBS];
+  for (int idx = 0; idx < n; ++idx) {
+    if (sizes[idx] > MAX_NET_SIZE) return ncclInternalError;
+    narrowed[idx] = (int)sizes[idx];
+  }
+  return ncclNet_v6->irecv(recvComm, n, data, narrowed, tags, mhandles, request);
+}
+
+static ncclResult_t ncclNet_v6_as_v9_init(ncclDebugLogger_t logfn) { // Invokes the v6 plugin's init, then fills in the v9 table ncclNet_v6_as_v9.
   NCCLCHECK(ncclNet_v6->init(logfn));
-  ncclNet_v6_as_v8.name = ncclNet_v6->name;
-  ncclNet_v6_as_v8.devices = ncclNet_v6->devices;
-  ncclNet_v6_as_v8.getProperties = ncclNet_v6_as_v8_getProperties; // ncclNet_v5->getProperties;
-  ncclNet_v6_as_v8.listen = ncclNet_v6->listen;
-  ncclNet_v6_as_v8.connect = ncclNet_v6_as_v8_connect;
-  ncclNet_v6_as_v8.accept =  ncclNet_v6_as_v8_accept;
-  ncclNet_v6_as_v8.regMr = ncclNet_v6_as_v8_regMr;
-  ncclNet_v6_as_v8.regMrDmaBuf = ncclNet_v6->regMrDmaBuf;
-  ncclNet_v6_as_v8.deregMr = ncclNet_v6->deregMr;
-  ncclNet_v6_as_v8.isend = ncclNet_v6->isend;
-  ncclNet_v6_as_v8.irecv = ncclNet_v6->irecv;
-  ncclNet_v6_as_v8.iflush = ncclNet_v6->iflush;
-  ncclNet_v6_as_v8.test = ncclNet_v6->test;
-  ncclNet_v6_as_v8.closeSend = ncclNet_v6->closeSend;
-  ncclNet_v6_as_v8.closeRecv = ncclNet_v6->closeRecv;
-  ncclNet_v6_as_v8.closeListen = ncclNet_v6->closeListen;
-  ncclNet_v6_as_v8.getDeviceMr = NULL;
-  ncclNet_v6_as_v8.irecvConsumed = NULL;
+  ncclNet_v6_as_v9.name = ncclNet_v6->name;
+  ncclNet_v6_as_v9.devices = ncclNet_v6->devices;
+  ncclNet_v6_as_v9.getProperties = ncclNet_v6_as_v9_getProperties; // adapter: converts v6 properties into the v9 struct
+  ncclNet_v6_as_v9.listen = ncclNet_v6->listen;
+  ncclNet_v6_as_v9.connect = ncclNet_v6_as_v9_connect; // adapter: drops the device-handle argument
+  ncclNet_v6_as_v9.accept =  ncclNet_v6_as_v9_accept; // adapter: drops the device-handle argument
+  ncclNet_v6_as_v9.regMr = ncclNet_v6_as_v9_regMr; // adapter: size_t size narrowed to int
+  ncclNet_v6_as_v9.regMrDmaBuf = ncclNet_v6->regMrDmaBuf;
+  ncclNet_v6_as_v9.deregMr = ncclNet_v6->deregMr;
+  ncclNet_v6_as_v9.isend = ncclNet_v6_as_v9_isend; // adapter: size_t size narrowed to int
+  ncclNet_v6_as_v9.irecv = ncclNet_v6_as_v9_irecv; // adapter: size_t sizes narrowed to int
+  ncclNet_v6_as_v9.iflush = ncclNet_v6->iflush;
+  ncclNet_v6_as_v9.test = ncclNet_v6->test;
+  ncclNet_v6_as_v9.closeSend = ncclNet_v6->closeSend;
+  ncclNet_v6_as_v9.closeRecv = ncclNet_v6->closeRecv;
+  ncclNet_v6_as_v9.closeListen = ncclNet_v6->closeListen;
+  ncclNet_v6_as_v9.getDeviceMr = NULL; // the three slots from here down get nothing from the v6 plugin
+  ncclNet_v6_as_v9.irecvConsumed = NULL;
+  ncclNet_v6_as_v9.makeVDevice  = NULL;
   return ncclSuccess;
 }
 
-static ncclResult_t ncclNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
+static ncclResult_t ncclNet_v5_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) {
   ncclNetProperties_v6_t p6;
   ncclResult_t ans = ncclNet_v5->getProperties(dev, &p6);
   if (ans != ncclSuccess) return ans;
@@ -139,6 +266,7 @@ static ncclResult_t ncclNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8
   props->guid = p6.guid;
   props->ptrSupport = p6.ptrSupport;
   props->regIsGlobal = 0;
+  props->forceFlush = 0;
   props->speed = p6.speed;
   props->port = p6.port;
   props->maxComms = p6.maxComms;
@@ -146,48 +274,73 @@ static ncclResult_t ncclNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8
   props->latency = p6.latency;
   props->netDeviceType    = NCCL_NET_DEVICE_HOST;
   props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
   return ncclSuccess;
 }
 
-static ncclResult_t ncclNet_v5_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+static ncclResult_t ncclNet_v5_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { // v5 regMr is called with an int size; 2^31 and above is refused first
   if (size >= 1UL<<31) return ncclInternalError;
   return ncclNet_v5->regMr(comm, data, (int) size, type, mhandle);
 }
 
-static ncclResult_t ncclNet_v5_as_v8_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
+static ncclResult_t ncclNet_v5_as_v9_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { // the device-handle argument is ignored: the v5 connect call takes only three arguments
   return ncclNet_v5->connect(dev, handle, sendComm);
 }
 
-static ncclResult_t ncclNet_v5_as_v8_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
+static ncclResult_t ncclNet_v5_as_v9_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { // the device-handle argument is ignored: the v5 accept call takes only two arguments
   return ncclNet_v5->accept(listenComm, recvComm);
 }
 
+// v9 isend entry point served by a v5 plugin: the size_t length is narrowed to int for the v5 call.
+static ncclResult_t ncclNet_v5_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) {
+  // Lengths above MAX_NET_SIZE are refused, so the cast below never loses bits.
+  if (size > MAX_NET_SIZE) return ncclInternalError;
+  const int len = (int)size;
+  return ncclNet_v5->isend(sendComm, data, len, tag, mhandle, request);
+}
+
+// v9 irecv entry point served by a v5 plugin: each size_t size is narrowed to int for the v5 call.
+static ncclResult_t ncclNet_v5_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) {
+  // Swap the optional-receive-completion marker for NULL before the v5 call (presumably v5 plugins do not interpret it; confirm).
+  if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL;
+  int narrowed[NCCL_PROXY_MAX_SUBS];
+  for (int idx = 0; idx < n; ++idx) {
+    if (sizes[idx] > MAX_NET_SIZE) return ncclInternalError;
+    narrowed[idx] = (int)sizes[idx];
+  }
+  return ncclNet_v5->irecv(recvComm, n, data, narrowed, tags, mhandles, request);
+}
+
 // We use a wrapper around the v5 init to copy over the struct contents
 // post-init since they may not be initialized before hand.
-static ncclResult_t ncclNet_v5_as_v8_init(ncclDebugLogger_t logfn) {
+static ncclResult_t ncclNet_v5_as_v9_init(ncclDebugLogger_t logfn) { // Invokes the v5 plugin's init, then fills in the v9 table ncclNet_v5_as_v9.
   NCCLCHECK(ncclNet_v5->init(logfn));
-  ncclNet_v5_as_v8.name = ncclNet_v5->name;
-  ncclNet_v5_as_v8.devices = ncclNet_v5->devices;
-  ncclNet_v5_as_v8.getProperties = ncclNet_v5_as_v8_getProperties;
-  ncclNet_v5_as_v8.listen = ncclNet_v5->listen;
-  ncclNet_v5_as_v8.connect = ncclNet_v5_as_v8_connect;
-  ncclNet_v5_as_v8.accept =  ncclNet_v5_as_v8_accept;
-  ncclNet_v5_as_v8.regMr = ncclNet_v5_as_v8_regMr;
-  ncclNet_v5_as_v8.regMrDmaBuf = NULL;
-  ncclNet_v5_as_v8.deregMr = ncclNet_v5->deregMr;
-  ncclNet_v5_as_v8.isend = ncclNet_v5->isend;
-  ncclNet_v5_as_v8.irecv = ncclNet_v5->irecv;
-  ncclNet_v5_as_v8.iflush = ncclNet_v5->iflush;
-  ncclNet_v5_as_v8.test = ncclNet_v5->test;
-  ncclNet_v5_as_v8.closeSend = ncclNet_v5->closeSend;
-  ncclNet_v5_as_v8.closeRecv = ncclNet_v5->closeRecv;
-  ncclNet_v5_as_v8.closeListen = ncclNet_v5->closeListen;
-  ncclNet_v5_as_v8.getDeviceMr = NULL;
-  ncclNet_v5_as_v8.irecvConsumed = NULL;
+  ncclNet_v5_as_v9.name = ncclNet_v5->name;
+  ncclNet_v5_as_v9.devices = ncclNet_v5->devices;
+  ncclNet_v5_as_v9.getProperties = ncclNet_v5_as_v9_getProperties; // adapter: converts the plugin's properties into the v9 struct
+  ncclNet_v5_as_v9.listen = ncclNet_v5->listen;
+  ncclNet_v5_as_v9.connect = ncclNet_v5_as_v9_connect; // adapter: drops the device-handle argument
+  ncclNet_v5_as_v9.accept =  ncclNet_v5_as_v9_accept; // adapter: drops the device-handle argument
+  ncclNet_v5_as_v9.regMr = ncclNet_v5_as_v9_regMr; // adapter: size_t size narrowed to int
+  ncclNet_v5_as_v9.regMrDmaBuf = NULL; // nothing from the v5 plugin is assigned to this slot
+  ncclNet_v5_as_v9.deregMr = ncclNet_v5->deregMr;
+  ncclNet_v5_as_v9.isend = ncclNet_v5_as_v9_isend; // adapter: size_t size narrowed to int
+  ncclNet_v5_as_v9.irecv = ncclNet_v5_as_v9_irecv; // adapter: size_t sizes narrowed to int
+  ncclNet_v5_as_v9.iflush = ncclNet_v5->iflush;
+  ncclNet_v5_as_v9.test = ncclNet_v5->test;
+  ncclNet_v5_as_v9.closeSend = ncclNet_v5->closeSend;
+  ncclNet_v5_as_v9.closeRecv = ncclNet_v5->closeRecv;
+  ncclNet_v5_as_v9.closeListen = ncclNet_v5->closeListen;
+  ncclNet_v5_as_v9.getDeviceMr = NULL; // the three slots from here down get nothing from the v5 plugin
+  ncclNet_v5_as_v9.irecvConsumed = NULL;
+  ncclNet_v5_as_v9.makeVDevice = NULL;
   return ncclSuccess;
 }
 
-static ncclResult_t ncclCollNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
+static ncclResult_t ncclCollNet_v5_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) {
   ncclNetProperties_v6_t p6;
   ncclResult_t ans = ncclCollNet_v5->getProperties(dev, &p6);
   if (ans != ncclSuccess) return ans;
@@ -196,6 +349,7 @@ static ncclResult_t ncclCollNet_v5_as_v8_getProperties(int dev, ncclNetPropertie
   props->guid = p6.guid;
   props->ptrSupport = p6.ptrSupport;
   props->regIsGlobal = 0;
+  props->forceFlush = 0;
   props->speed = p6.speed;
   props->port = p6.port;
   props->maxComms = p6.maxComms;
@@ -203,38 +357,52 @@ static ncclResult_t ncclCollNet_v5_as_v8_getProperties(int dev, ncclNetPropertie
   props->latency = p6.latency;
   props->netDeviceType    = NCCL_NET_DEVICE_HOST;
   props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
   return ncclSuccess;
 }
 
-static ncclResult_t ncclCollNet_v5_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+static ncclResult_t ncclCollNet_v5_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { // v5 collnet regMr is called with an int size; 2^31 and above is refused first
   if (size >= 1UL<<31) return ncclInternalError;
   return ncclCollNet_v5->regMr(comm, data, (int) size, type, mhandle);
 }
 
+// v9 iallreduce entry point served by a v5 collnet plugin: the size_t count is narrowed to int for the v5 call.
+static ncclResult_t ncclCollNet_v5_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
+  // Counts above MAX_NET_SIZE are refused so the cast is exact. NOTE(review): maxCollBytes is reported as MAX_COLLNET_SIZE; confirm which cap is intended here.
+  if (count > MAX_NET_SIZE) return ncclInternalError;
+  const int elems = (int)count;
+  return ncclCollNet_v5->iallreduce(collComm, sendData, recvData, elems, dataType, redOp,
+                                    sendMhandle, recvMhandle, request);
+}
+
 // We use a wrapper around the v5 init to copy over the struct contents
 // post-init since they may not be initialized before hand.
-static ncclResult_t ncclCollNet_v5_as_v8_init(ncclDebugLogger_t logfn) {
+static ncclResult_t ncclCollNet_v5_as_v9_init(ncclDebugLogger_t logfn) { // Invokes the v5 collnet plugin's init, then fills in the v9 table ncclCollNet_v5_as_v9.
   NCCLCHECK(ncclCollNet_v5->init(logfn));
-  ncclCollNet_v5_as_v8.name = ncclCollNet_v5->name;
-  ncclCollNet_v5_as_v8.devices = ncclCollNet_v5->devices;
-  ncclCollNet_v5_as_v8.getProperties = ncclCollNet_v5_as_v8_getProperties;
-  ncclCollNet_v5_as_v8.listen = ncclCollNet_v5->listen;
-  ncclCollNet_v5_as_v8.connect = ncclCollNet_v5->connect;
-  ncclCollNet_v5_as_v8.reduceSupport = ncclCollNet_v5->reduceSupport;
-  ncclCollNet_v5_as_v8.regMr = ncclCollNet_v5_as_v8_regMr;
-  ncclCollNet_v5_as_v8.regMrDmaBuf = NULL;
-  ncclCollNet_v5_as_v8.deregMr = ncclCollNet_v5->deregMr;
-  ncclCollNet_v5_as_v8.iallreduce = ncclCollNet_v5->iallreduce;
-  ncclCollNet_v5_as_v8.iallgather = nullptr;
-  ncclCollNet_v5_as_v8.ireducescatter = nullptr;
-  ncclCollNet_v5_as_v8.iflush = ncclCollNet_v5->iflush;
-  ncclCollNet_v5_as_v8.test = ncclCollNet_v5->test;
-  ncclCollNet_v5_as_v8.closeColl = ncclCollNet_v5->closeColl;
-  ncclCollNet_v5_as_v8.closeListen = ncclCollNet_v5->closeListen;
+  ncclCollNet_v5_as_v9.name = ncclCollNet_v5->name;
+  ncclCollNet_v5_as_v9.devices = ncclCollNet_v5->devices;
+  ncclCollNet_v5_as_v9.getProperties = ncclCollNet_v5_as_v9_getProperties; // adapter: converts the plugin's properties into the v9 struct
+  ncclCollNet_v5_as_v9.listen = ncclCollNet_v5->listen;
+  ncclCollNet_v5_as_v9.connect = ncclCollNet_v5->connect;
+  ncclCollNet_v5_as_v9.reduceSupport = ncclCollNet_v5->reduceSupport;
+  ncclCollNet_v5_as_v9.regMr = ncclCollNet_v5_as_v9_regMr; // adapter: size_t size narrowed to int
+  ncclCollNet_v5_as_v9.regMrDmaBuf = NULL; // nothing from the v5 plugin is assigned to this slot
+  ncclCollNet_v5_as_v9.deregMr = ncclCollNet_v5->deregMr;
+  ncclCollNet_v5_as_v9.iallreduce = ncclCollNet_v5_as_v9_iallreduce; // adapter: size_t count narrowed to int
+  ncclCollNet_v5_as_v9.iallgather = nullptr; // nothing from the v5 plugin is assigned to this slot
+  ncclCollNet_v5_as_v9.ireducescatter = nullptr; // nothing from the v5 plugin is assigned to this slot
+  ncclCollNet_v5_as_v9.iflush = ncclCollNet_v5->iflush;
+  ncclCollNet_v5_as_v9.test = ncclCollNet_v5->test;
+  ncclCollNet_v5_as_v9.closeColl = ncclCollNet_v5->closeColl;
+  ncclCollNet_v5_as_v9.closeListen = ncclCollNet_v5->closeListen;
   return ncclSuccess;
 }
 
-static ncclResult_t ncclCollNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
+static ncclResult_t ncclCollNet_v6_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) {
   ncclNetProperties_v6_t p6;
   ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6);
   if (ans != ncclSuccess) return ans;
@@ -243,6 +411,7 @@ static ncclResult_t ncclCollNet_v6_as_v8_getProperties(int dev, ncclNetPropertie
   props->guid = p6.guid;
   props->ptrSupport = p6.ptrSupport;
   props->regIsGlobal = 0;
+  props->forceFlush = 0;
   props->speed = p6.speed;
   props->port = p6.port;
   props->maxComms = p6.maxComms;
@@ -250,38 +419,52 @@ static ncclResult_t ncclCollNet_v6_as_v8_getProperties(int dev, ncclNetPropertie
   props->latency = p6.latency;
   props->netDeviceType    = NCCL_NET_DEVICE_HOST;
   props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
   return ncclSuccess;
 }
 
-static ncclResult_t ncclCollNet_v6_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+static ncclResult_t ncclCollNet_v6_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { // v6 collnet regMr is called with an int size; 2^31 and above is refused first
   if (size >= 1UL<<31) return ncclInternalError;
   return ncclCollNet_v6->regMr(comm, data, (int) size, type, mhandle);
 }
 
+// v9 iallreduce entry point served by a v6 collnet plugin: the size_t count is narrowed to int for the v6 call.
+static ncclResult_t ncclCollNet_v6_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
+  // Counts above MAX_NET_SIZE are refused so the cast is exact. NOTE(review): maxCollBytes is reported as MAX_COLLNET_SIZE; confirm which cap is intended here.
+  if (count > MAX_NET_SIZE) return ncclInternalError;
+  const int elems = (int)count;
+  return ncclCollNet_v6->iallreduce(collComm, sendData, recvData, elems, dataType, redOp,
+                                    sendMhandle, recvMhandle, request);
+}
+
 // We use a wrapper around the v6 init to copy over the struct contents
 // post-init since they may not be initialized before hand.
-static ncclResult_t ncclCollNet_v6_as_v8_init(ncclDebugLogger_t logfn) {
+static ncclResult_t ncclCollNet_v6_as_v9_init(ncclDebugLogger_t logfn) { // Invokes the v6 collnet plugin's init, then fills in the v9 table ncclCollNet_v6_as_v9.
   NCCLCHECK(ncclCollNet_v6->init(logfn));
-  ncclCollNet_v6_as_v8.name = ncclCollNet_v6->name;
-  ncclCollNet_v6_as_v8.devices = ncclCollNet_v6->devices;
-  ncclCollNet_v6_as_v8.getProperties = ncclCollNet_v6_as_v8_getProperties;
-  ncclCollNet_v6_as_v8.listen = ncclCollNet_v6->listen;
-  ncclCollNet_v6_as_v8.connect = ncclCollNet_v6->connect;
-  ncclCollNet_v6_as_v8.reduceSupport = ncclCollNet_v6->reduceSupport;
-  ncclCollNet_v6_as_v8.regMr = ncclCollNet_v6_as_v8_regMr;
-  ncclCollNet_v6_as_v8.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf;
-  ncclCollNet_v6_as_v8.deregMr = ncclCollNet_v6->deregMr;
-  ncclCollNet_v6_as_v8.iallreduce = ncclCollNet_v6->iallreduce;
-  ncclCollNet_v6_as_v8.iallgather = nullptr;
-  ncclCollNet_v6_as_v8.ireducescatter = nullptr;
-  ncclCollNet_v6_as_v8.iflush = ncclCollNet_v6->iflush;
-  ncclCollNet_v6_as_v8.test = ncclCollNet_v6->test;
-  ncclCollNet_v6_as_v8.closeColl = ncclCollNet_v6->closeColl;
-  ncclCollNet_v6_as_v8.closeListen = ncclCollNet_v6->closeListen;
+  ncclCollNet_v6_as_v9.name = ncclCollNet_v6->name;
+  ncclCollNet_v6_as_v9.devices = ncclCollNet_v6->devices;
+  ncclCollNet_v6_as_v9.getProperties = ncclCollNet_v6_as_v9_getProperties; // adapter: converts v6 properties into the v9 struct
+  ncclCollNet_v6_as_v9.listen = ncclCollNet_v6->listen;
+  ncclCollNet_v6_as_v9.connect = ncclCollNet_v6->connect;
+  ncclCollNet_v6_as_v9.reduceSupport = ncclCollNet_v6->reduceSupport;
+  ncclCollNet_v6_as_v9.regMr = ncclCollNet_v6_as_v9_regMr; // adapter: size_t size narrowed to int
+  ncclCollNet_v6_as_v9.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf;
+  ncclCollNet_v6_as_v9.deregMr = ncclCollNet_v6->deregMr;
+  ncclCollNet_v6_as_v9.iallreduce = ncclCollNet_v6_as_v9_iallreduce; // adapter: size_t count narrowed to int
+  ncclCollNet_v6_as_v9.iallgather = nullptr; // nothing from the v6 plugin is assigned to this slot
+  ncclCollNet_v6_as_v9.ireducescatter = nullptr; // nothing from the v6 plugin is assigned to this slot
+  ncclCollNet_v6_as_v9.iflush = ncclCollNet_v6->iflush;
+  ncclCollNet_v6_as_v9.test = ncclCollNet_v6->test;
+  ncclCollNet_v6_as_v9.closeColl = ncclCollNet_v6->closeColl;
+  ncclCollNet_v6_as_v9.closeListen = ncclCollNet_v6->closeListen;
   return ncclSuccess;
 }
 
-static ncclResult_t ncclCollNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
+static ncclResult_t ncclCollNet_v7_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) {
   ncclNetProperties_v7_t p7;
   ncclResult_t ans = ncclCollNet_v7->getProperties(dev, &p7);
   if (ans != ncclSuccess) return ans;
@@ -290,6 +473,7 @@ static ncclResult_t ncclCollNet_v7_as_v8_getProperties(int dev, ncclNetPropertie
   props->guid = p7.guid;
   props->ptrSupport = p7.ptrSupport;
   props->regIsGlobal = 0;
+  props->forceFlush = 0;
   props->speed = p7.speed;
   props->port = p7.port;
   props->maxComms = p7.maxComms;
@@ -297,47 +481,150 @@ static ncclResult_t ncclCollNet_v7_as_v8_getProperties(int dev, ncclNetPropertie
   props->latency = p7.latency;
   props->netDeviceType    = NCCL_NET_DEVICE_HOST;
   props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
   return ncclSuccess;
 }
 
-static ncclResult_t ncclCollNet_v7_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+static ncclResult_t ncclCollNet_v7_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
   if (size >= 1UL<<31) return ncclInternalError;
   return ncclCollNet_v7->regMr(comm, data, (int) size, type, mhandle);
 }
 
+// v9 -> v7 shim for iallreduce: counts above MAX_NET_SIZE are refused, the rest is forwarded as-is.
+static ncclResult_t ncclCollNet_v7_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
+  if (count > MAX_NET_SIZE) return ncclInternalError;
+  // The count is narrowed to int only after the bound check above.
+  const int narrowedCount = (int)count;
+  return ncclCollNet_v7->iallreduce(collComm, sendData, recvData, narrowedCount, dataType, redOp,
+                                    sendMhandle, recvMhandle, request);
+}
+
 // We use a wrapper around the v7 init to copy over the struct contents
 // post-init since they may not be initialized before hand.
-static ncclResult_t ncclCollNet_v7_as_v8_init(ncclDebugLogger_t logfn) {
+static ncclResult_t ncclCollNet_v7_as_v9_init(ncclDebugLogger_t logfn) {
   NCCLCHECK(ncclCollNet_v7->init(logfn));
-  ncclCollNet_v7_as_v8.name = ncclCollNet_v7->name;
-  ncclCollNet_v7_as_v8.devices = ncclCollNet_v7->devices;
-  ncclCollNet_v7_as_v8.getProperties = ncclCollNet_v7_as_v8_getProperties;
-  ncclCollNet_v7_as_v8.listen = ncclCollNet_v7->listen;
-  ncclCollNet_v7_as_v8.connect = ncclCollNet_v7->connect;
-  ncclCollNet_v7_as_v8.reduceSupport = ncclCollNet_v7->reduceSupport;
-  ncclCollNet_v7_as_v8.regMr = ncclCollNet_v7_as_v8_regMr;
-  ncclCollNet_v7_as_v8.regMrDmaBuf = ncclCollNet_v7->regMrDmaBuf;
-  ncclCollNet_v7_as_v8.deregMr = ncclCollNet_v7->deregMr;
-  ncclCollNet_v7_as_v8.iallreduce = ncclCollNet_v7->iallreduce;
-  ncclCollNet_v7_as_v8.iallgather = nullptr;
-  ncclCollNet_v7_as_v8.ireducescatter = nullptr;
-  ncclCollNet_v7_as_v8.iflush = ncclCollNet_v7->iflush;
-  ncclCollNet_v7_as_v8.test = ncclCollNet_v7->test;
-  ncclCollNet_v7_as_v8.closeColl = ncclCollNet_v7->closeColl;
-  ncclCollNet_v7_as_v8.closeListen = ncclCollNet_v7->closeListen;
+  ncclCollNet_v7_as_v9.name = ncclCollNet_v7->name;
+  ncclCollNet_v7_as_v9.devices = ncclCollNet_v7->devices;
+  ncclCollNet_v7_as_v9.getProperties = ncclCollNet_v7_as_v9_getProperties;
+  ncclCollNet_v7_as_v9.listen = ncclCollNet_v7->listen;
+  ncclCollNet_v7_as_v9.connect = ncclCollNet_v7->connect;
+  ncclCollNet_v7_as_v9.reduceSupport = ncclCollNet_v7->reduceSupport;
+  ncclCollNet_v7_as_v9.regMr = ncclCollNet_v7_as_v9_regMr;
+  ncclCollNet_v7_as_v9.regMrDmaBuf = ncclCollNet_v7->regMrDmaBuf;
+  ncclCollNet_v7_as_v9.deregMr = ncclCollNet_v7->deregMr;
+  ncclCollNet_v7_as_v9.iallreduce = ncclCollNet_v7_as_v9_iallreduce;
+  ncclCollNet_v7_as_v9.iallgather = nullptr;
+  ncclCollNet_v7_as_v9.ireducescatter = nullptr;
+  ncclCollNet_v7_as_v9.iflush = ncclCollNet_v7->iflush;
+  ncclCollNet_v7_as_v9.test = ncclCollNet_v7->test;
+  ncclCollNet_v7_as_v9.closeColl = ncclCollNet_v7->closeColl;
+  ncclCollNet_v7_as_v9.closeListen = ncclCollNet_v7->closeListen;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclCollNet_v8_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) {
+  ncclNetProperties_v8_t v8Props;  // filled in by the v8 plugin, then translated into *props below
+  const ncclResult_t rc = ncclCollNet_v8->getProperties(dev, &v8Props);
+  if (rc != ncclSuccess) return rc;  // *props is left untouched when the plugin query fails
+  props->name = v8Props.name;  // fields copied straight from the v8 answer
+  props->pciPath = v8Props.pciPath;
+  props->guid = v8Props.guid;
+  props->ptrSupport = v8Props.ptrSupport;
+  props->regIsGlobal = v8Props.regIsGlobal;
+  props->speed = v8Props.speed;
+  props->port = v8Props.port;
+  props->maxComms = v8Props.maxComms;
+  props->maxRecvs = v8Props.maxRecvs;
+  props->latency = v8Props.latency;
+  props->forceFlush = 0;  // remaining fields are set to fixed values rather than read from v8Props
+  props->netDeviceType    = NCCL_NET_DEVICE_HOST;
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  props->vProps.ndevs = 1;  // a single entry: the queried device itself
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
+  return ncclSuccess;
+}
+
+// v9 -> v8 shim for iallreduce: counts above MAX_NET_SIZE are refused, the rest is forwarded as-is.
+static ncclResult_t ncclCollNet_v8_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
+  if (count > MAX_NET_SIZE) return ncclInternalError;
+  // The count is narrowed to int only after the bound check above.
+  const int narrowedCount = (int)count;
+  return ncclCollNet_v8->iallreduce(collComm, sendData, recvData, narrowedCount, dataType, redOp,
+                                    sendMhandle, recvMhandle, request);
+}
+
+// v9 -> v8 shim for iallgather: rewrites one ncclNetSGE_v9_t as ncclNetSGE_v8_t and forwards the call.
+static ncclResult_t ncclCollNet_v8_as_v9_iallgather (void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts,
+                           size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+                           void* sendMhandle, void** request) {
+  // Only one element is converted below, so more than one part cannot be forwarded.
+  if (nRecvParts > 1) return ncclInternalError;
+  if (recvParts->size > MAX_COLLNET_SIZE) return ncclInternalError;
+  ncclNetSGE_v8_t v8Part;
+  v8Part.mhandle = recvParts->mhandle;
+  v8Part.address = recvParts->address;
+  v8Part.size = (int)recvParts->size;  // narrowed only after the MAX_COLLNET_SIZE check above
+  return ncclCollNet_v8->iallgather(collComm, sendData, nRecvParts, &v8Part,
+                                    bytesPerRank, windowOffset, windowBytes, sendMhandle, request);
+}
+
+// v9 -> v8 shim for ireducescatter: rewrites one ncclNetSGE_v9_t as ncclNetSGE_v8_t and forwards the call.
+static ncclResult_t ncclCollNet_v8_as_v9_ireducescatter(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData,
+                               size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+                               ncclDataType_t dataType, ncclRedOp_t redOp,
+                               void* recvMhandle, void** request) {
+  // Only one element is converted below, so more than one part cannot be forwarded.
+  if (nSendParts > 1) return ncclInternalError;
+  if (sendParts->size > MAX_COLLNET_SIZE) return ncclInternalError;
+  ncclNetSGE_v8_t v8Part;
+  v8Part.mhandle = sendParts->mhandle;
+  v8Part.address = sendParts->address;
+  v8Part.size = (int)sendParts->size;  // narrowed only after the MAX_COLLNET_SIZE check above
+  return ncclCollNet_v8->ireducescatter(collComm, nSendParts, &v8Part, recvData,
+                                        bytesPerRank, windowOffset, windowBytes,
+                                        dataType, redOp, recvMhandle, request);
+}
+
+// We use a wrapper around the v8 init to copy over the struct contents
+// post-init since they may not be initialized before hand.
+static ncclResult_t ncclCollNet_v8_as_v9_init(ncclDebugLogger_t logfn) {
+  NCCLCHECK(ncclCollNet_v8->init(logfn));
+  ncclCollNet_v8_as_v9.name = ncclCollNet_v8->name;
+  ncclCollNet_v8_as_v9.devices = ncclCollNet_v8->devices;
+  ncclCollNet_v8_as_v9.getProperties = ncclCollNet_v8_as_v9_getProperties;
+  ncclCollNet_v8_as_v9.listen = ncclCollNet_v8->listen;
+  ncclCollNet_v8_as_v9.connect = ncclCollNet_v8->connect;
+  ncclCollNet_v8_as_v9.reduceSupport = ncclCollNet_v8->reduceSupport;
+  ncclCollNet_v8_as_v9.regMr = ncclCollNet_v8->regMr;
+  ncclCollNet_v8_as_v9.regMrDmaBuf = ncclCollNet_v8->regMrDmaBuf;
+  ncclCollNet_v8_as_v9.deregMr = ncclCollNet_v8->deregMr;
+  ncclCollNet_v8_as_v9.iallreduce = ncclCollNet_v8_as_v9_iallreduce;
+  ncclCollNet_v8_as_v9.iallgather = ncclCollNet_v8_as_v9_iallgather;
+  ncclCollNet_v8_as_v9.ireducescatter = ncclCollNet_v8_as_v9_ireducescatter;
+  ncclCollNet_v8_as_v9.iflush = ncclCollNet_v8->iflush;
+  ncclCollNet_v8_as_v9.test = ncclCollNet_v8->test;
+  ncclCollNet_v8_as_v9.closeColl = ncclCollNet_v8->closeColl;
+  ncclCollNet_v8_as_v9.closeListen = ncclCollNet_v8->closeListen;
   return ncclSuccess;
 }
 
 static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER;
-ncclNet_t* ncclNets[3] = { nullptr, &ncclNetIb, &ncclNetSocket };
-ncclCollNet_t* ncclCollNets[3] = { nullptr, nullptr, nullptr };
+ncclNet_t* ncclNets[NCCL_NET_MAX_PLUGINS] = { nullptr, &ncclNetIb, &ncclNetSocket };
+ncclCollNet_t* ncclCollNets[NCCL_NET_MAX_PLUGINS] = { nullptr, nullptr, nullptr };
 enum ncclNetState {
   ncclNetStateInit = 0,
   ncclNetStateEnabled = 1,
   ncclNetStateDisabled = 2
 };
-enum ncclNetState ncclNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
-enum ncclNetState ncclCollNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
+enum ncclNetState ncclNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
+enum ncclNetState ncclCollNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
 
 #define MAX_STR_LEN 255
 
@@ -443,72 +730,93 @@ ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) {
     goto fail;
   }
 
-  ncclNets[0] = (ncclNet_v8_t*)dlsym(netPluginLib, "ncclNetPlugin_v8");
+  ncclNets[0] = (ncclNet_v9_t*)dlsym(netPluginLib, "ncclNetPlugin_v9");
   if (ncclNets[0] == nullptr) {
-    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v8 symbol.");
-    // Try v7 plugin
-    ncclNet_v7 = (ncclNet_v7_t*)dlsym(netPluginLib, "ncclNetPlugin_v7");
-    if (ncclNet_v7 == nullptr) {
-      // Try v6 plugin
-      ncclNet_v6 = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6");
-      if (ncclNet_v6 == nullptr) {
-        // Try v5 plugin
-        ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
-        if (ncclNet_v5 == nullptr) {
-          INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported.");
-          goto fail;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v9 symbol.");
+    ncclNet_v8 = (ncclNet_v8_t*)dlsym(netPluginLib, "ncclNetPlugin_v8");
+    if (ncclNet_v8 == nullptr) {
+      // Try v7 plugin
+      ncclNet_v7 = (ncclNet_v7_t*)dlsym(netPluginLib, "ncclNetPlugin_v7");
+      if (ncclNet_v7 == nullptr) {
+        // Try v6 plugin
+        ncclNet_v6 = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6");
+        if (ncclNet_v6 == nullptr) {
+          // Try v5 plugin
+          ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
+          if (ncclNet_v5 == nullptr) {
+            INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported.");
+            goto fail;
+          } else {
+            ncclNets[0] = &ncclNet_v5_as_v9;
+            ncclNet_v5_as_v9.init = ncclNet_v5_as_v9_init;
+            // Set the name right away to allow for NCCL_NET=... to work
+            ncclNet_v5_as_v9.name = ncclNet_v5->name;
+            INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name);
+          }
         } else {
-          ncclNets[0] = &ncclNet_v5_as_v8;
-          ncclNet_v5_as_v8.init = ncclNet_v5_as_v8_init;
+          ncclNets[0] = &ncclNet_v6_as_v9;
+          ncclNet_v6_as_v9.init = ncclNet_v6_as_v9_init;
           // Set the name right away to allow for NCCL_NET=... to work
-          ncclNet_v5_as_v8.name = ncclNet_v5->name;
-          INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name);
+          ncclNet_v6_as_v9.name = ncclNet_v6->name;
+          INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNets[0]->name);
         }
       } else {
-        ncclNets[0] = &ncclNet_v6_as_v8;
-        ncclNet_v6_as_v8.init = ncclNet_v6_as_v8_init;
+        ncclNets[0] = &ncclNet_v7_as_v9;
+        ncclNet_v7_as_v9.init = ncclNet_v7_as_v9_init;
         // Set the name right away to allow for NCCL_NET=... to work
-        ncclNet_v6_as_v8.name = ncclNet_v6->name;
-        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNets[0]->name);
+        ncclNet_v7_as_v9.name = ncclNet_v7->name;
+        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v7)", ncclNets[0]->name);
       }
     } else {
-      ncclNets[0] = &ncclNet_v7_as_v8;
-      ncclNet_v7_as_v8.init = ncclNet_v7_as_v8_init;
+      ncclNets[0] = &ncclNet_v8_as_v9;
+      ncclNet_v8_as_v9.init = ncclNet_v8_as_v9_init;
       // Set the name right away to allow for NCCL_NET=... to work
-      ncclNet_v7_as_v8.name = ncclNet_v7->name;
-      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v7)", ncclNets[0]->name);
+      ncclNet_v8_as_v9.name = ncclNet_v8->name;
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v8)", ncclNets[0]->name);
     }
+  } else {
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v9)", ncclNets[0]->name);
   }
 
   // Check for CollNet
-  ncclCollNets[0] = (ncclCollNet_v8_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v8");
+  ncclCollNets[0] = (ncclCollNet_v9_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v9");
   if (ncclCollNets[0] == nullptr) {
-    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v8 symbol.");
-    ncclCollNet_v7 = (ncclCollNet_v7_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v7");
-    if (ncclCollNet_v7 == nullptr) {
-      ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6");
-      if (ncclCollNet_v6 == nullptr) {
-        ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
-        if (ncclCollNet_v5 == nullptr) {
-          INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). ncclCollNetPlugin symbols v4 and lower are not supported.");
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v9 symbol.");
+    ncclCollNet_v8 = (ncclCollNet_v8_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v8");
+    if (ncclCollNet_v8 == nullptr) {
+      ncclCollNet_v7 = (ncclCollNet_v7_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v7");
+      if (ncclCollNet_v7 == nullptr) {
+        ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6");
+        if (ncclCollNet_v6 == nullptr) {
+          ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
+          if (ncclCollNet_v5 == nullptr) {
+            INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). ncclCollNetPlugin symbols v4 and lower are not supported.");
+          } else {
+            ncclCollNets[0] = &ncclCollNet_v5_as_v9;
+            ncclCollNet_v5_as_v9.init = ncclCollNet_v5_as_v9_init;
+            ncclCollNet_v5_as_v9.name = ncclCollNet_v5->name;
+            INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v5)", ncclCollNets[0]->name);
+          }
         } else {
-          ncclCollNets[0] = &ncclCollNet_v5_as_v8;
-          ncclCollNet_v5_as_v8.init = ncclCollNet_v5_as_v8_init;
-          ncclCollNet_v5_as_v8.name = ncclCollNet_v5->name;
-          INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v5)", ncclCollNets[0]->name);
+         ncclCollNets[0] = &ncclCollNet_v6_as_v9;
+         ncclCollNet_v6_as_v9.init = ncclCollNet_v6_as_v9_init;
+         ncclCollNet_v6_as_v9.name = ncclCollNet_v6->name;
+         INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v6)", ncclCollNets[0]->name);
         }
       } else {
-        ncclCollNets[0] = &ncclCollNet_v6_as_v8;
-        ncclCollNet_v6_as_v8.init = ncclCollNet_v6_as_v8_init;
-        ncclCollNet_v6_as_v8.name = ncclCollNet_v6->name;
-        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v6)", ncclCollNets[0]->name);
+       ncclCollNets[0] = &ncclCollNet_v7_as_v9;
+       ncclCollNet_v7_as_v9.init = ncclCollNet_v7_as_v9_init;
+       ncclCollNet_v7_as_v9.name = ncclCollNet_v7->name;
+       INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v7)", ncclCollNets[0]->name);
       }
     } else {
-      ncclCollNets[0] = &ncclCollNet_v7_as_v8;
-      ncclCollNet_v7_as_v8.init = ncclCollNet_v7_as_v8_init;
-      ncclCollNet_v7_as_v8.name = ncclCollNet_v7->name;
-      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v7)", ncclCollNets[0]->name);
+      ncclCollNets[0] = &ncclCollNet_v8_as_v9;
+      ncclCollNet_v8_as_v9.init = ncclCollNet_v8_as_v9_init;
+      ncclCollNet_v8_as_v9.name = ncclCollNet_v8->name;
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v8)", ncclCollNets[0]->name);
     }
+  } else {
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v9)", ncclCollNets[0]->name);
   }
 
   ++netPluginRefCount;
@@ -539,6 +847,8 @@ ncclResult_t ncclNetPluginUnload(struct ncclComm* comm) {
     ncclCollNets[0] = nullptr;
     netPluginStatus = netPluginLoadReady;
     comm->netPluginLoaded = 0;
+    for (int i = 0; i < NCCL_NET_MAX_PLUGINS; ++i)
+      ncclCollNetStates[i] = ncclNetStates[i] = ncclNetStateInit;
   }
   pthread_mutex_unlock(&netPluginLock);
   return ncclSuccess;
@@ -561,7 +871,7 @@ ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, in
         return ncclInternalError;
       }
     default:
-      WARN("Unknown device code index");
+      WARN("Unknown device code index %d \n", type);
       return ncclInternalError;
   }
 
@@ -715,8 +1025,9 @@ cleanup1:
 
 int ncclNetVersion(struct ncclComm* comm) {
   return
-    (comm->ncclNet == &ncclNet_v5_as_v8) ? 5 :
-    (comm->ncclNet == &ncclNet_v6_as_v8) ? 6 :
-    (comm->ncclNet == &ncclNet_v7_as_v8) ? 7 :
-    8;
+    (comm->ncclNet == &ncclNet_v5_as_v9) ? 5 :
+    (comm->ncclNet == &ncclNet_v6_as_v9) ? 6 :
+    (comm->ncclNet == &ncclNet_v7_as_v9) ? 7 :
+    (comm->ncclNet == &ncclNet_v8_as_v9) ? 8 :
+    9;
 }
diff --git a/src/proxy.cc b/src/proxy.cc
index 5e657c0a4a..bd8188a378 100644
--- a/src/proxy.cc
+++ b/src/proxy.cc
@@ -364,7 +364,11 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr
   sub->channelId = op->channelId;
   sub->nsteps = op->nsteps;
   sub->nbytes = op->nbytes;
+  sub->chunkSize = op->chunkSize;
   sub->offset = 0;
+  sub->loopSize = op->loopSize;
+  sub->loopOffset = op->loopOffset;
+  sub->isOneRPN = op->isOneRPN;
   sub->peer = op->peer;
   sub->reg = op->reg;
   sub->sendMhandle = op->sendMhandle;
@@ -374,8 +378,9 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr
   sub->eActivationMask = op->eActivationMask;
   sub->taskEventHandle = op->taskEventHandle;
   sub->rank = op->rank;
-  args->pid = op->pid;
-  args->profilerContext = op->profilerContext;
+  sub->pid = op->pid;
+  sub->profilerContext = op->profilerContext;
+  sub->ringAlgo = op->ringAlgo;
   args->nsubs = subIndex+1;
   if (subIndex) {
     if ((args->sliceSteps != op->sliceSteps) ||
@@ -404,6 +409,7 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr
   args->pattern = op->pattern;
   args->protocol = op->protocol;
   args->coll = op->coll;
+  args->algorithm = op->algorithm;
   args->specifics = op->specifics;
   args->state = ncclProxyOpReady;
   args->progress = op->connection->tcomm->proxyProgress;
@@ -485,6 +491,7 @@ static ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyCon
   }
   if (op->next != -1) __builtin_prefetch(pool->ops+op->next); // Prefetch next free op
   memcpy(op, proxyOp, sizeof(struct ncclProxyOp));
+  if (proxyOp->ringAlgo) proxyOp->ringAlgo->incRefCount();
   op->next = -1;
   op->connection = proxyConn->connection;
   if (proxyOps->nextOps == -1) {
@@ -601,13 +608,15 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool
     } break;
   case ncclPatternPatUp: {
       // Run full algorithm to count the number of steps for each peer.
-      int *nstepsSend, *nstepsRecv;
-      const int rank = comm->rank, nranks = comm->nRanks;
-      NCCLCHECK(ncclCalloc(&nstepsSend, log2Up(nranks)));
-      NCCLCHECK(ncclCalloc(&nstepsRecv, log2Up(nranks)));
+      ncclResult_t result = ncclSuccess;
       const ssize_t size = op->nbytes/comm->nRanks;
-      PatRSAlgorithm<char> algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks);
       int last = 0;
+      int *nstepsSend = NULL, *nstepsRecv = NULL;
+      const int rank = comm->rank, nranks = comm->nRanks;
+      PatRSAlgorithm<char> algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks);
+      NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit_pat_up);
+      NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit_pat_up);
+
       while (last == 0) {
         int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, postRecv, postSend, nelem;
         size_t inpIx, outIx;
@@ -619,24 +628,30 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool
         if (nstepsSend[i]) {
           int sendPeer = (rank + (1<<i)) % nranks;
           op->nsteps = nstepsSend[i];
-          NCCLCHECK(SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire));
+          NCCLCHECKGOTO(SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire), result, exit_pat_up);
         }
         if (nstepsRecv[i]) {
           int recvPeer = (rank - (1<<i) + nranks) % nranks;
           op->nsteps = nstepsRecv[i];
-          NCCLCHECK(SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire));
+          NCCLCHECKGOTO(SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire), result, exit_pat_up);
         }
       }
+    exit_pat_up:
+      free(nstepsSend);
+      free(nstepsRecv);
+      NCCLCHECK(result);
     } break;
   case ncclPatternPatDown: {
       // Run full algorithm to count the number of steps for each peer.
-      int *nstepsSend, *nstepsRecv;
-      const int rank = comm->rank, nranks = comm->nRanks;
-      NCCLCHECK(ncclCalloc(&nstepsSend, log2Up(nranks)));
-      NCCLCHECK(ncclCalloc(&nstepsRecv, log2Up(nranks)));
+      ncclResult_t result = ncclSuccess;
       const ssize_t size = op->nbytes/comm->nRanks;
-      PatAGAlgorithm<char> algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks);
       int last = 0;
+      int *nstepsSend = NULL, *nstepsRecv = NULL;
+      const int rank = comm->rank, nranks = comm->nRanks;
+      PatAGAlgorithm<char> algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks);
+      NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit_pat_down);
+      NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit_pat_down);
+
       while (last == 0) {
         int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem;
         size_t inpIx, outIx;
@@ -648,14 +663,18 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool
         if (nstepsSend[i]) {
           int sendPeer = (rank - (1<<i) + nranks) % nranks;
           op->nsteps = nstepsSend[i];
-          NCCLCHECK(SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire));
+          NCCLCHECKGOTO(SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire), result, exit_pat_down);
         }
         if (nstepsRecv[i]) {
           int recvPeer = (rank + (1<<i)) % nranks;
           op->nsteps = nstepsRecv[i];
-          NCCLCHECK(SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire));
+          NCCLCHECKGOTO(SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire), result, exit_pat_down);
         }
       }
+    exit_pat_down:
+      free(nstepsSend);
+      free(nstepsRecv);
+      NCCLCHECK(result);
     } break;
   case ncclPatternSend:
   case ncclPatternRecv: {
@@ -735,23 +754,17 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int
 
   if (state->active == NULL) {
     pthread_mutex_lock(&pool->mutex);
-    while (pool->nextOps == -1 && !state->stop) {
+    if (pool->nextOps == -1 && !state->stop) {
       ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle);
       ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlSleep);
       pthread_cond_wait(&pool->cond, &pool->mutex);
       ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlWakeup);
       ncclProfilerStopProxyCtrlEvent(eHandle);
     }
-    if (state->stop) { // We might have been woken up to stop.
-      pthread_mutex_unlock(&pool->mutex);
-      return ncclSuccess;
-    }
   }
-
   state->nextOps = pool->nextOps;
   pool->nextOps = pool->nextOpsEnd = -1;
   pthread_mutex_unlock(&pool->mutex);
-  if (state->nextOps == -1) return ncclInternalError;
 
 process_nextops:
   ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle);
@@ -889,7 +902,7 @@ void* ncclProxyProgress(void *proxyState_) {
    * ncclParamProgressAppendOpFreq(). If they are equal, we will append proxy ops. This will decrease the
    * frequency of calling ncclProxyGetPostedOps() and reduce the perf impact. */
   int proxyOpAppendCounter = 0;
-  while (state->stop == 0 || (state->stop == 1 && state->active)) {
+  do {
     int idle = 1;
     ncclResult_t ret = progressOps(proxyState, state, state->active, &idle);
     if (ret != ncclSuccess) {
@@ -902,12 +915,11 @@ void* ncclProxyProgress(void *proxyState_) {
     if (lastIdle == 0 && idle == 1) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlIdle);
     if (lastIdle == 1 && idle == 0) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlActive);
     ncclProfilerStopProxyCtrlEvent(eHandle);
-    if (idle || (++proxyOpAppendCounter == ncclParamProgressAppendOpFreq())) {
+    if (idle || !state->active || (++proxyOpAppendCounter == ncclParamProgressAppendOpFreq())) {
       int added = 0;
       proxyOpAppendCounter = 0;
       TIME_START(3);
-      if (state->stop == 0)
-        ret = ncclProxyGetPostedOps(proxyState, &added);
+      ret = ncclProxyGetPostedOps(proxyState, &added);
       if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); }
       if (ret != ncclSuccess) {
         __atomic_store_n(&proxyState->asyncResult, ret, __ATOMIC_RELEASE);
@@ -918,7 +930,7 @@ void* ncclProxyProgress(void *proxyState_) {
       }
     }
     lastIdle = idle;
-  }
+  } while (state->stop == 0 || (state->stop == 1 && state->active));
   return NULL;
 }
 
@@ -1090,7 +1102,7 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
     strncpy(poolPath+sizeof("/dev/shm/nccl-")-1, resp.devShmPath, sizeof("XXXXXX")-1);
     struct ncclProxyOps* proxyOps = sharedProxyState->proxyOps + proxyConn->tpLocalRank;
     if (proxyOps->pool == NULL) {
-      NCCLCHECK(ncclShmOpen(poolPath, sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, -1, &proxyOps->handle));
+      NCCLCHECK(ncclShmOpen(poolPath, sizeof(poolPath), sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, -1, &proxyOps->handle));
       proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1;
     }
   }
@@ -1293,7 +1305,7 @@ static ncclResult_t proxyProgressInit(struct ncclProxyState* proxyState) {
 
     char shmPath[sizeof("/dev/shm/nccl-XXXXXX")];
     shmPath[0] = '\0';
-    NCCLCHECK(ncclShmOpen(shmPath, size, (void**)&pool, NULL, proxyState->tpLocalnRanks, &state->handle));
+    NCCLCHECK(ncclShmOpen(shmPath, sizeof(shmPath), size, (void**)&pool, NULL, proxyState->tpLocalnRanks, &state->handle));
     // Init pool
     pool->nextOps = -1;
 
@@ -1372,7 +1384,7 @@ static ncclResult_t proxyQueryFd(struct ncclProxyState* proxyState, int rank, vo
   ncclResult_t ret = ncclSuccess;
 
   NCCLCHECKGOTO(ncclIpcSocketInit(&ipcSock, proxyState->tpRank, hash^1, proxyState->abortFlag), ret, exit);
-  NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &rmtFd, sizeof(int), rmtFd, rank, hash), ret, exit);
+  NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &rmtFd, sizeof(int), -1, rank, hash), ret, exit);
 exit:
   NCCLCHECK(ncclIpcSocketClose(&ipcSock));
   return ncclSuccess;
@@ -1603,7 +1615,7 @@ void* ncclProxyService(void* _args) {
       if (pollfds[s].fd == -1) continue;
 
       // Progress all ops for this ncclProxyLocalPeer
-      if (stop == PROXY_ABORT && ncclCuMemEnable() && ncclCuMemHostEnable() && !proxyState->directMode) closeConn = 1;
+      if (stop == PROXY_ABORT && ncclCuMemEnable() && ncclCuMemHostEnable() && !proxyState->directMode && __atomic_load_n(&proxyState->stop, __ATOMIC_ACQUIRE)) closeConn = 1;
       ncclProxyAsyncOp* op = peer->asyncOps;
       while (op != nullptr) {
         ncclProxyAsyncOp* opnext = op->next; /* in case op is freed in proxyProgressAsync */
@@ -1692,11 +1704,17 @@ static ncclResult_t proxyUDSRecvReq(struct ncclProxyState* proxyState, int reqFd
 
   NCCLCHECK(ncclIpcSocketRecvMsg(&proxyState->ipcSock, &hdr, sizeof(hdr), &rmtFd));
   if (hdr.type == ncclProxyMsgGetFd) {
-    // cuMem API support
+    // cuMem API support for the non-UB case. rmtFd is not used here: the UDS proxy thread exports
+    // an fd from the handle and sends it back to the main thread, which imports the buffer.
+    // We only need to close this dummy rmtFd.
     uint64_t handle = *(uint64_t*)hdr.data;
     INFO(NCCL_PROXY, "proxyUDSRecvReq::ncclProxyMsgGetFd rank %d opId %p handle=0x%lx", hdr.rank, hdr.opId, handle);
+    close(rmtFd);
     return proxyGetFd(proxyState, hdr.rank, hdr.opId, handle);
   } else if (hdr.type == ncclProxyMsgQueryFd) {
+    // The remote main thread registers a buffer with this rank: it queries this rank's rmtFd through
+    // UDS, and the rmtFd is returned unchanged to the remote main thread, which then uses it to call
+    // into the proxy service thread for buffer registration.
     INFO(NCCL_PROXY, "proxyUDSRecvReq::proxyQueryFd rank %d opId %p rmtFd %d", hdr.rank, hdr.opId, rmtFd);
     return proxyQueryFd(proxyState, hdr.rank, hdr.opId, rmtFd);
   }
@@ -1743,7 +1761,7 @@ void* ncclProxyServiceUDS(void* _args) {
     }
   }
 
-  ncclIpcSocketClose(&proxyState->ipcSock);
+  (void)ncclIpcSocketClose(&proxyState->ipcSock);
   INFO(NCCL_PROXY, "[Proxy Service UDS] exit: stop %d abortFlag %d", proxyState->stop, *proxyState->abortFlag);
   return NULL;
 }
@@ -1800,15 +1818,10 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
     struct ncclProxyState* sharedProxyState = comm->proxyState;
 
     if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) {
-      if (comm->proxyState->threadUDS) {
-        // UDS support
-        __atomic_store_n(&comm->proxyState->stop, 1, __ATOMIC_RELEASE);
-      }
-
       if (*comm->abortFlag == 0 && sharedProxyState->peerAddresses) {
         struct ncclSocket sock;
         int type = ncclProxyMsgStop;
-        ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag);
+        NCCLCHECK(ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag));
         if (ncclSocketConnect(&sock) == ncclSuccess) {
           (void)ncclSocketSend(&sock, &type, sizeof(int));
         }
@@ -1835,6 +1848,8 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
           }
         }
       }
+      // Now we notify proxy service and UDS thread to exit.
+      __atomic_store_n(&comm->proxyState->stop, 1, __ATOMIC_RELEASE);
     }
   }
 
diff --git a/src/ras/client.cc b/src/ras/client.cc
new file mode 100644
index 0000000000..8061cef4e6
--- /dev/null
+++ b/src/ras/client.cc
@@ -0,0 +1,318 @@
+/*************************************************************************
+ * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <cerrno>
+#include <climits>
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <getopt.h>
+#include <netdb.h>
+#include <unistd.h>
+
+#include "nccl.h"
+#define NCCL_RAS_CLIENT // Only pull client-specific definitions from the header file below.
+#include "ras_internal.h"
+
+#define STR2(v) #v
+#define STR(v) STR2(v)
+
+// Local timeout increment compared to the '-t' argument, in seconds.
+#define TIMEOUT_INCREMENT 1
+
+static const char* hostName = "localhost";
+static const char* port = STR(NCCL_RAS_CLIENT_PORT);
+static int timeout = -1;
+static bool verbose = false;
+static int sock = -1;
+
+static void printUsage(const char* argv0) { // Writes the option summary to stderr; argv0 fills in the program name.
+  fprintf(stderr,
+          "Usage: %s [OPTION]...\n"
+          "Query the state of a running NCCL job.\n"
+          "\nOptions:\n"
+          "  -h, --host=HOST     Host name or IP address of the RAS client socket of the\n"
+          "                      NCCL job to connect to (localhost by default)\n"
+          "  -p, --port=PORT     TCP port of the RAS client socket of the NCCL job\n"
+          "                      (" STR(NCCL_RAS_CLIENT_PORT) " by default)\n"
+          "  -t, --timeout=SECS  Maximum time for the local NCCL process to wait for\n"
+          "                      responses from other NCCL processes\n"
+          "                      (" STR(RAS_COLLECTIVE_LEG_TIMEOUT_SEC) " secs by default; 0 disables the timeout)\n"
+          "  -v, --verbose       Increase the verbosity level of the RAS output\n"
+          "      --help          Print this help and exit\n"
+          "      --version       Print the version number and exit\n", argv0);
+}
+
+// Parses the command-line options into the file-level hostName, port, timeout and verbose settings.
+static void parseArgs(int argc, char** argv) {
+  int c, optIdx = 0;
+  struct option longOpts[] = {
+    {"host",    required_argument, NULL, 'h'},
+    {"port",    required_argument, NULL, 'p'},
+    {"timeout", required_argument, NULL, 't'},
+    {"verbose", no_argument,       NULL, 'v'},
+    {"help",    no_argument,       NULL, 'e'},
+    {"version", no_argument,       NULL, 'r'},
+    {0}
+  };
+  while ((c = getopt_long(argc, argv, "h:p:t:v", longOpts, &optIdx)) != -1) {
+    switch (c) {
+      case 'h':
+        hostName = optarg;
+        break;
+      case 'p':
+        port = optarg;
+        break;
+      case 't': {
+        char* endPtr = nullptr;
+        long val = strtol(optarg, &endPtr, 10); // Kept as long so that oversized values can be detected.
+        if (endPtr == optarg || *endPtr != '\0' || val < 0 || val > INT_MAX) { // Empty, junk or out of range.
+          fprintf(stderr, "Invalid timeout: %s\n", optarg);
+          exit(1);
+        }
+        timeout = (int)val;
+        break;
+      }
+      case 'v':
+        verbose = true;
+        break;
+      case 'e': // --help
+        printUsage(argv[0]);
+        exit(0);
+      case 'r': // --version
+        fprintf(stderr, "NCCL RAS client version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "."
+                STR(NCCL_PATCH) NCCL_SUFFIX "\n");
+        exit(0);
+      default:
+        printUsage(argv[0]);
+        exit(1);
+    }
+  }
+}
+
+// Writes count bytes from buf to fd, issuing as many write() calls as it takes and retrying after EINTR.
+// Returns the number of bytes written, or -1 (errno left as set by write) on any other error.
+static ssize_t socketWrite(int fd, const void* buf, size_t count) {
+  const char* src = (const char*)buf;
+  size_t sent = 0;
+  for (;;) {
+    ssize_t n = write(fd, src+sent, count-sent);
+    if (n != -1)
+      sent += n;
+    else if (errno != EINTR)
+      return -1;
+    if (sent >= count)
+      return sent;
+  }
+}
+
+// Reads a message from RAS into buf (count bytes of capacity, including the '\0' that is always appended).
+// Keeps reading until the data ends with '\n' unless false is passed as untilNewline; also stops on EOF or when
+// the buffer is full.  Returns the number of bytes read (excluding the added '\0'), or -1 if read() fails.
+static ssize_t rasRead(int fd, void* buf, size_t count, bool untilNewline = true) {
+  char* bufChar = (char*)buf;
+  size_t done = 0;
+  while (done+1 < count) { // Always leave room for the terminating '\0'.
+    ssize_t ret = read(fd, bufChar+done, count-1-done);
+    if (ret == -1) {
+      if (errno != EINTR)
+        return -1;
+      continue; // Always retry: evaluating the exit test here would turn EINTR into a bogus 0-byte "EOF" result.
+    }
+    if (ret == 0)
+      break; // EOF
+    done += ret;
+    if (!untilNewline || bufChar[done-1] == '\n')
+      break;
+  }
+  bufChar[done] = '\0';
+  return done;
+}
+
+// Connects to the NCCL RAS service at hostName:port and performs the handshake (protocol version, optional TIMEOUT).
+// Returns 0 with the connection in sock, or 1 after printing a diagnostic; socket timeouts restart the sequence.
+static int connectToNCCL() {
+  struct addrinfo hints = {0};
+  struct addrinfo* addrInfo = nullptr;
+  int ret;
+  char msgBuf[1024];
+  int bytes;
+  struct timeval tv = {TIMEOUT_INCREMENT, 0};
+
+retry:
+  hints.ai_family = AF_UNSPEC;
+  hints.ai_socktype = SOCK_STREAM;
+  if ((ret = getaddrinfo(hostName, port, &hints, &addrInfo)) != 0) {
+    fprintf(stderr, "Resolving %s:%s: %s\n", hostName, port, gai_strerror(ret));
+    goto fail;
+  }
+  for (struct addrinfo* ai = addrInfo; ai; ai = ai->ai_next) {
+    char hostBuf[NI_MAXHOST], portBuf[NI_MAXSERV];
+    int err;
+    sock = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
+    if (sock == -1) {
+      perror("socket");
+      continue;
+    }
+    // Initially start with a small, 1-sec timeout to quickly eliminate non-responsive processes...
+    if (timeout && (setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof tv) != 0 ||
+                    setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof tv) != 0)) {
+      perror("setsockopt");
+      // Non-fatal; fall through.
+    }
+    if (connect(sock, ai->ai_addr, ai->ai_addrlen) == 0)
+      break;
+    err = errno;
+    if (getnameinfo(ai->ai_addr, ai->ai_addrlen, hostBuf, sizeof(hostBuf), portBuf, sizeof(portBuf),
+                    NI_NUMERICHOST | NI_NUMERICSERV) != 0) {
+      snprintf(hostBuf, sizeof(hostBuf), "%s", hostName); // Bounded: the arguments are user-supplied strings.
+      snprintf(portBuf, sizeof(portBuf), "%s", port);
+    }
+    fprintf(stderr, "Connecting to %s:%s: %s\n", hostBuf, portBuf, strerror(err));
+    close(sock);
+    sock = -1;
+  }
+  freeaddrinfo(addrInfo);
+  addrInfo = nullptr;
+
+  if (sock == -1) {
+    fprintf(stderr, "Failed to connect to the NCCL RAS service!\n"
+            "Please make sure that the NCCL job has the RAS service enabled and that\n"
+            "%s.\n",
+            (strcmp(hostName, "localhost") || strcmp(port, STR(NCCL_RAS_CLIENT_PORT)) ?
+            "the host/port arguments are correct and match NCCL_RAS_ADDR" :
+            "the RAS client was started on a node where the NCCL job is running"));
+    goto fail;
+  }
+
+  // Exchange the RAS client handshake.
+  strcpy(msgBuf, "CLIENT PROTOCOL " STR(NCCL_RAS_CLIENT_PROTOCOL) "\n");
+  if (socketWrite(sock, msgBuf, strlen(msgBuf)) != strlen(msgBuf)) {
+    if (errno == EAGAIN || errno == EWOULDBLOCK)
+      goto timeout;
+    perror("write to socket");
+    goto fail;
+  }
+  bytes = rasRead(sock, msgBuf, sizeof(msgBuf));
+  if (bytes < 0) {
+    if (errno == EAGAIN || errno == EWOULDBLOCK)
+      goto timeout;
+    perror("read socket");
+    goto fail;
+  }
+  if (bytes == 0) {
+    fprintf(stderr, "NCCL unexpectedly closed the connection\n");
+    goto fail;
+  }
+  if (strncasecmp(msgBuf, "SERVER PROTOCOL ", strlen("SERVER PROTOCOL "))) {
+    fprintf(stderr, "Unexpected response from NCCL: %s\n", msgBuf);
+    goto fail;
+  }
+  if (strtol(msgBuf+strlen("SERVER PROTOCOL "), nullptr, 10) != NCCL_RAS_CLIENT_PROTOCOL) {
+    fprintf(stderr, "NCCL RAS protocol version mismatch (NCCL: %s; RAS client: %d)!\n"
+            "Will try to continue in spite of that...\n", msgBuf+strlen("SERVER PROTOCOL "), NCCL_RAS_CLIENT_PROTOCOL);
+  }
+
+  if (timeout >= 0) {
+    snprintf(msgBuf, sizeof(msgBuf), "TIMEOUT %d\n", timeout);
+    if (socketWrite(sock, msgBuf, strlen(msgBuf)) != strlen(msgBuf)) {
+      if (errno == EAGAIN || errno == EWOULDBLOCK)
+        goto timeout;
+      perror("write to socket");
+      goto fail;
+    }
+    bytes = rasRead(sock, msgBuf, sizeof(msgBuf));
+    if (bytes < 0) {
+      if (errno == EAGAIN || errno == EWOULDBLOCK)
+        goto timeout;
+      perror("read socket");
+      goto fail;
+    }
+    if (bytes == 0) {
+      fprintf(stderr, "NCCL unexpectedly closed the connection\n");
+      goto fail;
+    }
+    if (strcasecmp(msgBuf, "OK\n")) {
+      fprintf(stderr, "Unexpected response from NCCL: %s\n", msgBuf);
+      goto fail;
+    }
+  }
+  if (timeout) {
+    // Increase the socket timeout to accommodate NCCL timeout.
+    tv.tv_sec += (timeout > 0 ? timeout : RAS_COLLECTIVE_LEG_TIMEOUT_SEC) + RAS_COLLECTIVE_EXTRA_TIMEOUT_SEC;
+    if (setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof tv) != 0) {
+      perror("setsockopt");
+      // Non-fatal; fall through.
+    }
+  }
+
+  return 0;
+fail:
+  if (addrInfo)
+    freeaddrinfo(addrInfo);
+  if (sock != -1)
+    (void)close(sock);
+  sock = -1;
+  return 1;
+timeout:
+  fprintf(stderr, "Connection timed out; retrying...\n");
+  (void)close(sock);
+  sock = -1; // Otherwise a failure before the next socket() call would close this descriptor a second time.
+  goto retry;
+}
+
+// Sends the STATUS request (prefixed with VERBOSE if requested) over sock and copies NCCL's reply to stdout
+// until the connection reaches EOF.  Returns 0 on success and 1 on any socket or stdout error.
+int getNCCLStatus() {
+  char ioBuf[4096];
+  int nRead;
+  snprintf(ioBuf, sizeof(ioBuf), "%sSTATUS\n", (verbose ? "VERBOSE " : ""));
+  if (socketWrite(sock, ioBuf, strlen(ioBuf)) != strlen(ioBuf)) {
+    if (errno == EAGAIN || errno == EWOULDBLOCK)
+      fprintf(stderr, "Connection timed out\n");
+    else
+      perror("write to socket");
+    return 1;
+  }
+  // Relay the response piece by piece, flushing after each one; a zero-length read marks the end of the reply.
+  while ((nRead = rasRead(sock, ioBuf, sizeof(ioBuf), /*untilNewline*/false)) > 0) {
+    if (fwrite(ioBuf, 1, nRead, stdout) != nRead) {
+      fprintf(stderr, "fwrite to stdout failed!\n");
+      return 1;
+    }
+    if (fflush(stdout) != 0) {
+      perror("fflush stdout");
+      return 1;
+    }
+  }
+  if (nRead < 0) {
+    if (errno == EAGAIN || errno == EWOULDBLOCK)
+      fprintf(stderr, "Connection timed out\n");
+    else
+      perror("read socket");
+    return 1;
+  }
+  return 0;
+}
+
+// Entry point: parses the options, connects to the NCCL RAS service, prints the job status and closes the socket.
+// The exit code is 0 on success and 1 if any of these steps fails.
+int main(int argc, char** argv) {
+  parseArgs(argc, argv);
+  if (connectToNCCL() != 0)
+    return 1;
+  const int statusFailed = getNCCLStatus();
+  if (statusFailed) {
+    (void)close(sock); // Already failing; the outcome of close is of no interest here.
+    return 1;
+  }
+  if (close(sock) == -1) {
+    perror("close socket");
+    return 1;
+  }
+  return 0;
+}
diff --git a/src/ras/client_support.cc b/src/ras/client_support.cc
new file mode 100644
index 0000000000..414a1ed94f
--- /dev/null
+++ b/src/ras/client_support.cc
@@ -0,0 +1,1755 @@
+/*************************************************************************
+ * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NDEBUG // Comment out during development only!
+#include <cassert>
+#include <cstdarg>
+#include <cstddef>
+
+#include "alloc.h"
+#include "checks.h"
+#include "comm.h"
+#include "nccl.h"
+#include "utils.h"
+#include "ras_internal.h"
+
+// Outlier count above which we don't print individual details about each of them.
+#define RAS_CLIENT_DETAIL_THRESHOLD 10
+// Fraction of the count of the total above which we don't consider another set to be an outlier.
+#define RAS_CLIENT_OUTLIER_FRACTION 0.25
+// Fraction of the count of the total below which a set is considered to be an outlier.
+#define RAS_CLIENT_VERBOSE_OUTLIER_FRACTION 0.5
+
+#define STR2(v) #v
+#define STR(v) STR2(v)
+
+// The RAS client listening socket of this RAS thread (normally port 28028).
+int rasClientListeningSocket = -1;
+
+// Auxiliary structure used when processing the results.  Helps with statistics gathering and sorting.
+struct rasValCount {
+  uint64_t value; // The observed value.
+  int count; // The number of occurrences of this value in the results.
+  int firstIdx; // The index of the first occurrence of this value in the results.
+};
+
+// Used in rasAuxComm below (its status field).  Each value is a distinct bit so that they can be combined.
+typedef enum {
+  RAS_ACS_UNKNOWN = 1, // Set if a peer did not provide info about a given communicator.
+  RAS_ACS_INIT = 2,
+  RAS_ACS_RUNNING = 4,
+  RAS_ACS_FINALIZE = 8,
+  RAS_ACS_ABORT = 16
+} rasACStatus;
+
+// Used in rasAuxComm below (its errors field).  Bitmask values that can be combined; RAS_ACE_OK is the empty mask.
+typedef enum {
+  RAS_ACE_OK = 0, // No error bit set.
+  RAS_ACE_MISMATCH = 1,
+  RAS_ACE_ERROR = 2,
+  RAS_ACE_INCOMPLETE = 4
+} rasACError;
+
+// Auxiliary structure used when processing the results.  Helps with sorting and includes additional statistics
+// on the number of peers and nodes for a communicator.
+struct rasAuxComm {
+  struct rasCollComms::comm* comm; // The communicator entry these statistics belong to.
+  int nPeers;
+  int nNodes;
+  int ranksPerNodeMin; // NOTE(review): presumably the smallest per-node rank count of this communicator -- confirm.
+  int ranksPerNodeMax; // NOTE(review): presumably the largest per-node rank count of this communicator -- confirm.
+  unsigned int status; // Bitmask of rasACStatus values.
+  unsigned int errors; // Bitmask of rasACError values.
+  uint64_t firstCollOpCount; // collOpCount of the first rank, to compare against.
+};
+
+// Connected RAS clients.
+struct rasClient* rasClients;
+int nRasClients;
+
+// Minimum byte count to increment the output buffer size by if it's too small.
+#define RAS_OUT_INCREMENT 4096
+
+// Internal buffer for storing the formatted results.
+static char* rasOutBuffer = nullptr;
+static int nRasOutBuffer = 0; // Does _not_ include the terminating '\0' (which _is_ present in the buffer).
+static int rasOutBufferSize = 0;
+
+// We use them all over the place; no point in wasting the stack...
+static char lineBuf[1024]; // Temporary buffer used for printing at most 10 (RAS_CLIENT_DETAIL_THRESHOLD) rank numbers
+                           // or for printing the local GPU devices, which can't be more than 64 (NCCL_MAX_LOCAL_RANKS)
+                           // small numbers (times two if the NVML mask is different than the CUDA mask).
+                           // Still, 1024 should normally be plenty (verbose output may make things more difficult,
+                           // but we do check for overflows, so it will just be trimmed).
+
+static ncclResult_t getNewClientEntry(struct rasClient** pClient);
+static void rasClientEnqueueMsg(struct rasClient* client, char* msg, size_t msgLen);
+static void rasClientTerminate(struct rasClient* client);
+
+static ncclResult_t rasClientRun(struct rasClient* client);
+static ncclResult_t rasClientRunInit(struct rasClient* client);
+static ncclResult_t rasClientRunConns(struct rasClient* client);
+static ncclResult_t rasClientRunComms(struct rasClient* client);
+static void rasClientBreakDownErrors(struct rasClient* client, struct rasCollComms::comm* comm,
+                                     const int* peerIdxConv, int ncclErrors[ncclNumResults], bool isAsync = false);
+
+static void rasOutAppend(const char* format, ...) __attribute__ ((format(printf, 1, 2)));
+static void rasOutExtract(char* buffer);
+static int rasOutLength();
+static void rasOutReset();
+
+static int rasPeersNGpuCompare(const void* e1, const void* e2);
+static int rasPeersNProcsCompare(const void* e1, const void* e2);
+static int rasPeersHostPidCompare(const void* e1, const void* e2);
+static int ncclSocketsHostCompare(const void* p1, const void* p2);
+static int rasValCountsCompareRev(const void* p1, const void* p2);
+static int rasAuxCommsCompareRev(const void* p1, const void* p2);
+static int rasCommRanksPeerCompare(const void* p1, const void* p2);
+static int rasCommRanksCollOpCompare(const void* p1, const void* p2);
+
+static const char* rasCommRankGpuToString(const struct rasCollComms::comm::rank* rank, char* buf, size_t size);
+static const char* ncclErrorToString(ncclResult_t err);
+static const char* ncclSocketToHost(const union ncclSocketAddress* addr, char* buf, size_t size);
+static bool rasCountIsOutlier(int count, bool verbose, int totalCount = -1);
+
+
+///////////////////////////////////
+// General rasClients functions. //
+///////////////////////////////////
+
+// Creates a listening socket for clients to connect to (localhost:NCCL_RAS_CLIENT_PORT unless NCCL_RAS_ADDR is set).
+ncclResult_t rasClientInitSocket() {
+  ncclResult_t ret = ncclSuccess;
+  const char* clientAddr = "localhost:" STR(NCCL_RAS_CLIENT_PORT);
+  union ncclSocketAddress addr;
+  const int opt = 1; // Option value handed to the setsockopt calls below.
+  if (const char* env = ncclGetEnv("NCCL_RAS_ADDR"))
+    clientAddr = env; // The environment variable replaces the default address.
+  NCCLCHECKGOTO(ncclSocketGetAddrFromString(&addr, clientAddr), ret, fail);
+  SYSCHECKGOTO(rasClientListeningSocket = socket(addr.sa.sa_family, SOCK_STREAM, 0), "socket", ret, fail);
+  SYSCHECKGOTO(setsockopt(rasClientListeningSocket, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)),
+               "setsockopt", ret, fail);
+#if defined(SO_REUSEPORT)
+  SYSCHECKGOTO(setsockopt(rasClientListeningSocket, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)),
+               "setsockopt", ret, fail);
+#endif
+  SYSCHECKGOTO(bind(rasClientListeningSocket, &addr.sa, (addr.sa.sa_family == AF_INET ? sizeof(struct sockaddr_in) :
+                                                          sizeof(struct sockaddr_in6))), "bind", ret, fail);
+  SYSCHECKGOTO(listen(rasClientListeningSocket, 16384), "listen", ret, fail);
+  INFO(NCCL_INIT|NCCL_RAS, "RAS client listening socket at %s", ncclSocketToString(&addr, rasLine));
+exit:
+  return ret;
+fail: // Log the failure and close the socket if it was already created; ret carries the error to the caller.
+  INFO(NCCL_INIT|NCCL_RAS, "RAS failed to establish a client listening socket at %s", clientAddr);
+  if (rasClientListeningSocket != -1) {
+    (void)close(rasClientListeningSocket);
+    rasClientListeningSocket = -1;
+  }
+  goto exit;
+}
+
+// Accepts a new RAS client connection.  The acceptance process may need to continue in the main event loop.
+ncclResult_t rasClientAcceptNewSocket() {
+  ncclResult_t ret = ncclSuccess;
+  struct rasClient* client = nullptr;
+  union ncclSocketAddress addr;
+  socklen_t addrlen = sizeof(addr);
+  int flags;
+
+  NCCLCHECKGOTO(getNewClientEntry(&client), ret, fail);
+  SYSCHECKGOTO(client->sock = accept(rasClientListeningSocket, (struct sockaddr*)&addr, &addrlen), "accept", ret, fail);
+  SYSCHECKGOTO(flags = fcntl(client->sock, F_GETFL), "fcntl", ret, fail); // Switch the new socket to non-blocking.
+  SYSCHECKGOTO(fcntl(client->sock, F_SETFL, flags | O_NONBLOCK), "fcntl", ret, fail);
+
+  NCCLCHECKGOTO(rasGetNewPollEntry(&client->pfd), ret, fail);
+  rasPfds[client->pfd].fd = client->sock;
+  rasPfds[client->pfd].events = POLLIN;
+  client->status = RAS_CLIENT_CONNECTED; // Only now does the entry count as being in use.
+exit:
+  return ret;
+fail:
+  if (client && client->sock != -1) {
+    (void)close(client->sock);
+    client->sock = -1; // Don't leave the number of a closed descriptor behind in the client entry.
+  }
+  goto exit;
+}
+
+// Hands back, through pClient, the first unused (RAS_CLIENT_CLOSED) entry of the rasClients array, enlarging the
+// array by RAS_INCREMENT entries if none is available.  The entry is reinitialized before being returned.
+static ncclResult_t getNewClientEntry(struct rasClient** pClient) {
+  int slot = 0;
+  while (slot < nRasClients && rasClients[slot].status != RAS_CLIENT_CLOSED)
+    slot++;
+  if (slot == nRasClients) {
+    NCCLCHECK(ncclRealloc(&rasClients, nRasClients, nRasClients+RAS_INCREMENT));
+    nRasClients += RAS_INCREMENT;
+  }
+
+  struct rasClient* entry = &rasClients[slot];
+  memset(entry, '\0', sizeof(*entry));
+  entry->pfd = -1;
+  entry->sock = -1;
+  ncclIntruQueueConstruct(&entry->sendQ);
+  entry->timeout = RAS_COLLECTIVE_LEG_TIMEOUT;
+  entry->collIdx = -1;
+
+  *pClient = entry;
+  return ncclSuccess;
+}
+
+// Allocates a message of the desired length (msgLen bytes) for sending and stores its address in *msg.
+// Behind the scenes uses rasMsgAlloc (msg is passed off as a struct rasMsg**); its result code is returned.
+// Must use rasClientFreeMsg to free.
+static ncclResult_t rasClientAllocMsg(char** msg, size_t msgLen) {
+  return rasMsgAlloc((struct rasMsg**)msg, msgLen);
+}
+
+// To be used only with messages allocated with rasClientAllocMsg, i.e., for messages meant for sending.
+static void rasClientFreeMsg(char* msg) { // Releases the message through rasMsgFree.
+  rasMsgFree((struct rasMsg*)msg);
+}
+
+// Queues msg (msgLen bytes; it *must* come from rasClientAllocMsg) for sending to a RAS client and requests POLLOUT.
+static void rasClientEnqueueMsg(struct rasClient* client, char* msg, size_t msgLen) {
+  // The payload is a member of a rasMsgMeta; step back from it to the start of the enclosing structure.
+  struct rasMsgMeta* hdr = (struct rasMsgMeta*)(msg - offsetof(struct rasMsgMeta, msg));
+  hdr->length = msgLen;
+  hdr->offset = 0;
+  ncclIntruQueueEnqueue(&client->sendQ, hdr);
+  assert(client->status != RAS_CLIENT_CLOSED && client->status < RAS_CLIENT_FINISHED);
+  rasPfds[client->pfd].events |= POLLOUT;
+}
+
+// Terminates a connection with a RAS client: closes its socket, releases its poll entry, discards unsent messages.
+static void rasClientTerminate(struct rasClient* client) {
+  auto& pollEntry = rasPfds[client->pfd];
+  pollEntry.fd = -1;
+  pollEntry.events = pollEntry.revents = 0;
+  client->pfd = -1;
+  (void)close(client->sock);
+  client->sock = -1;
+  client->status = RAS_CLIENT_CLOSED;
+  for (struct rasMsgMeta* meta; (meta = ncclIntruQueueTryDequeue(&client->sendQ)) != nullptr;)
+    free(meta);
+}
+
+
+//////////////////////////////////////////////////////////////////////
+// Functions related to the asynchronous operations of RAS clients. //
+//////////////////////////////////////////////////////////////////////
+
+// Invoked when an asynchronous operation that a client was waiting on completes.  Locates the open client
+// whose collIdx refers to that collective and reinvokes rasClientRun for it.
+ncclResult_t rasClientResume(struct rasCollective* coll) {
+  const int collIdx = coll-rasCollectives;
+  struct rasClient* waiter = nullptr;
+  for (int i = 0; i < nRasClients && waiter == nullptr; i++) {
+    struct rasClient* candidate = rasClients+i;
+    if (candidate->status != RAS_CLIENT_CLOSED && candidate->collIdx == collIdx)
+      waiter = candidate;
+  }
+
+  if (waiter == nullptr) {
+    // No open client refers to this collective; pass it to rasCollFree and still report success.
+    INFO(NCCL_RAS, "RAS failed to find a matching client!");
+    rasCollFree(coll);
+    return ncclSuccess;
+  }
+
+  // A failure of rasClientRun is returned to the caller by NCCLCHECK.
+  NCCLCHECK(rasClientRun(waiter));
+  return ncclSuccess;
+}
+
+// Handles a ready client FD from the main event loop: parses newline-terminated commands, flushes the send queue.
+void rasClientEventLoop(int clientIdx, int pollIdx) {
+  struct rasClient* client = rasClients+clientIdx;
+  bool closed = false;
+
+  if (client->status == RAS_CLIENT_CONNECTED) {
+    char* cmd;
+    char* cmdEnd;
+    if (rasPfds[pollIdx].revents & POLLIN) {
+      if (client->recvOffset < sizeof(client->recvBuffer)) {
+        ssize_t nRecv;
+        nRecv = recv(client->sock, client->recvBuffer+client->recvOffset,
+                     sizeof(client->recvBuffer) - client->recvOffset, MSG_DONTWAIT);
+        if (nRecv == 0) {
+          closed = true;
+        } else if (nRecv == -1) {
+          if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
+            if (errno == ECONNRESET)
+              INFO(NCCL_RAS, "RAS socket closed by the client on receive; terminating it");
+            else
+              INFO(NCCL_RAS, "RAS unexpected error from recv; terminating the client socket");
+            closed = true;
+          }
+        } else { // nRecv > 0
+          client->recvOffset += nRecv;
+        }
+      } else { // client->recvOffset == sizeof(client->recvBuffer)
+        rasPfds[client->pfd].events &= ~POLLIN; // No room to receive for now.
+      }
+    } // if (rasPfds[pollIdx].revents & POLLIN)
+    if (closed) {
+      rasClientTerminate(client);
+      return;
+    }
+    cmd = client->recvBuffer;
+    while ((cmdEnd = (char*)memchr(cmd, '\n', client->recvOffset - (cmd-client->recvBuffer))) != nullptr) {
+      char* msg;
+      int msgLen;
+      *cmdEnd = '\0'; // Replaces '\n'.
+      if (cmdEnd > cmd && cmdEnd[-1] == '\r')
+        cmdEnd[-1] = '\0'; // Replaces '\r' (e.g., in case of a telnet connection).
+
+      if (strncasecmp(cmd, "client protocol ", strlen("client protocol ")) == 0) {
+        // We ignore the protocol version for now; we just send our version back.
+        snprintf(rasLine, sizeof(rasLine), "SERVER PROTOCOL " STR(NCCL_RAS_CLIENT_PROTOCOL) "\n");
+        msgLen = strlen(rasLine);
+        if (rasClientAllocMsg(&msg, msgLen) != ncclSuccess) {
+          rasClientTerminate(client);
+          return;
+        }
+        // We don't copy the terminating '\0', hence memcpy rather than strcpy.
+        memcpy(msg, rasLine, msgLen);
+        rasClientEnqueueMsg(client, msg, msgLen);
+      } else if (strncasecmp(cmd, "timeout ", strlen("timeout ")) == 0) {
+        char* endPtr = nullptr;
+        int timeout = strtol(cmd+strlen("timeout "), &endPtr, 10);
+        if (timeout < 0 || endPtr == cmd+strlen("timeout ") || *endPtr != '\0') { // Also rejects an empty value.
+          snprintf(rasLine, sizeof(rasLine), "ERROR: Invalid timeout value %s\n", cmd+strlen("timeout "));
+        } else {
+          client->timeout = timeout * CLOCK_UNITS_PER_SEC;
+          strcpy(rasLine, "OK\n");
+        }
+        msgLen = strlen(rasLine);
+        if (rasClientAllocMsg(&msg, msgLen) != ncclSuccess) {
+          rasClientTerminate(client);
+          return;
+        }
+        // We don't copy the terminating '\0', hence memcpy rather than strcpy.
+        memcpy(msg, rasLine, msgLen);
+        rasClientEnqueueMsg(client, msg, msgLen);
+      } else if (strcasecmp(cmd, "status") == 0) {
+        client->status = RAS_CLIENT_INIT;
+        (void)rasClientRun(client);
+      } else if (strcasecmp(cmd, "verbose status") == 0) {
+        client->status = RAS_CLIENT_INIT;
+        client->verbose = 1;
+        (void)rasClientRun(client);
+      } else {
+        snprintf(rasLine, sizeof(rasLine), "ERROR: Unknown command %s\n", cmd);
+        msgLen = strlen(rasLine);
+        // Losing this reply is non-fatal; returning here would leave consumed commands in recvBuffer to be re-parsed.
+        if (rasClientAllocMsg(&msg, msgLen) == ncclSuccess) {
+          memcpy(msg, rasLine, msgLen); // No terminating '\0' is copied, hence memcpy rather than strcpy.
+          rasClientEnqueueMsg(client, msg, msgLen);
+        }
+      }
+
+      cmd = cmdEnd+1;
+    } // while newline found
+
+    if (cmd == client->recvBuffer) {
+      if (client->recvOffset == sizeof(client->recvBuffer)) {
+        // We didn't find any newlines and the buffer is full.
+        INFO(NCCL_RAS, "RAS excessively long input line; terminating the client socket");
+        rasClientTerminate(client);
+        return;
+      }
+      // Otherwise it's an incomplete command; we need to wait for the rest of it.
+    } else { // cmd > client->recvBuffer
+      // Shift whatever remains (if anything) to the beginning of the buffer.
+      memmove(client->recvBuffer, cmd, client->recvOffset - (cmd-client->recvBuffer));
+      client->recvOffset -= cmd-client->recvBuffer;
+    }
+  } // if (client->status == RAS_CLIENT_CONNECTED)
+
+  if (rasPfds[pollIdx].revents & POLLOUT) {
+    struct rasMsgMeta* meta;
+    while ((meta = ncclIntruQueueHead(&client->sendQ)) != nullptr) {
+      ssize_t nSend;
+      nSend = send(client->sock, ((char*)&meta->msg)+meta->offset, meta->length-meta->offset,
+                   MSG_DONTWAIT | MSG_NOSIGNAL);
+      if (nSend < 1) {
+        if (nSend == -1 && errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
+          if (errno == EPIPE)
+            INFO(NCCL_RAS, "RAS socket closed by the client on send; terminating it");
+          else
+            INFO(NCCL_RAS, "RAS unexpected error from send; terminating the client socket");
+          closed = true;
+        }
+        break;
+      }
+
+      meta->offset += nSend;
+      if (meta->offset < meta->length)
+        break;
+
+      ncclIntruQueueDequeue(&client->sendQ);
+      free(meta);
+    } // while (meta)
+
+    if (closed) {
+      rasClientTerminate(client);
+      return;
+    }
+
+    if (!meta) {
+      rasPfds[client->pfd].events &= ~POLLOUT; // Nothing more to send for now.
+      if (client->status == RAS_CLIENT_FINISHED)
+        rasClientTerminate(client);
+    }
+  } // if (rasPfds[pollIdx].revents & POLLOUT)
+}
+
+
+//////////////////////////////////////////////////////////
+// Functions driving data gathering for the RAS client. //
+//////////////////////////////////////////////////////////
+
+// Main function that drives the whole data gathering process and sends it back to the client.
+// There are multiple asynchronous aspects of it (getting the data on connections and on communicators), so the
+// function may exit early and needs to be reinvoked when the asynchronous responses arrive or the timeout expires.
+// The state tracking the progress of such operations is kept in the rasClient.
+static ncclResult_t rasClientRun(struct rasClient* client) {
+  ncclResult_t ret = ncclSuccess;
+
+  switch (client->status) { // client->status records the stage to (re)start from.
+    case RAS_CLIENT_INIT:
+      NCCLCHECKGOTO(rasClientRunInit(client), ret, exit);
+#if 0 // Commented out for now to focus the summary status report on the information most relevant to the users.
+      // To be revisited with future extensions to RAS.
+      client->status = RAS_CLIENT_CONNS;
+      if (ret == ncclInProgress) {
+        ret = ncclSuccess;
+        break;
+      }
+    case RAS_CLIENT_CONNS:
+      assert(client->collIdx != -1);
+      NCCLCHECKGOTO(rasClientRunConns(client), ret, exit);
+#endif
+      client->status = RAS_CLIENT_COMMS;
+      if (ret == ncclInProgress) { // Not finished yet: report success and resume from the COMMS stage when reinvoked.
+        ret = ncclSuccess;
+        break;
+      }
+    case RAS_CLIENT_COMMS: // Also reached by falling through from RAS_CLIENT_INIT above.
+      assert(client->collIdx != -1);
+      NCCLCHECKGOTO(rasClientRunComms(client), ret, exit);
+      client->status = RAS_CLIENT_FINISHED;
+      break;
+    default:
+      WARN("Invalid client status %d", client->status);
+      ret = ncclInternalError;
+      goto exit;
+  }
+exit:
+  return ret;
+}
+
+// Sends to the client the initial data that can be obtained locally -- version info, stats on rasPeers,
+// dump of rasDeadPeers.  Initiates the RAS_COLL_CONNS collective operation.
+static ncclResult_t rasClientRunInit(struct rasClient* client) {
+  ncclResult_t ret = ncclSuccess;
+  char* msg = nullptr;
+  int msgLen;
+  struct rasPeerInfo* peersReSorted = nullptr;
+  int totalGpus, totalNodes, firstNGpusNode, firstNGpusGlobal, firstNPeersGlobal;
+  bool consistentNGpusNode, consistentNGpusGlobal, consistentNPeersGlobal;
+  int firstIdx, nPeers;
+  struct rasValCount valCounts[NCCL_MAX_LOCAL_RANKS];
+  int nValCounts;
+  static int cudaDriver = -1, cudaRuntime = -1;
+
+  rasOutReset();
+  rasOutAppend("NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX
+               " compiled with CUDA " STR(CUDA_MAJOR) "." STR(CUDA_MINOR) "\n");
+  if (cudaRuntime == -1)
+    cudaRuntimeGetVersion(&cudaRuntime);
+  if (cudaDriver == -1)
+    cudaDriverGetVersion(&cudaDriver);
+  rasOutAppend("CUDA runtime version %d, driver version %d\n\n", cudaRuntime, cudaDriver);
+  msgLen = rasOutLength();
+  NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail);
+  rasOutExtract(msg);
+  rasClientEnqueueMsg(client, msg, msgLen);
+  msg = nullptr;
+
+  rasOutReset();
+  totalGpus = totalNodes = 0;
+  firstNGpusNode = 0; // #GPUs on the first peer of a node.
+  firstNGpusGlobal = 0; // #GPUs on peerIdx 0.
+  consistentNGpusNode = true; // Whether #GPUs/peer is consistent between the peers *on any one node*.
+  consistentNGpusGlobal = true; // Whether #GPUs/peer is consistent between the peers *on all nodes*.
+  consistentNPeersGlobal = true; // Whether #peers/node is consistent between all nodes.
+  nPeers = 0; // #peers on a node.
+  firstNPeersGlobal = 0;
+  for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) {
+    int nGpus = __builtin_popcountll(rasPeers[peerIdx].cudaDevs);
+    totalGpus += nGpus;
+    if (peerIdx == 0) {
+      totalNodes = 1;
+      nPeers = 1;
+      firstNGpusGlobal = firstNGpusNode = nGpus;
+    } else { // peerIdx > 0
+      if (nGpus != firstNGpusGlobal)
+        consistentNGpusGlobal = false;
+      if (!ncclSocketsSameNode(&rasPeers[peerIdx].addr, &rasPeers[peerIdx-1].addr)) {
+        totalNodes++;
+        if (firstNPeersGlobal == 0)
+          firstNPeersGlobal = nPeers;
+        else if (nPeers != firstNPeersGlobal)
+          consistentNPeersGlobal = false;
+        nPeers = 1;
+        firstNGpusNode = nGpus;
+      } else { // Same node.
+        if (nGpus != firstNGpusNode)
+          consistentNGpusNode = false;
+        nPeers++;
+      } // Same node
+    } // peerIdx > 0
+    if (peerIdx == nRasPeers-1) {
+      if (firstNPeersGlobal == 0)
+        firstNPeersGlobal = nPeers;
+      else if (nPeers != firstNPeersGlobal)
+        consistentNPeersGlobal = false;
+    }
+  } // for (peerIdx)
+
+  rasOutAppend("Job summary\n"
+               "===========\n\n");
+
+  if (consistentNGpusNode && consistentNGpusGlobal && consistentNPeersGlobal) {
+    rasOutAppend("  Nodes  Processes         GPUs  Processes     GPUs\n"
+                 "(total)   per node  per process    (total)  (total)\n"
+                 "%7d"  "  %9d"    "  %11d"     "  %9d"    "  %7d\n",
+                 totalNodes, firstNPeersGlobal, firstNGpusGlobal, nRasPeers, totalGpus);
+  } else {
+    // Gather the stats on the number of processes per node.  However, that number is not a property of a peer,
+    // but of a group of peers, so calculating it is more involved.  We make a copy of rasPeers and creatively
+    // misuse it: cudaDevs of each element will be repurposed to store the number of processes on the node.
+    NCCLCHECKGOTO(ncclCalloc(&peersReSorted, nRasPeers), ret, fail);
+    memcpy(peersReSorted, rasPeers, nRasPeers * sizeof(*peersReSorted));
+
+    firstIdx = 0;
+    nPeers = 0;
+    for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) {
+      if (peerIdx == 0) {
+        nPeers = 1;
+        firstIdx = 0;
+      } else { // peerIdx > 0
+        if (!ncclSocketsSameNode(&peersReSorted[peerIdx].addr, &peersReSorted[peerIdx-1].addr)) {
+          for (int i = firstIdx; i < peerIdx; i++) {
+            // Go back and update the number of processes of all the elements of that node.
+            peersReSorted[i].cudaDevs = nPeers;
+          }
+          nPeers = 1;
+          firstIdx = peerIdx;
+        } else {
+          nPeers++;
+        }
+      } // peerIdx > 0
+      if (peerIdx == nRasPeers-1) {
+        // Last iteration of the loop.
+        for (int i = firstIdx; i < nRasPeers; i++) {
+          peersReSorted[i].cudaDevs = nPeers;
+        }
+      }
+    } // for (peerIdx)
+
+    // Re-sort it now using the number of processes on the node (cudaDevs) as the primary key, host IP as the
+    // secondary, and process id as the tertiary.
+    qsort(peersReSorted, nRasPeers, sizeof(*peersReSorted), rasPeersNProcsCompare);
+
+    // Calculate the distribution of different numbers of peers per node.
+    nValCounts = 0;
+    for (int peerIdx = 0; peerIdx < nRasPeers;) {
+      if (peerIdx == 0 || peersReSorted[peerIdx].cudaDevs != peersReSorted[peerIdx-1].cudaDevs) {
+        valCounts[nValCounts].value = peersReSorted[peerIdx].cudaDevs;
+        valCounts[nValCounts].count = 1;
+        valCounts[nValCounts].firstIdx = peerIdx;
+        nValCounts++;
+      } else {
+        valCounts[nValCounts-1].count++;
+      }
+      // Advance peerIdx to the next node.
+      peerIdx += peersReSorted[peerIdx].cudaDevs;
+    }
+    // valCounts is currently sorted by value (the number of peers per node).  Sort it by the count (most frequent
+    // number of peers first).
+    qsort(valCounts, nValCounts, sizeof(*valCounts), rasValCountsCompareRev);
+
+    // Print it out, the most frequent peer counts first.
+    if (consistentNGpusNode && consistentNGpusGlobal) {
+      rasOutAppend("  Nodes  Processes         GPUs\n"
+                   "          per node  per process\n");
+      for (int i = 0; i < nValCounts; i++) {
+        struct rasValCount* vc = valCounts+i;
+        rasOutAppend("%7d  %9ld  %11d\n",
+                     vc->count, vc->value, firstNGpusGlobal);
+      }
+    } else {
+      rasOutAppend("  Nodes  Processes\n"
+                   "          per node\n");
+      for (int i = 0; i < nValCounts; i++) {
+        struct rasValCount* vc = valCounts+i;
+        rasOutAppend("%7d  %9ld\n",
+                     vc->count, vc->value);
+      }
+
+      // We calculate and print the GPUs/process separately.  This is required for !consistentNGpusNode and
+      // it also makes our life easier above for !consistentNGpusGlobal (which could require a larger valCounts).
+
+      // Sort peers by the GPU count, to simplify data extraction.
+      memcpy(peersReSorted, rasPeers, nRasPeers * sizeof(*peersReSorted));
+      // GPU count is the primary key, host IP is the secondary, and process id is the tertiary.
+      qsort(peersReSorted, nRasPeers, sizeof(*peersReSorted), rasPeersNGpuCompare);
+
+      // Calculate the distribution of different numbers of GPUs per peer.
+      nValCounts = 0;
+      for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) {
+        if (peerIdx == 0 || __builtin_popcountll(peersReSorted[peerIdx].cudaDevs) !=
+                            __builtin_popcountll(peersReSorted[peerIdx-1].cudaDevs)) {
+          valCounts[nValCounts].value = __builtin_popcountll(peersReSorted[peerIdx].cudaDevs);
+          valCounts[nValCounts].count = 1;
+          valCounts[nValCounts].firstIdx = peerIdx;
+          nValCounts++;
+        } else {
+          valCounts[nValCounts-1].count++;
+        }
+      }
+      // valCounts is currently sorted by value (number of GPUs per peer).  Sort it by the count (most frequent
+      // GPU counts first).
+      qsort(valCounts, nValCounts, sizeof(*valCounts), rasValCountsCompareRev);
+
+      // Print it out, the most frequent GPU counts first.
+      rasOutAppend("\n"
+                   "         Processes         GPUs\n"
+                   "                    per process\n");
+      for (int i = 0; i < nValCounts; i++) {
+        struct rasValCount* vc = valCounts+i;
+        rasOutAppend("         %9d  %11ld\n",
+                     vc->count, vc->value);
+      }
+    }
+    rasOutAppend("\n"
+                 "  Nodes  Processes         GPUs\n"
+                 "(total)    (total)      (total)\n"
+                 "%7d"  "  %9d"    "  %11d\n",
+                 totalNodes, nRasPeers, totalGpus);
+
+    if (consistentNGpusNode && consistentNGpusGlobal) {
+      // In this simpler case, also print the node outliers.
+      for (int i = 1; i < nValCounts; i++) {
+        struct rasValCount* vc = valCounts+i;
+        // We assume that the most frequent group is correct; for the remaining ones, we try to provide more info,
+        // provided that they meet our definition of an outlier.
+        if (rasCountIsOutlier(vc->count, client->verbose, totalNodes)) {
+          rasOutAppend("\nThe outlier node%s:\n", (vc->count > 1 ? "s" : ""));
+          // peersReSorted is sorted by the node IP address (not port!) as the secondary key and the pid as
+          // the tertiary, which comes in handy when printing...
+          for (int peerIdx = vc->firstIdx; peerIdx < vc->count*vc->value + vc->firstIdx; peerIdx += vc->value) {
+            lineBuf[0] = '\0';
+            for (int j = 0; j < vc->value; j++) {
+              snprintf(lineBuf+strlen(lineBuf), sizeof(lineBuf)-strlen(lineBuf), "%s%d",
+                       (j > 0 ? "," : ""), peersReSorted[j].pid);
+            }
+            rasOutAppend("  Node %s running process%s %s\n",
+                         ncclSocketToHost(&peersReSorted[peerIdx].addr, rasLine, sizeof(rasLine)),
+                         (vc->value > 1 ? "es" : ""), lineBuf);
+          } // for (peerIdx)
+        } // if (rasCountIsOutlier(vc->count))
+      } // for (i)
+    } // !consistentNPeersGlobal
+  } // !consistentNGpusNode || !consistentNGpusGlobal || !consistentNPeersGlobal
+
+#if 0 // Commented out for now to focus the summary status report on the information most relevant to the users.
+      // To be revisited with future extensions to RAS.
+  rasOutAppend("\nGathering data about the RAS network (timeout %lds)...", client->timeout / CLOCK_UNITS_PER_SEC);
+  msgLen = rasOutLength();
+  NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail);
+  rasOutExtract(msg);
+  rasClientEnqueueMsg(client, msg, msgLen);
+  msg = nullptr;
+  {
+    struct rasCollRequest collReq;
+    bool allDone = false;
+    rasCollReqInit(&collReq);
+    collReq.timeout = client->timeout;
+    collReq.type = RAS_COLL_CONNS;
+    NCCLCHECKGOTO(rasNetSendCollReq(&collReq, rasCollDataLength(RAS_COLL_CONNS), &allDone, &client->collIdx),
+                  ret, fail);
+    if (!allDone)
+      ret = ncclInProgress; // We need to wait for async. responses.
+  }
+#endif
+  rasOutAppend("\nCommunicators...");
+  msgLen = rasOutLength();
+  NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail);
+  rasOutExtract(msg);
+  rasClientEnqueueMsg(client, msg, msgLen);
+  msg = nullptr;
+  {
+    struct rasCollRequest collReq;
+    bool allDone = false;
+    rasCollReqInit(&collReq);
+    collReq.timeout = client->timeout;
+    collReq.type = RAS_COLL_COMMS;
+    NCCLCHECKGOTO(rasNetSendCollReq(&collReq, rasCollDataLength(RAS_COLL_COMMS), &allDone, &client->collIdx),
+                  ret, fail);
+    if (!allDone)
+      ret = ncclInProgress;
+  }
+exit:
+  free(peersReSorted);
+  return ret;
+fail:
+  goto exit;
+}
+
+#if 0 // Commented out for now to focus the summary status report on the information most relevant to the users.
+      // To be revisited with future extensions to RAS.
+// Processes the response from the RAS_COLL_CONNS collective operation and sends the data to the client (for now
+// primarily the list of missing processes).  Initiates the RAS_COLL_COMMS collective operation.
+// Returns ncclInProgress if that new collective still awaits async. responses; errors propagate via NCCLCHECKGOTO.
+static ncclResult_t rasClientRunConns(struct rasClient* client) {
+  ncclResult_t ret = ncclSuccess;
+  char* msg = nullptr;
+  int msgLen;
+  struct rasCollective* coll = rasCollectives+client->collIdx;
+  struct rasCollConns* connsData = (struct rasCollConns*)coll->data;
+  int expected;
+  struct rasPeerInfo* peersBuf = nullptr;
+
+  assert(coll->nFwdSent == coll->nFwdRecv);
+  client->collIdx = -1;
+
+  rasOutReset();
+  rasOutAppend(" obtained a result in %.2fs\n", (clockNano()-coll->startTime)/1e9);
+  if (coll->nLegTimeouts > 0) {
+    rasOutAppend(" Warning: encountered %d communication timeout%s while gathering data\n", coll->nLegTimeouts,
+                 (coll->nLegTimeouts > 1 ? "s" : ""));
+  }
+
+  expected = nRasPeers - nRasDeadPeers;
+  if (coll->nPeers != expected) {
+    int missing = expected - coll->nPeers;
+    rasOutAppend(" Warning: missing data from %d process%s (received from %d, expected %d)\n",
+                 missing, (missing > 1 ? "es" : ""), coll->nPeers, expected);
+    if (missing > 0 && missing <= RAS_CLIENT_DETAIL_THRESHOLD) {
+      // Extract a list of missing peers (a non-positive count leaves nothing to list and must not reach ncclCalloc).
+      // We don't print it right away because it would be sorted by address (including port, meaningless to end users).
+      int nPeersBuf = 0;
+      NCCLCHECKGOTO(ncclCalloc(&peersBuf, missing), ret, fail);
+      // Ensure both arrays are sorted (rasPeers already is, by addr); makes finding missing records a breeze.
+      qsort(coll->peers, coll->nPeers, sizeof(*coll->peers), &ncclSocketsCompare);
+      for (int rasPeerIdx = 0, collPeerIdx = 0; rasPeerIdx < nRasPeers || collPeerIdx < coll->nPeers;) {
+        int cmp = (rasPeerIdx >= nRasPeers ? 1 : (collPeerIdx >= coll->nPeers ? -1 :
+                   ncclSocketsCompare(&rasPeers[rasPeerIdx].addr, coll->peers+collPeerIdx)));
+        if (cmp == 0) {
+          rasPeerIdx++;
+          collPeerIdx++;
+        } else if (cmp < 0) {
+          // Process missing from coll->peers.  Dead peers are excluded from "missing" (see expected above), and
+          // peersBuf holds only that many records, so skip them and never write past the end of the buffer.
+          if (!rasPeerIsDead(&rasPeers[rasPeerIdx].addr) && nPeersBuf < missing)
+            memcpy(peersBuf+(nPeersBuf++), rasPeers+rasPeerIdx, sizeof(*peersBuf));
+          rasPeerIdx++;
+        } else { // cmp > 0
+          // Process not found in rasPeers -- shouldn't happen.
+          collPeerIdx++;
+        } // cmp > 0
+      } // for (rasPeerIdx, collPeerIdx)
+
+      // Sort the output by host and pid, not host and port.
+      qsort(peersBuf, nPeersBuf, sizeof(*peersBuf), rasPeersHostPidCompare);
+      rasOutAppend("  The missing process%s:\n", (missing > 1 ? "es" : ""));
+      for (int peerIdx = 0; peerIdx < nPeersBuf; peerIdx++) {
+        rasOutAppend("  Process %d on node %s managing GPU%s %s\n", peersBuf[peerIdx].pid,
+                     ncclSocketToHost(&peersBuf[peerIdx].addr, rasLine, sizeof(rasLine)),
+                     (__builtin_popcountll(peersBuf[peerIdx].cudaDevs) > 1 ? "s" : ""),
+                     rasGpuDevsToString(peersBuf[peerIdx].cudaDevs, peersBuf[peerIdx].nvmlDevs, lineBuf,
+                                        sizeof(lineBuf)));
+      }
+      if (nPeersBuf != missing)
+        rasOutAppend("  [could not find information on %d process%s]\n",
+                     missing-nPeersBuf, (missing-nPeersBuf > 1 ? "es" : ""));
+    } // if (missing > 0 && missing <= RAS_CLIENT_DETAIL_THRESHOLD)
+  } // if (coll->nPeers != expected)
+
+  if (connsData->nConns > 0) {
+    rasOutAppend(" Collected data about %d unidirectional connection%s\n",
+                 connsData->nConns, (connsData->nConns > 1 ? "s" : ""));
+    rasOutAppend(" Travel times (valid only if system clocks are synchronized between nodes):\n"
+                 "  Minimum %fs, maximum %fs, average %fs\n",
+                 connsData->travelTimeMin/1e9, connsData->travelTimeMax/1e9,
+                 connsData->travelTimeSum/(1e9*connsData->travelTimeCount));
+  } else {
+    rasOutAppend(" No connection data collected!\n");
+  }
+  if (connsData->nNegativeMins > 0) {
+    rasOutAppend(" Warning: negative travel times were observed across %d connection%s,\n"
+                 " indicating that the system clocks are *not* synchronized.\n"
+                 " Ordering of events based on local timestamps should be considered unreliable\n",
+                 connsData->nNegativeMins, (connsData->nNegativeMins > 1 ? "s" : ""));
+    if (connsData->nNegativeMins <= RAS_CLIENT_DETAIL_THRESHOLD) {
+      rasOutAppend("  The affected connection%s:\n", (connsData->nNegativeMins > 1 ? "s" : ""));
+      for (int i = 0; i < connsData->nNegativeMins; i++) {
+        struct rasCollConns::negativeMin* negativeMin = connsData->negativeMins+i;
+        int sourcePeerIdx = rasPeerFind(&negativeMin->source);
+        int destPeerIdx = rasPeerFind(&negativeMin->dest);
+        if (sourcePeerIdx != -1 && destPeerIdx != -1)
+          rasOutAppend("  From node %s process %d to node %s process %d: observed travel time of %fs\n",
+                       ncclSocketToHost(&negativeMin->source, rasLine, sizeof(rasLine)), rasPeers[sourcePeerIdx].pid,
+                       ncclSocketToHost(&negativeMin->dest, lineBuf, sizeof(lineBuf)), rasPeers[destPeerIdx].pid,
+                       negativeMin->travelTimeMin/1e9);
+      }
+    }
+  }
+  rasCollFree(coll);
+
+  rasOutAppend("\nGathering data about the NCCL communicators (timeout %lds)...",
+               client->timeout / CLOCK_UNITS_PER_SEC);
+  msgLen = rasOutLength();
+  NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail);
+  rasOutExtract(msg);
+  rasClientEnqueueMsg(client, msg, msgLen);
+  msg = nullptr;
+  {
+    struct rasCollRequest collReq;
+    bool allDone = false;
+    rasCollReqInit(&collReq);
+    collReq.timeout = client->timeout;
+    collReq.type = RAS_COLL_COMMS;
+    NCCLCHECKGOTO(rasNetSendCollReq(&collReq, rasCollDataLength(RAS_COLL_COMMS), &allDone, &client->collIdx),
+                  ret, fail);
+    if (!allDone)
+      ret = ncclInProgress;
+  }
+exit:
+  free(peersBuf);
+  return ret;
+fail:
+  goto exit;
+}
+#endif
+
+// Processes the response from the RAS_COLL_COMMS collective operation and sends the data to the client:
+// statistics on the communicators, missing data from ranks, inconsistent collective operation counts,
+// initialization and asynchronous errors, and inconsistent initialization/termination status.
+static ncclResult_t rasClientRunComms(struct rasClient* client) {
+  ncclResult_t ret = ncclSuccess;
+  char* msg = nullptr;
+  int msgLen;
+  struct rasCollective* coll = rasCollectives+client->collIdx;
+  struct rasCollComms* commsData = (struct rasCollComms*)coll->data;
+  struct rasCollComms::comm* comm;
+  struct rasCollComms::comm::rank* ranksReSorted = nullptr;
+  struct rasValCount* valCounts = nullptr;
+  int nValCounts;
+  struct rasValCount* collOpCounts = nullptr;
+  struct rasAuxComm* auxComms = nullptr;
+  int maxCommSize;
+  int* peerIdxConv = nullptr;
+  int vcIdx;
+  int nPeersMissing;
+  uint64_t* peerNvmlDevs = nullptr;
+  const char*const statusStr[] = { "UNKNOWN", "INIT", "RUNNING", "FINALIZE", "ABORT" };
+  const char*const errorStr[] = {
+    // Listing them all like this, while a bit of a hassle, is less effort than formatting in a temporary buffer.
+    "OK",
+    "MISMATCH",
+    "ERROR",
+    "ERROR,MISMATCH",
+    "INCOMPLETE",
+    "INCOMPLETE,MISMATCH",
+    "INCOMPLETE,ERROR",
+    "INCOMPLETE,ERROR,MISMATCH"
+  };
+
+  assert(coll->nFwdSent == coll->nFwdRecv);
+  client->collIdx = -1;
+
+  rasOutReset();
+  rasOutAppend(" (%.2fs)\n=============\n\n", (clockNano()-coll->startTime)/1e9);
+
+  // Calculate the number of missing peers early as we rely on it for other things.
+  nPeersMissing = nRasPeers - nRasDeadPeers - coll->nPeers;
+
+  // Sort the communicators by size.  As the structure is inconvenient to move around due to the elements being
+  // of variable length, we create an auxiliary array that includes pointers to individual elements and simply sort
+  // that array while keeping the data intact.
+  NCCLCHECKGOTO(ncclCalloc(&auxComms, commsData->nComms), ret, fail);
+  // While initializing the just allocated array, also find out the size of the largest communicator so that we know
+  // how much memory to allocate for another temporary array.
+  maxCommSize = 0;
+  comm = commsData->comms;
+  for (int commIdx = 0; commIdx < commsData->nComms; commIdx++) {
+    if (maxCommSize < comm->commNRanks)
+      maxCommSize = comm->commNRanks;
+    auxComms[commIdx].comm = comm;
+    comm = (struct rasCollComms::comm*)(((char*)(comm+1)) + comm->nRanks * sizeof(*comm->ranks));
+  }
+  NCCLCHECKGOTO(ncclCalloc(&ranksReSorted, maxCommSize), ret, fail);
+
+  // For convenience, create a translation table from rasCollective's peerIdx to rasPeers peerIdx.
+  NCCLCHECKGOTO(ncclCalloc(&peerIdxConv, coll->nPeers), ret, fail);
+  for (int peerIdx = 0; peerIdx < coll->nPeers; peerIdx++)
+    peerIdxConv[peerIdx] = rasPeerFind(coll->peers+peerIdx);
+  // Sort coll->peers to match the ordering of rasPeers -- we may need it later...
+  qsort(coll->peers, coll->nPeers, sizeof(*coll->peers), &ncclSocketsCompare);
+
+  // Fill in the remaining fields of auxComm's.
+  for (int commIdx = 0; commIdx < commsData->nComms; commIdx++) {
+    struct rasAuxComm* auxComm = auxComms+commIdx;
+    int nRanks = 0;
+    comm = auxComm->comm;
+
+    if (comm->commNRanks > comm->nRanks) {
+      // There are two possibilities here.  Either we are missing the data on some ranks because the processes are
+      // unreachable, or the processes _are_ reachable but didn't report to be part of this communicator (which
+      // could definitely happen if some processes have already called ncclCommDestroy or ncclCommAbort).  Because we
+      // currently don't collect data about missing ranks, we can't reliably distinguish these two cases.
+      // For now we rely on an approximation: if we _know_ that some peers failed to respond, we mark this
+      // as an INCOMPLETE error; otherwise as a MISMATCH warning.
+      if (nPeersMissing > 0 || nRasDeadPeers > 0)
+        auxComm->errors |= RAS_ACE_INCOMPLETE;
+      else {
+        auxComm->errors |= RAS_ACE_MISMATCH;
+        auxComm->status |= RAS_ACS_UNKNOWN;
+      }
+    }
+
+    memcpy(ranksReSorted, comm->ranks, comm->nRanks * sizeof(*ranksReSorted));
+    // Convert ranksReSorted' peerIdx to rasPeers and sort by it -- that way we will have the ranks sorted
+    // by process _and_ node, which makes counting easy.
+    for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++)
+      ranksReSorted[rankIdx].peerIdx = peerIdxConv[ranksReSorted[rankIdx].peerIdx];
+    qsort(ranksReSorted, comm->nRanks, sizeof(*ranksReSorted), rasCommRanksPeerCompare);
+
+    // Count the peers and nodes, get the status/error indicators.
+    for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) {
+      struct rasCollComms::comm::rank* rank = ranksReSorted+rankIdx;
+      if (rankIdx == 0) {
+        auxComm->nPeers = auxComm->nNodes = 1;
+        auxComm->ranksPerNodeMin = NCCL_MAX_LOCAL_RANKS;
+        auxComm->ranksPerNodeMax = 0;
+        auxComm->firstCollOpCount = rank->collOpCount;
+        nRanks = 1;
+      } else { // rankIdx > 0
+        if (rank->peerIdx != rank[-1].peerIdx) {
+          auxComm->nPeers++;
+          if (!ncclSocketsSameNode(&rasPeers[rank->peerIdx].addr, &rasPeers[rank[-1].peerIdx].addr)) {
+            auxComm->nNodes++;
+            if (auxComm->ranksPerNodeMin > nRanks)
+              auxComm->ranksPerNodeMin = nRanks;
+            if (auxComm->ranksPerNodeMax < nRanks)
+              auxComm->ranksPerNodeMax = nRanks;
+            nRanks = 0;
+          }
+        } // if (rank->peerIdx != rank[-1].peerIdx)
+        nRanks++;
+      } // rankIdx > 0
+      if (rankIdx == comm->nRanks-1) {
+        // Last iteration of the loop.
+        if (auxComm->ranksPerNodeMin > nRanks)
+          auxComm->ranksPerNodeMin = nRanks;
+        if (auxComm->ranksPerNodeMax < nRanks)
+          auxComm->ranksPerNodeMax = nRanks;
+      }
+
+      if (rank->status.abortFlag)
+        auxComm->status |= RAS_ACS_ABORT;
+      else if (rank->status.finalizeCalled || rank->status.destroyFlag) {
+        // destroyFlag is set by ncclCommDestroy and ncclCommAbort.  finalizeCalled appears to be set by
+        // ncclCommFinalize only.  According to the docs, ncclCommDestroy *can* be called without calling
+        // ncclCommFinalize first.  The code structure here ensures that we attribute destroyFlag properly
+        // as a finalize state indicator (and ignore it in case of ncclCommAbort).
+        auxComm->status |= RAS_ACS_FINALIZE;
+      }
+      else if (rank->status.initState == ncclSuccess)
+        auxComm->status |= RAS_ACS_RUNNING;
+      else // rank->initState != ncclSuccess
+        auxComm->status |= RAS_ACS_INIT;
+
+      if (rank->collOpCount != auxComm->firstCollOpCount)
+        auxComm->errors |= RAS_ACE_MISMATCH;
+      if (rank->status.initState != ncclSuccess && rank->status.initState != ncclInProgress)
+        auxComm->errors |= RAS_ACE_ERROR;
+      if (rank->status.asyncError != ncclSuccess && rank->status.asyncError != ncclInProgress)
+        auxComm->errors |= RAS_ACE_ERROR;
+    } // for (rankIdx)
+
+    if (__builtin_popcount(auxComm->status) > 1) {
+      // We've got a status mismatch between ranks.
+      auxComm->errors |= RAS_ACE_MISMATCH;
+    }
+  } // for (commIdx)
+  // Sort it by size/nNodes/status/errors/missing ranks.
+  qsort(auxComms, commsData->nComms, sizeof(*auxComms), &rasAuxCommsCompareRev);
+
+  // Calculate the distribution of different communicator sizes.
+  NCCLCHECKGOTO(ncclCalloc(&valCounts, commsData->nComms), ret, fail);
+  nValCounts = 0;
+  for (int commIdx = 0; commIdx < commsData->nComms; commIdx++) {
+    if (commIdx == 0 ||
+        auxComms[commIdx].comm->commNRanks != auxComms[commIdx-1].comm->commNRanks ||
+        auxComms[commIdx].nNodes != auxComms[commIdx-1].nNodes ||
+        // __builtin_clz returns the number of leading 0-bits, which is a proxy for the index of the highest 1-bit.
+        __builtin_clz(auxComms[commIdx].status) != __builtin_clz(auxComms[commIdx-1].status) ||
+        auxComms[commIdx].errors != auxComms[commIdx-1].errors) {
+      valCounts[nValCounts].value = 0; // We have many distinguishing values but only one field to store them.
+                                       // It doesn't really matter, given that we can extract them via firstIdx.
+      valCounts[nValCounts].count = 1;
+      valCounts[nValCounts].firstIdx = commIdx;
+      nValCounts++;
+    } else {
+      valCounts[nValCounts-1].count++;
+    }
+  }
+
+  rasOutAppend("Group     Comms     Nodes     Ranks     Ranks     Ranks    Status  Errors\n"
+               "    #  in group  per comm  per node  per comm  in group\n");
+  if (commsData->nComms == 0)
+    rasOutAppend("No communicator data collected!\n");
+
+  // Allocate an auxiliary structure used for counting the number of ranks (unique GPUs) in a group.
+  NCCLCHECKGOTO(ncclCalloc(&peerNvmlDevs, coll->nPeers), ret, fail);
+
+  // Print it out, the largest communicators first.
+  for (int vcIdx = 0; vcIdx < nValCounts; vcIdx++) {
+    struct rasValCount* vc = valCounts+vcIdx;
+    struct rasAuxComm* auxComm = auxComms+vc->firstIdx;
+    int ranksPerNodeMin, ranksPerNodeMax;
+    int ranksTotal;
+
+    ranksPerNodeMin = NCCL_MAX_LOCAL_RANKS;
+    ranksPerNodeMax = 0;
+    memset(peerNvmlDevs, '\0', coll->nPeers * sizeof(*peerNvmlDevs));
+    // We don't group comms by ranksPerNodeMin/Max, so the values may differ between comms in one group.
+    // Calculate the group's min/max.
+    // Also calculate the number of unique ranks in the group.
+    for (int commIdx = 0; commIdx < vc->count; commIdx++) {
+      if (ranksPerNodeMin > auxComm[commIdx].ranksPerNodeMin)
+        ranksPerNodeMin = auxComm[commIdx].ranksPerNodeMin;
+      if (ranksPerNodeMax < auxComm[commIdx].ranksPerNodeMax)
+        ranksPerNodeMax = auxComm[commIdx].ranksPerNodeMax;
+      for (int rankIdx = 0; rankIdx < auxComm[commIdx].comm->nRanks; rankIdx++) {
+        struct rasCollComms::comm::rank* rank = auxComm[commIdx].comm->ranks+rankIdx;
+        peerNvmlDevs[rank->peerIdx] |= (1UL << rank->nvmlDev);
+      }
+    }
+    ranksTotal = 0;
+    for (int peerIdx = 0; peerIdx < coll->nPeers; peerIdx++)
+      ranksTotal += __builtin_popcountll(peerNvmlDevs[peerIdx]);
+    if (ranksPerNodeMin == ranksPerNodeMax)
+      snprintf(rasLine, sizeof(rasLine), "%d", ranksPerNodeMin);
+    else
+      snprintf(rasLine, sizeof(rasLine), "%d-%d", ranksPerNodeMin, ranksPerNodeMax);
+    rasOutAppend("%5d  %8d  %8d  %8s  %8d  %8d  %8s  %6s\n",
+                 vcIdx, vc->count, auxComm->nNodes, rasLine, auxComm->comm->commNRanks, ranksTotal,
+                 // __builtin_clz returns the number of leading 0-bits.  This makes it possible to translate the
+                 // status (which is a bitmask) into an array index.
+                 statusStr[(sizeof(unsigned int)*8-1)-__builtin_clz(auxComm->status)], errorStr[auxComm->errors]);
+  }
+
+  rasOutAppend("\nErrors\n"
+               "======\n\n");
+
+  if (nPeersMissing > 0) {
+    rasOutAppend("INCOMPLETE\n"
+                 "  Missing communicator data from %d job process%s\n", nPeersMissing, (nPeersMissing > 1 ? "es" : ""));
+    if (rasCountIsOutlier(nPeersMissing, client->verbose)) {
+      // Extract a list of missing peers.  We don't want to print it right away because it would be sorted
+      // by address (including port, which isn't meaningful to end users).
+      struct rasPeerInfo* peersBuf = nullptr;
+      int nPeersBuf;
+
+      // Both rasPeers and coll->peers are sorted by address (the latter we sorted above) which makes comparing
+      // them much easier.
+      NCCLCHECKGOTO(ncclCalloc(&peersBuf, nPeersMissing), ret, fail);
+      nPeersBuf = 0;
+      for (int rasPeerIdx = 0, collPeerIdx = 0; rasPeerIdx < nRasPeers || collPeerIdx < coll->nPeers;) {
+        int cmp;
+        if (rasPeerIdx < nRasPeers && collPeerIdx < coll->nPeers)
+          cmp = ncclSocketsCompare(&rasPeers[rasPeerIdx].addr, coll->peers+collPeerIdx);
+        else
+          cmp = (rasPeerIdx < nRasPeers ? -1 : 1);
+
+        if (cmp == 0) {
+          rasPeerIdx++;
+          collPeerIdx++;
+        } else if (cmp < 0) {
+          // Process missing from coll->peers.  Don't report dead ones though, as they are not included
+          // in nPeersMissing and are reported separately below.
+          if (!rasPeerIsDead(&rasPeers[rasPeerIdx].addr)) {
+            assert(nPeersBuf < nPeersMissing);
+            memcpy(peersBuf+(nPeersBuf++), rasPeers+rasPeerIdx, sizeof(*peersBuf));
+          }
+          rasPeerIdx++;
+        } else { // cmp > 0
+          // Process not found in rasPeers -- shouldn't happen, unless during a race?
+          collPeerIdx++;
+        } // cmp > 0
+      } // for (rasPeerIdx, collPeerIdx)
+
+      // Sort the output by host and pid.
+      qsort(peersBuf, nPeersBuf, sizeof(*peersBuf), rasPeersHostPidCompare);
+      for (int peerIdx = 0; peerIdx < nPeersBuf; peerIdx++) {
+        rasOutAppend("  Process %d on node %s managing GPU%s %s\n", peersBuf[peerIdx].pid,
+                     ncclSocketToHost(&peersBuf[peerIdx].addr, rasLine, sizeof(rasLine)),
+                     (__builtin_popcountll(peersBuf[peerIdx].cudaDevs) > 1 ? "s" : ""),
+                     rasGpuDevsToString(peersBuf[peerIdx].cudaDevs, peersBuf[peerIdx].nvmlDevs, lineBuf,
+                                        sizeof(lineBuf)));
+      }
+      if (nPeersBuf != nPeersMissing)
+        rasOutAppend("  [could not find information on %d process%s]\n",
+                     nPeersMissing-nPeersBuf, (nPeersMissing-nPeersBuf > 1 ? "es" : ""));
+      free(peersBuf);
+    } // if (rasCountIsOutlier(nPeersMissing))
+    rasOutAppend("\n");
+  }
+
+  if (nRasDeadPeers > 0) {
+    rasOutAppend("DEAD\n"
+                 "  %d job process%s considered dead (unreachable via the RAS network)\n", nRasDeadPeers,
+                 (nRasDeadPeers > 1 ? "es are" : " is"));
+    if (rasCountIsOutlier(nRasDeadPeers, client->verbose)) {
+      struct rasPeerInfo* peersReSorted = nullptr;
+      int nPeersReSorted = 0;
+      NCCLCHECKGOTO(ncclCalloc(&peersReSorted, nRasDeadPeers), ret, fail);
+      for (int i = 0; i < nRasDeadPeers; i++) {
+        int peerIdx = rasPeerFind(rasDeadPeers+i);
+        if (peerIdx != -1)
+          memcpy(peersReSorted+(nPeersReSorted++), rasPeers+peerIdx, sizeof(*peersReSorted));
+      }
+      // Sort the output by host and pid, not host and port.
+      qsort(peersReSorted, nPeersReSorted, sizeof(*peersReSorted), rasPeersHostPidCompare);
+      for (int peerIdx = 0; peerIdx < nPeersReSorted; peerIdx++) {
+        rasOutAppend("  Process %d on node %s managing GPU%s %s\n", peersReSorted[peerIdx].pid,
+                     ncclSocketToHost(&peersReSorted[peerIdx].addr, rasLine, sizeof(rasLine)),
+                     (__builtin_popcountll(peersReSorted[peerIdx].cudaDevs) > 1 ? "s" : ""),
+                     rasGpuDevsToString(peersReSorted[peerIdx].cudaDevs, peersReSorted[peerIdx].nvmlDevs, lineBuf,
+                                        sizeof(lineBuf)));
+      }
+      if (nPeersReSorted != nRasDeadPeers)
+        rasOutAppend("  [could not find information on %d process%s]\n",
+                     nRasDeadPeers-nPeersReSorted, (nRasDeadPeers-nPeersReSorted > 1 ? "es" : ""));
+      free(peersReSorted);
+    } // if (rasCountIsOutlier(nRasDeadPeers)
+    rasOutAppend("\n");
+  }
+
+  for (vcIdx = 0; vcIdx < nValCounts; vcIdx++) {
+    struct rasValCount* vc;
+    vc = valCounts+vcIdx;
+    for (int commIdx = vc->firstIdx; commIdx < vc->count + vc->firstIdx; commIdx++) {
+      struct rasAuxComm* auxComm = auxComms+commIdx;
+      comm = auxComm->comm;
+
+      if (auxComm->errors & RAS_ACE_INCOMPLETE) {
+        int nRanksMissing = comm->commNRanks - comm->nRanks;
+        rasOutAppend("#%d-%d (%016lx) INCOMPLETE\n"
+                     "  Missing communicator data from %d rank%s\n", vcIdx, commIdx - vc->firstIdx,
+                     comm->commHash, nRanksMissing, (nRanksMissing > 1 ? "s" : ""));
+        if (rasCountIsOutlier(nRanksMissing, client->verbose)) {
+          lineBuf[0] = '\0';
+          // rankIdx indexes the comm->ranks array; in principle it should be the same as commRank, with the
+          // exception of the missing ranks...
+          for (int commRank = 0, rankIdx = 0; commRank < comm->commNRanks; commRank++) {
+            if (rankIdx < comm->nRanks && comm->ranks[rankIdx].commRank == commRank) {
+              rankIdx++;
+            } else {
+              snprintf(lineBuf+strlen(lineBuf), sizeof(lineBuf)-strlen(lineBuf), "%s%d",
+                       (rankIdx == commRank ? "" : ","), commRank);
+            }
+          } // for (commRank)
+          rasOutAppend("  The missing rank%s: %s\n", (nRanksMissing > 1 ? "s" : ""), lineBuf);
+        } // if (rasCountIsOutlier(nRanksMissing))
+        rasOutAppend("\n");
+      } // if (auxComm->errors & RAS_ACE_INCOMPLETE)
+
+      if (auxComm->errors & RAS_ACE_ERROR) {
+        int ncclErrors[ncclNumResults];
+        int nErrors;
+        rasOutAppend("#%d-%d (%016lx) ERROR\n", vcIdx, commIdx - vc->firstIdx, comm->commHash);
+
+        memset(ncclErrors, '\0', sizeof(ncclErrors));
+        for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++)
+          ncclErrors[comm->ranks[rankIdx].status.initState]++;
+        nErrors = comm->nRanks - (ncclErrors[ncclSuccess] + ncclErrors[ncclInProgress]);
+        if (nErrors > 0) {
+          rasOutAppend("  Initialization error%s on %d rank%s\n",
+                       (nErrors > 1 ? "s" : ""), nErrors, (nErrors > 1 ? "s" : ""));
+          rasClientBreakDownErrors(client, comm, peerIdxConv, ncclErrors);
+        }
+
+        memset(ncclErrors, '\0', sizeof(ncclErrors));
+        for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++)
+          ncclErrors[comm->ranks[rankIdx].status.asyncError]++;
+        nErrors = comm->nRanks - (ncclErrors[ncclSuccess] + ncclErrors[ncclInProgress]);
+        if (nErrors > 0) {
+          rasOutAppend("  Asynchronous error%s on %d rank%s\n",
+                       (nErrors > 1 ? "s" : ""), nErrors, (nErrors > 1 ? "s" : ""));
+          rasClientBreakDownErrors(client, comm, peerIdxConv, ncclErrors, /*isAsync*/true);
+        }
+        rasOutAppend("\n");
+      } // if (auxComm->errors & RAS_ACE_ERROR)
+    } // for (commIdx)
+  } // for (vcIdx)
+
+  rasOutAppend("Warnings\n"
+               "========\n\n");
+
+  if (coll->nLegTimeouts > 0) {
+    rasOutAppend("TIMEOUT\n"
+                 "  Encountered %d communication timeout%s while gathering communicator data\n\n",
+                 coll->nLegTimeouts, (coll->nLegTimeouts > 1 ? "s" : ""));
+  }
+
+  for (int vcIdx = 0; vcIdx < nValCounts; vcIdx++) {
+    struct rasValCount* vc = valCounts+vcIdx;
+    for (int commIdx = vc->firstIdx; commIdx < vc->count + vc->firstIdx; commIdx++) {
+      bool inconsistent;
+      struct rasAuxComm* auxComm = auxComms+commIdx;
+      comm = auxComm->comm;
+
+      if (auxComm->errors & RAS_ACE_MISMATCH) {
+        rasOutAppend("#%d-%d (%016lx) MISMATCH\n", vcIdx, commIdx - vc->firstIdx, comm->commHash);
+
+        if (collOpCounts == nullptr) {
+          // Allocating comm->commNRanks elements ensures that we won't need to reallocate, because the valCounts
+          // array is reverse-sorted by commNRanks.  On the other hand, for this purpose allocating commNRanks
+          // elements may be massively overpessimistic...
+          NCCLCHECKGOTO(ncclCalloc(&collOpCounts, comm->commNRanks), ret, fail);
+        }
+
+        if (__builtin_popcount(auxComm->status) > 1) {
+          rasOutAppend("  Communicator ranks have different status\n");
+
+          // We need to sort the ranks by status.  However, status is normally calculated from other fields.
+          // We will copy the ranks and reuse collOpCount to store it.
+          memcpy(ranksReSorted, comm->ranks, comm->nRanks * sizeof(*ranksReSorted));
+          for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) {
+            struct rasCollComms::comm::rank* rank = ranksReSorted+rankIdx;
+
+            if (rank->status.abortFlag)
+              rank->collOpCount = RAS_ACS_ABORT;
+            else if (rank->status.finalizeCalled || rank->status.destroyFlag)
+              rank->collOpCount = RAS_ACS_FINALIZE;
+            else if (rank->status.initState == ncclSuccess)
+              rank->collOpCount = RAS_ACS_RUNNING;
+            else
+              rank->collOpCount = RAS_ACS_INIT;
+          }
+          qsort(ranksReSorted, comm->nRanks, sizeof(*ranksReSorted), rasCommRanksCollOpCompare);
+          // Calculate the frequency of different status values.
+          int nCollOpCounts = 0;
+          for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) {
+            if (rankIdx == 0 || ranksReSorted[rankIdx].collOpCount != ranksReSorted[rankIdx-1].collOpCount) {
+              // __builtin_clz returns the number of leading 0-bits.  This makes it possible to translate the
+              // status (which is a bitmask) into an array index.
+              collOpCounts[nCollOpCounts].value = (sizeof(unsigned int)*8-1) - __builtin_clz(ranksReSorted[rankIdx].collOpCount);
+              collOpCounts[nCollOpCounts].count = 1;
+              collOpCounts[nCollOpCounts].firstIdx = rankIdx;
+              nCollOpCounts++;
+            } else {
+              collOpCounts[nCollOpCounts-1].count++;
+            }
+          }
+          if (comm->nRanks < comm->commNRanks) {
+            // Add a "fake" element corresponding to the missing entries.  The statusStr array contains the "UNKNOWN"
+            // string at index 0.
+            collOpCounts[nCollOpCounts].value = 0;
+            collOpCounts[nCollOpCounts].count = comm->commNRanks - comm->nRanks;
+            collOpCounts[nCollOpCounts].firstIdx = -1; // "Fake" entry identifier.
+            nCollOpCounts++;
+          }
+          // Sort by that frequency (most frequent first).
+          qsort(collOpCounts, nCollOpCounts, sizeof(*collOpCounts), rasValCountsCompareRev);
+
+          for (int coc = 0; coc < nCollOpCounts; coc++) {
+            struct rasValCount* vcc = collOpCounts+coc;
+            if (vcc->count > 1)
+              rasOutAppend("  %d ranks have status %s\n", vcc->count, statusStr[vcc->value]);
+            if (rasCountIsOutlier(vcc->count, client->verbose, comm->commNRanks)) {
+              if (vcc->firstIdx != -1) {
+                // ranksReSorted is sorted by rank as the secondary key, which comes in handy when printing...
+                for (int rankIdx = vcc->firstIdx; rankIdx < vcc->count+vcc->firstIdx; rankIdx++) {
+                  int peerIdx = peerIdxConv[ranksReSorted[rankIdx].peerIdx];
+                  if (peerIdx != -1) {
+                    if (vcc->count > 1)
+                      rasOutAppend("  Rank %d -- GPU %s managed by process %d on node %s\n",
+                                   ranksReSorted[rankIdx].commRank,
+                                   rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)),
+                                   rasPeers[peerIdx].pid,
+                                   ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine)));
+                    else
+                      rasOutAppend("  Rank %d has status %s -- GPU %s managed by process %d on node %s\n",
+                                   ranksReSorted[rankIdx].commRank, statusStr[vcc->value],
+                                   rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)),
+                                   rasPeers[peerIdx].pid,
+                                   ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine)));
+                  } else { // peerIdx == -1
+                    if (vcc->count > 1)
+                      rasOutAppend("  Rank %d -- [process information not found]\n", ranksReSorted[rankIdx].commRank);
+                    else
+                      rasOutAppend("  Rank %d has status %s -- [process information not found]\n",
+                                   ranksReSorted[rankIdx].commRank, statusStr[vcc->value]);
+                  } // peerIdx == -1
+                } // for (rankIdx)
+              } else {
+                // UNKNOWN ranks.  Format a string with their rank numbers (we don't know anything more).
+                lineBuf[0] = '\0';
+                // rankIdx indexes the comm->ranks array; in principle it should be the same as commRank, with the
+                // exception of the missing ranks...
+                for (int commRank = 0, rankIdx = 0; commRank < comm->commNRanks; commRank++) {
+                  if (rankIdx < comm->nRanks && comm->ranks[rankIdx].commRank == commRank) {
+                    rankIdx++;
+                  } else {
+                    snprintf(lineBuf+strlen(lineBuf), sizeof(lineBuf)-strlen(lineBuf), "%s%d",
+                             (rankIdx == commRank ? "" : ","), commRank);
+                  }
+                } // for (commRank)
+                if (vcc->count > 1) {
+                  rasOutAppend("  The unknown ranks: %s\n", lineBuf);
+                } else {
+                  rasOutAppend("  Rank %s has status %s\n", lineBuf, statusStr[vcc->value]);
+                }
+              }
+            } // if (rasCountIsOutlier(vcc->count))
+          } // for (coc)
+        } // if (__builtin_popcount(auxComm->status) > 1)
+
+        inconsistent = false;
+        for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) {
+          if (comm->ranks[rankIdx].collOpCount != auxComm->firstCollOpCount) {
+            inconsistent = true;
+            break;
+          }
+        }
+        if (inconsistent) {
+          rasOutAppend("  Communicator ranks have different collective operation counts\n");
+
+          // Sort the ranks by collOpCount and rank for easy counting.
+          memcpy(ranksReSorted, comm->ranks, comm->nRanks * sizeof(*ranksReSorted));
+          qsort(ranksReSorted, comm->nRanks, sizeof(*ranksReSorted), rasCommRanksCollOpCompare);
+          // Calculate the frequency of different collOpCount values.
+          int nCollOpCounts = 0;
+          for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) {
+            if (rankIdx == 0 || ranksReSorted[rankIdx].collOpCount != ranksReSorted[rankIdx-1].collOpCount) {
+              collOpCounts[nCollOpCounts].value = ranksReSorted[rankIdx].collOpCount;
+              collOpCounts[nCollOpCounts].count = 1;
+              collOpCounts[nCollOpCounts].firstIdx = rankIdx;
+              nCollOpCounts++;
+            } else {
+              collOpCounts[nCollOpCounts-1].count++;
+            }
+          }
+          // Sort by that frequency (most frequent first).
+          qsort(collOpCounts, nCollOpCounts, sizeof(*collOpCounts), rasValCountsCompareRev);
+
+          for (int coc = 0; coc < nCollOpCounts; coc++) {
+            struct rasValCount* vcc = collOpCounts+coc;
+            if (vcc->count > 1)
+              rasOutAppend("  %d ranks have launched up to operation %ld\n", vcc->count, vcc->value);
+            if (rasCountIsOutlier(vcc->count, client->verbose, comm->commNRanks)) {
+              // ranksReSorted is sorted by rank as the secondary key, which comes in handy when printing...
+              for (int rankIdx = vcc->firstIdx; rankIdx < vcc->count+vcc->firstIdx; rankIdx++) {
+                int peerIdx = peerIdxConv[ranksReSorted[rankIdx].peerIdx];
+                if (peerIdx != -1) {
+                  if (vcc->count > 1)
+                    rasOutAppend("  Rank %d -- GPU %s managed by process %d on node %s\n",
+                                 ranksReSorted[rankIdx].commRank,
+                                 rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)),
+                                 rasPeers[peerIdx].pid,
+                                 ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine)));
+                  else
+                    rasOutAppend("  Rank %d has launched up to operation %ld -- GPU %s managed by process %d on node %s\n",
+                                 ranksReSorted[rankIdx].commRank, vcc->value,
+                                 rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)),
+                                 rasPeers[peerIdx].pid,
+                                 ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine)));
+                } else { // peerIdx == -1
+                  if (vcc->count > 1)
+                    rasOutAppend("  Rank %d -- [process information not found]\n", ranksReSorted[rankIdx].commRank);
+                  else
+                     rasOutAppend("  Rank %d has launched up to operation %ld -- [process information not found]\n",
+                                  ranksReSorted[rankIdx].commRank, vcc->value);
+                } // peerIdx == -1
+              } // for (rankIdx)
+            } // if (rasCountIsOutlier(vcc->count))
+          } // for (coc)
+        } // if (inconsistent)
+        rasOutAppend("\n");
+      } // if (auxComm->errors & RAS_ACE_MISMATCH)
+    } // for (commIdx)
+  } // for (vcIdx)
+  rasCollFree(coll);
+
+  msgLen = rasOutLength();
+  NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail);
+  rasOutExtract(msg);
+  rasClientEnqueueMsg(client, msg, msgLen);
+  msg = nullptr;
+exit:
+  free(peerNvmlDevs);
+  free(collOpCounts);
+  free(valCounts);
+  free(peerIdxConv);
+  free(ranksReSorted);
+  free(auxComms);
+  return ret;
+fail:
+  goto exit;
+}
+
+static void rasClientBreakDownErrors(struct rasClient* client, struct rasCollComms::comm* comm,
+                                     const int* peerIdxConv, int ncclErrors[ncclNumResults], bool isAsync) {
+  for (;;) { // Each pass reports the most frequent remaining error, then zeroes its count in ncclErrors.
+    int maxCount = 0;
+    ncclResult_t maxCountIdx = ncclSuccess;
+    for (int i = ncclUnhandledCudaError; i < ncclInProgress; i++) { // ncclInProgress itself is not scanned.
+      if (maxCount < ncclErrors[i]) {
+        maxCount = ncclErrors[i];
+        maxCountIdx = (ncclResult_t)i;
+      }
+    } // for (i)
+    if (maxCountIdx == ncclSuccess) // Still at the initial marker: no non-zero error counts are left.
+      break;
+    if (maxCount > 1) // Group header; a lone rank gets a self-contained line below instead.
+      rasOutAppend("  %d ranks reported %s\n", maxCount, ncclErrorToString(maxCountIdx));
+    if (rasCountIsOutlier(maxCount, client->verbose)) {
+      for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { // isAsync selects asyncError over initState.
+        if ((isAsync ? comm->ranks[rankIdx].status.asyncError : comm->ranks[rankIdx].status.initState) == maxCountIdx) {
+          int peerIdx = peerIdxConv[comm->ranks[rankIdx].peerIdx]; // -1 if no process information was found.
+          if (peerIdx != -1) {
+            if (maxCount > 1)
+              rasOutAppend("  Rank %d -- GPU %s managed by process %d on node %s\n",
+                           comm->ranks[rankIdx].commRank,
+                           rasCommRankGpuToString(comm->ranks+rankIdx, lineBuf, sizeof(lineBuf)),
+                           rasPeers[peerIdx].pid,
+                           ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine)));
+            else
+              rasOutAppend("  Rank %d reported %s -- GPU %s managed by process %d on node %s\n",
+                           comm->ranks[rankIdx].commRank, ncclErrorToString(maxCountIdx),
+                           rasCommRankGpuToString(comm->ranks+rankIdx, lineBuf, sizeof(lineBuf)),
+                           rasPeers[peerIdx].pid,
+                           ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine)));
+          } else { // peerIdx == -1
+            if (maxCount > 1)
+              rasOutAppend("  Rank %d -- [process information not found]\n", comm->ranks[rankIdx].commRank);
+            else
+              rasOutAppend("  Rank %d reported %s -- [process information not found]\n",
+                           comm->ranks[rankIdx].commRank, ncclErrorToString(maxCountIdx));
+          } // peerIdx == -1
+        } // if rank's error matches
+      } // for (rankIdx)
+    } // if (rasCountIsOutlier(maxCount))
+    ncclErrors[maxCountIdx] = 0; // Consumed.  Note that this modifies the caller's array.
+  } // for (;;)
+}
+
+
+//////////////////////////////////////////////////////////////////////
+// Functions related to the handling of the internal output buffer. //
+//////////////////////////////////////////////////////////////////////
+
+// Appends a printf-formatted string to the output buffer, enlarging the buffer first if the text does not fit.
+// Unlike with INFO or WARN messages, the caller should terminate lines with '\n' as appropriate.
+static void rasOutAppend(const char* format, ...) {
+  ncclResult_t ret; // Ignored.
+  va_list vargs;
+  int needed; // Length of the formatted text as reported by vsnprintf (excludes the terminating '\0').
+  va_start(vargs, format);
+  needed = vsnprintf(rasOutBuffer+nRasOutBuffer, rasOutBufferSize-nRasOutBuffer, format, vargs);
+  va_end(vargs);
+
+  if (needed < 0) // Output error (whatever that might be...)
+    return;
+
+  // The +1 below accounts for the terminating '\0'.
+  if (needed + 1 > rasOutBufferSize-nRasOutBuffer) {
+    int newBufferSize = ROUNDUP(nRasOutBuffer+needed+1, RAS_OUT_INCREMENT);
+    NCCLCHECKGOTO(ncclRealloc(&rasOutBuffer, rasOutBufferSize, newBufferSize), ret, exit);
+    rasOutBufferSize = newBufferSize;
+
+    va_start(vargs, format); // The first attempt consumed the va_list, so restart it for the retry.
+    needed = vsnprintf(rasOutBuffer+nRasOutBuffer, rasOutBufferSize-nRasOutBuffer, format, vargs);
+    va_end(vargs);
+
+    if (needed < 0) // Output error (whatever that might be...)
+      return;
+  }
+
+  nRasOutBuffer += needed; // The terminating '\0' is not counted, so the next append overwrites it.
+  assert(nRasOutBuffer <= rasOutBufferSize);
+exit: // NCCLCHECKGOTO target: presumably taken if ncclRealloc fails, in which case nRasOutBuffer is not advanced.
+  ;
+}
+
+// Copies the output data from the internal buffer to a user-supplied one.  Exactly rasOutLength() bytes are
+// copied, so the terminating '\0' is *not* included.  The user buffer must already be allocated and be at
+// least rasOutLength() bytes long.  Nothing is copied if the internal buffer has not been allocated.
+static void rasOutExtract(char* buffer) {
+  if (rasOutBuffer)
+    memcpy(buffer, rasOutBuffer, rasOutLength());
+}
+
+// Returns the number of bytes of output accumulated so far (nRasOutBuffer), *not* including the terminating '\0'.
+static int rasOutLength() {
+  return nRasOutBuffer;
+}
+
+// Resets the output buffer position to the beginning (effectively clearing the buffer); an existing buffer is kept.
+static void rasOutReset() {
+  ncclResult_t ret; // Ignored.
+  nRasOutBuffer = 0;
+  if (rasOutBuffer == nullptr) { // No buffer yet: allocate the initial RAS_OUT_INCREMENT elements.
+    NCCLCHECKGOTO(ncclCalloc(&rasOutBuffer, RAS_OUT_INCREMENT), ret, exit);
+    rasOutBufferSize = RAS_OUT_INCREMENT;
+  }
+exit: // NCCLCHECKGOTO target: presumably taken if ncclCalloc fails, leaving rasOutBufferSize unchanged.
+  ;
+}
+
+
+///////////////////////////////////////////////////////////////////
+// Various sorting callbacks used when grouping/formatting data. //
+///////////////////////////////////////////////////////////////////
+
+// Sorting callback for rasPeerInfo elements.  Orders them by the number of bits set in cudaDevs (fewest first),
+// then by the host IP, and finally by the process id.
+static int rasPeersNGpuCompare(const void* e1, const void* e2) {
+  const struct rasPeerInfo* lhs = (const struct rasPeerInfo*)e1;
+  const struct rasPeerInfo* rhs = (const struct rasPeerInfo*)e2;
+  const int lhsGpus = __builtin_popcountll(lhs->cudaDevs);
+  const int rhsGpus = __builtin_popcountll(rhs->cudaDevs);
+
+  // Primary key: the number of bits set in cudaDevs.
+  if (lhsGpus != rhsGpus)
+    return (lhsGpus < rhsGpus ? -1 : 1);
+  // Secondary key: the host IP address.
+  const int hostCmp = ncclSocketsHostCompare(&lhs->addr, &rhs->addr);
+  if (hostCmp != 0)
+    return hostCmp;
+  // Tertiary key: the process id.
+  if (lhs->pid == rhs->pid)
+    return 0;
+  return (lhs->pid < rhs->pid ? -1 : 1);
+}
+
+// Sorting callback for rasPeerInfo elements whose cudaDevs field has been repurposed to hold the number of peers
+// per node (smallest first).  Ties are broken by the host IP, then by the process id.
+static int rasPeersNProcsCompare(const void* e1, const void* e2) {
+  const struct rasPeerInfo* lhs = (const struct rasPeerInfo*)e1;
+  const struct rasPeerInfo* rhs = (const struct rasPeerInfo*)e2;
+
+  // Primary key: the per-node peer count stored in cudaDevs.
+  if (lhs->cudaDevs != rhs->cudaDevs)
+    return (lhs->cudaDevs < rhs->cudaDevs ? -1 : 1);
+  // Secondary key: the host IP address.
+  const int hostCmp = ncclSocketsHostCompare(&lhs->addr, &rhs->addr);
+  if (hostCmp != 0)
+    return hostCmp;
+  // Tertiary key: the process id.
+  if (lhs->pid == rhs->pid)
+    return 0;
+  return (lhs->pid < rhs->pid ? -1 : 1);
+}
+
+// Sorting callback for rasPeerInfo elements.  Orders them by the host IP, using the process id (rather than
+// the port) to break ties.
+static int rasPeersHostPidCompare(const void* e1, const void* e2) {
+  const struct rasPeerInfo* lhs = (const struct rasPeerInfo*)e1;
+  const struct rasPeerInfo* rhs = (const struct rasPeerInfo*)e2;
+  const int hostCmp = ncclSocketsHostCompare(&lhs->addr, &rhs->addr);
+  if (hostCmp != 0)
+    return hostCmp;
+  // Same host: the process id is the secondary key.
+  if (lhs->pid == rhs->pid)
+    return 0;
+  return (lhs->pid < rhs->pid ? -1 : 1);
+}
+
+// Sorting callback for ncclSocketAddress.  Unlike the ncclSocketsCompare, it ignores the port.
+static int ncclSocketsHostCompare(const void* p1, const void* p2) {
+  const union ncclSocketAddress* lhs = (const union ncclSocketAddress*)p1;
+  const union ncclSocketAddress* rhs = (const union ncclSocketAddress*)p2;
+  const int lhsFamily = lhs->sa.sa_family;
+  const int rhsFamily = rhs->sa.sa_family;
+
+  if (lhsFamily != rhsFamily) {
+    // Put empty addresses at the end (not that it matters...).
+    if (lhsFamily <= 0)
+      return 1;
+    if (rhsFamily <= 0)
+      return -1;
+    // Both families are set.  AF_INET (2) is less than AF_INET6 (10).
+    return (lhsFamily < rhsFamily ? -1 : 1);
+  }
+  switch (lhsFamily) {
+    case AF_INET:
+      return memcmp(&lhs->sin.sin_addr, &rhs->sin.sin_addr, sizeof(lhs->sin.sin_addr));
+    case AF_INET6:
+      return memcmp(&lhs->sin6.sin6_addr, &rhs->sin6.sin6_addr, sizeof(lhs->sin6.sin6_addr));
+    default:
+      // The only remaining valid case are empty addresses, and two empty addresses are equal...
+      assert(lhsFamily == 0);
+      return 0;
+  }
+}
+
+// Sorting callback for rasValCount elements.  Largest count first; among equal counts, largest value first.
+static int rasValCountsCompareRev(const void* p1, const void* p2) {
+  const struct rasValCount* lhs = (const struct rasValCount*)p1;
+  const struct rasValCount* rhs = (const struct rasValCount*)p2;
+
+  if (lhs->count != rhs->count)
+    return (lhs->count > rhs->count ? -1 : 1);
+  if (lhs->value != rhs->value)
+    return (lhs->value > rhs->value ? -1 : 1);
+  return 0;
+}
+
+// Sorting callback for rasAuxComm elements.
+// Keys, in order of precedence: the rank count (commNRanks), nNodes, status, and errors, all sorted in reverse
+// (largest first).
+// The final key is the comm's nRanks, sorted in the opposite direction to the other keys, so that among otherwise
+// equal comms those with the largest number of ranks *missing* come first.
+static int rasAuxCommsCompareRev(const void* p1, const void* p2) {
+  const struct rasAuxComm* lhs = (const struct rasAuxComm*)p1;
+  const struct rasAuxComm* rhs = (const struct rasAuxComm*)p2;
+
+  // Primary key: the communicator's rank count.
+  if (lhs->comm->commNRanks != rhs->comm->commNRanks)
+    return (lhs->comm->commNRanks > rhs->comm->commNRanks ? -1 : 1);
+
+  // Secondary key: the number of nodes.
+  if (lhs->nNodes != rhs->nNodes)
+    return (lhs->nNodes > rhs->nNodes ? -1 : 1);
+
+  // Tertiary key: status.  We don't want to compare the status values directly because they could be bitmasks
+  // and we are only interested in the highest bit set.  __builtin_clz returns the number of leading 0-bits, so
+  // the value is the *smallest* if RAS_ACS_ABORT (8) is set and the *largest* if only RAS_ACS_INIT (1) is set;
+  // hence the comparison is reversed to get the desired sorting order.
+  const int lhsLeadingZeros = __builtin_clz(lhs->status);
+  const int rhsLeadingZeros = __builtin_clz(rhs->status);
+  if (lhsLeadingZeros != rhsLeadingZeros)
+    return (lhsLeadingZeros < rhsLeadingZeros ? -1 : 1);
+
+  // Quaternary key: errors.
+  if (lhs->errors != rhs->errors)
+    return (lhs->errors > rhs->errors ? -1 : 1);
+
+  // Final key: nRanks, this time smallest first (fewer ranks reported for the same commNRanks means more
+  // ranks missing).
+  if (lhs->comm->nRanks != rhs->comm->nRanks)
+    return (lhs->comm->nRanks < rhs->comm->nRanks ? -1 : 1);
+
+  // All the keys are equal.
+  return 0;
+}
+
+// Sorting callback for rasCollComms::comm::rank elements.  Orders them by ascending peerIdx.
+static int rasCommRanksPeerCompare(const void* p1, const void* p2) {
+  const struct rasCollComms::comm::rank* lhs = (const struct rasCollComms::comm::rank*)p1;
+  const struct rasCollComms::comm::rank* rhs = (const struct rasCollComms::comm::rank*)p2;
+
+  return (lhs->peerIdx == rhs->peerIdx ? 0 : (lhs->peerIdx < rhs->peerIdx ? -1 : 1));
+}
+
+// Sorting callback for rasCollComms::comm::rank elements.  Orders by ascending collOpCount, then by ascending rank.
+static int rasCommRanksCollOpCompare(const void* p1, const void* p2) {
+  const struct rasCollComms::comm::rank* lhs = (const struct rasCollComms::comm::rank*)p1;
+  const struct rasCollComms::comm::rank* rhs = (const struct rasCollComms::comm::rank*)p2;
+
+  if (lhs->collOpCount != rhs->collOpCount)
+    return (lhs->collOpCount < rhs->collOpCount ? -1 : 1);
+  // Equal operation counts: use the rank as the secondary key.
+  if (lhs->commRank != rhs->commRank)
+    return (lhs->commRank < rhs->commRank ? -1 : 1);
+  return 0;
+}
+
+
+////////////////////////////////////////////////////////////
+// String formatting functions for various types of data. //
+////////////////////////////////////////////////////////////
+
+// Converts GPU mask(s) to a comma-separated list of bit indices in buf; the NVML mask is appended if it differs.
+const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, size_t size) {
+  bool first = true; // True until the first index of the current list has been printed (no leading comma).
+  buf[0] = '\0';
+  for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; i++)
+    if (cudaDevs & ((uint64_t)1 << i)) { // 64-bit shift operand: a plain 1UL may be only 32 bits wide.
+      snprintf(buf+strlen(buf), size-strlen(buf), "%s%d", (first ? "" : ","), i);
+      first = false;
+    }
+  if (cudaDevs != nvmlDevs) {
+    snprintf(buf+strlen(buf), size-strlen(buf), " (NVML ");
+    first = true;
+    for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; i++)
+      if (nvmlDevs & ((uint64_t)1 << i)) {
+        snprintf(buf+strlen(buf), size-strlen(buf), "%s%d", (first ? "" : ","), i);
+        first = false;
+      }
+    snprintf(buf+strlen(buf), size-strlen(buf), ")");
+  }
+  return buf; // Output that does not fit in size bytes is truncated by snprintf.
+}
+
+// Formats the GPU of a rasCollComms rank into buf and returns buf: the CUDA device id, followed by
+// " (NVML <id>)" when the NVML id is different.
+static const char* rasCommRankGpuToString(const struct rasCollComms::comm::rank* rank, char* buf, size_t size) {
+  if (rank->cudaDev == rank->nvmlDev)
+    snprintf(buf, size, "%d", rank->cudaDev);
+  else
+    snprintf(buf, size, "%d (NVML %d)", rank->cudaDev, rank->nvmlDev);
+  return buf;
+}
+
+// Converts a NCCL error result to a human-readable string literal.
+static const char* ncclErrorToString(ncclResult_t err) {
+  switch (err) {
+    case ncclUnhandledCudaError     : return "Unhandled CUDA error";
+    case ncclSystemError            : return "System error";
+    case ncclInternalError          : return "Internal error";
+    case ncclInvalidArgument        : return "Invalid argument";
+    case ncclInvalidUsage           : return "Invalid usage";
+    case ncclRemoteError            : return "Remote process error";
+    case ncclInProgress             : return "NCCL operation in progress";
+    default                         : return "Unexpected error"; // Includes ncclSuccess, which has no case above.
+  }
+}
+
+// Converts the IP number of a NCCL address to a string (the port part is ignored and no DNS resolution is attempted).
+// Always returns buf (empty if the address is unset or inet_ntop fails), so the result is always safe for "%s".
+static const char* ncclSocketToHost(const union ncclSocketAddress* addr, char* buf, size_t size) {
+  const void* ip = (addr->sa.sa_family == AF_INET ? (const void*)&addr->sin.sin_addr
+                                                  : (const void*)&addr->sin6.sin6_addr);
+  // inet_ntop returns NULL on failure (e.g., unsupported family, buffer too small); never hand that to callers.
+  if (!(addr->sa.sa_family > 0) || inet_ntop(addr->sa.sa_family, ip, buf, size) == nullptr) {
+    if (size > 0)
+      buf[0] = '\0';
+  }
+  return buf;
+}
+
+// Determines if the given count constitutes an outlier.  A totalCount of -1 disables the checks against the total.
+static bool rasCountIsOutlier(int count, bool verbose, int totalCount) {
+  const bool haveTotal = (totalCount != -1);
+  if (count == 1)
+    return true; // A single rank is always considered an outlier...
+  if (verbose) // Only the fraction test applies in verbose mode; without a total, everything is an outlier.
+    return (haveTotal ? count < totalCount * RAS_CLIENT_VERBOSE_OUTLIER_FRACTION : true);
+  if (count > RAS_CLIENT_DETAIL_THRESHOLD)
+    return false;
+  return (!haveTotal || count <= totalCount * RAS_CLIENT_OUTLIER_FRACTION);
+}
diff --git a/src/ras/collectives.cc b/src/ras/collectives.cc
new file mode 100644
index 0000000000..201144f1a0
--- /dev/null
+++ b/src/ras/collectives.cc
@@ -0,0 +1,762 @@
+/*************************************************************************
+ * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NDEBUG // Comment out during development only!
+#include <cassert>
+#include <mutex>
+
+#include "alloc.h"
+#include "checks.h"
+#include "comm.h"
+#include "nccl.h"
+#include "utils.h"
+#include "ras_internal.h"
+
+// The number of recent collectives to keep track of.  Completely arbitrary.
+#define COLL_HISTORY_SIZE 64
+
+// An entry in the rasCollHistory array keeping track of recently completed collectives (to make it possible to
+// identify and drop duplicates arriving over different links).
+struct rasCollHistoryEntry {
+  union ncclSocketAddress rootAddr; // Address of the collective's root, copied from the request/collective.
+  uint64_t rootId; // Id assigned by the root; a request matches an entry only if both rootAddr and rootId match.
+};
+
+// Ring buffer of recently completed collectives (to avoid infinite loops); slots are overwritten cyclically.
+static struct rasCollHistoryEntry rasCollHistory[COLL_HISTORY_SIZE];
+static int nRasCollHistory, rasCollHistNextIdx; // Number of valid entries (capped); slot the next entry goes to.
+
+// Monotonically increased to ensure that each collective originating locally has a unique Id.
+static uint64_t rasCollLastId;
+
+// Array keeping track of ongoing collective operations (apart from broadcasts, which have no response so require
+// no such tracking).
+struct rasCollective* rasCollectives;
+static int nRasCollectives; // Allocated length of rasCollectives; entries of type RAS_MSG_NONE are free.
+
+static ncclResult_t getNewCollEntry(struct rasCollective** pColl);
+static ncclResult_t rasLinkSendCollReq(struct rasLink* link, struct rasCollective* coll,
+                                       const struct rasCollRequest* req, size_t reqLen, int fromConnIdx);
+static ncclResult_t rasConnSendCollReq(struct rasConnection* conn, const struct rasCollRequest* req, size_t reqLen);
+static ncclResult_t rasCollReadyResp(struct rasCollective* coll);
+static ncclResult_t rasConnSendCollResp(struct rasConnection* conn,
+                                        const union ncclSocketAddress* rootAddr, uint64_t rootId,
+                                        const union ncclSocketAddress* peers, int nPeers,
+                                        const char* data, int nData, int nLegTimeouts);
+
+static ncclResult_t rasCollConnsInit(char** pData, int* pNData);
+static ncclResult_t rasCollConnsMerge(struct rasCollective* coll, struct rasMsg* msg);
+
+static ncclResult_t rasCollCommsInit(char** pData, int* pNData);
+static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* msg);
+static int ncclCommsCompare(const void* p1, const void* p2);
+
+
+///////////////////////////////////////////////////////////////////////////////////////
+// Functions related to the initialization of collectives and the message exchanges. //
+///////////////////////////////////////////////////////////////////////////////////////
+
+// Hands out (via pColl) a reset rasCollectives entry, reusing the first free slot or growing the array if none.
+static ncclResult_t getNewCollEntry(struct rasCollective** pColl) {
+  // A slot is free when its type is RAS_MSG_NONE.
+  int slot = 0;
+  while (slot < nRasCollectives && rasCollectives[slot].type != RAS_MSG_NONE)
+    slot++;
+  if (slot >= nRasCollectives) {
+    // No free slot: grow the array by RAS_INCREMENT entries and use the first of the new ones.
+    NCCLCHECK(ncclRealloc(&rasCollectives, nRasCollectives, nRasCollectives+RAS_INCREMENT));
+    nRasCollectives += RAS_INCREMENT;
+  }
+
+  struct rasCollective* entry = &rasCollectives[slot];
+  memset(entry, '\0', sizeof(*entry));
+  entry->fromConnIdx = -1;
+  entry->startTime = clockNano();
+  // Sized for every current connection: more than we are likely to need, but it spares us any later realloc.
+  NCCLCHECK(ncclCalloc(&entry->fwdConns, nRasConns));
+
+  *pColl = entry;
+  return ncclSuccess;
+}
+
+// Stamps a collective request with a unique identity: a freshly incremented id plus our listening address.
+void rasCollReqInit(struct rasCollRequest* req) {
+  req->rootId = ++rasCollLastId;
+  memcpy(&req->rootAddr, &rasNetListeningSocket.addr, sizeof(req->rootAddr));
+}
+
+// Sends a collective request message through all regular RAS network connections (effectively, broadcasts it).
+// Also used for re-broadcasts (on peers receiving the request over the network).
+// Checking for duplicates is the responsibility of the caller.
+// For collectives other than broadcasts (req->type >= RAS_COLL_CONNS), initializes a rasCollective structure and
+// fills it with local data, in preparation for collective response messages; broadcasts go into the history.
+// pAllDone (may be null) indicates on return, for non-broadcasts only, if the collective operation is already
+// finished, which is unusual, but possible in scenarios such as a total of two peers.
+// pCollIdx (may be null) provides on return an index of the allocated rasCollective structure to track this
+// collective (unless it's a broadcast, which require no such tracking).  fromConnIdx is not forwarded to.
+ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen, bool* pAllDone, int* pCollIdx,
+                               int fromConnIdx) {
+  struct rasCollective* coll = nullptr;
+  if (req->type >= RAS_COLL_CONNS) {
+    // Keep track of this collective operation so that we can handle the responses appropriately.
+    NCCLCHECK(getNewCollEntry(&coll));
+    if (pCollIdx)
+      *pCollIdx = coll-rasCollectives;
+    memcpy(&coll->rootAddr, &req->rootAddr, sizeof(coll->rootAddr));
+    coll->rootId = req->rootId;
+    coll->type = req->type;
+    coll->timeout = req->timeout;
+    coll->fromConnIdx = fromConnIdx;
+    if (ncclCalloc(&coll->peers, 1) == ncclSuccess) { // Best effort: on failure the peers list stays empty.
+      memcpy(coll->peers, &rasNetListeningSocket.addr, sizeof(*coll->peers));
+      coll->nPeers = 1;
+    }
+
+    // Collective-specific initialization of accumulated data (using local data for now).
+    if (req->type == RAS_COLL_CONNS)
+      (void)rasCollConnsInit(&coll->data, &coll->nData);
+    else if (req->type == RAS_COLL_COMMS)
+      (void)rasCollCommsInit(&coll->data, &coll->nData);
+  } else { // req->type < RAS_COLL_CONNS
+    // Add the info to the collective message history.
+    nRasCollHistory = std::min(nRasCollHistory+1, COLL_HISTORY_SIZE);
+    memcpy(&rasCollHistory[rasCollHistNextIdx].rootAddr, &req->rootAddr,
+           sizeof(rasCollHistory[rasCollHistNextIdx].rootAddr));
+    rasCollHistory[rasCollHistNextIdx].rootId = req->rootId;
+    rasCollHistNextIdx = (rasCollHistNextIdx + 1) % COLL_HISTORY_SIZE;
+
+    // Collective-specific message handling.
+    if (req->type == RAS_BC_DEADPEER) {
+      bool done = false;
+      rasMsgHandleBCDeadPeer(req, &done);
+      if (done) // The handler reports there is nothing more to do, so skip the forwarding below.
+        goto exit;
+    }
+  } // req->type < RAS_COLL_CONNS
+
+  for (int connIdx = 0; connIdx < nRasConns; connIdx++)
+    rasConns[connIdx].linkFlag = false; // Reset the "already visited" marks used by rasLinkSendCollReq.
+
+  (void)rasLinkSendCollReq(&rasNextLink, coll, req, reqLen, fromConnIdx);
+  (void)rasLinkSendCollReq(&rasPrevLink, coll, req, reqLen, fromConnIdx);
+
+  if (coll && pAllDone)
+    *pAllDone = (coll->nFwdSent == coll->nFwdRecv); // Typically true only if nothing could be forwarded.
+exit:
+  return ncclSuccess;
+}
+
+// Forwards the collective request over every connection of the given link, skipping unset entries, the connection
+// the request came from (if any) and connections that were already visited (linkFlag set).
+static ncclResult_t rasLinkSendCollReq(struct rasLink* link, struct rasCollective* coll,
+                                       const struct rasCollRequest* req, size_t reqLen, int fromConnIdx) {
+  for (int linkConnIdx = 0; linkConnIdx < link->nConns; linkConnIdx++) {
+    const int connIdx = link->conns[linkConnIdx].connIdx;
+    if (connIdx == -1 || connIdx == fromConnIdx)
+      continue;
+    struct rasConnection* conn = rasConns+connIdx;
+    if (conn->linkFlag)
+      continue;
+    // We send collective messages through fully established and operational connections only.
+    const bool usable = conn->sockIdx != -1 && !conn->experiencingDelays &&
+                        rasSockets[conn->sockIdx].status == RAS_SOCK_READY;
+    if (usable && rasConnSendCollReq(conn, req, reqLen) == ncclSuccess && coll != nullptr)
+      coll->fwdConns[coll->nFwdSent++] = connIdx; // Remember that we expect a response through this connection.
+    conn->linkFlag = true; // Visited, whether or not anything was sent.
+  }
+
+  return ncclSuccess;
+}
+
+// Copies the first reqLen bytes of req into a newly allocated RAS_MSG_COLLREQ message and enqueues it on conn.
+static ncclResult_t rasConnSendCollReq(struct rasConnection* conn, const struct rasCollRequest* req, size_t reqLen) {
+  // Total size: rasMsgLength(RAS_MSG_COLLREQ) plus the reqLen bytes of the request.
+  int totalLen = rasMsgLength(RAS_MSG_COLLREQ) + reqLen;
+  struct rasMsg* outMsg = nullptr;
+
+  NCCLCHECK(rasMsgAlloc(&outMsg, totalLen));
+  outMsg->type = RAS_MSG_COLLREQ;
+  memcpy(&outMsg->collReq, req, reqLen);
+
+  rasConnEnqueueMsg(conn, outMsg, totalLen);
+  return ncclSuccess;
+}
+
+// Handles the RAS_MSG_COLLREQ collective message request on the receiver side.  Primarily deals with duplicates and
+// re-broadcasts the message to local peers, though in case of a very limited RAS network it might be done right away,
+// in which case it can immediately send the response.  sock is the socket the request was received on.
+ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock) {
+  bool allDone = false;
+  int collIdx = -1;
+  assert(sock->connIdx != -1);
+
+  // First check if we've already handled this request (through another connection).
+  for (int i = 0; i < nRasCollHistory; i++) {
+    // In principle we can use i to index the array but we convert it so that we check the most recent entries first.
+    int collHistIdx = (rasCollHistNextIdx + COLL_HISTORY_SIZE - 1 - i) % COLL_HISTORY_SIZE;
+    if (memcmp(&msg->collReq.rootAddr, &rasCollHistory[collHistIdx].rootAddr, sizeof(msg->collReq.rootAddr)) == 0 &&
+        msg->collReq.rootId == rasCollHistory[collHistIdx].rootId) {
+      if (msg->collReq.type >= RAS_COLL_CONNS) {
+        // Send an empty response so that the sender can account for it.  The non-empty response has already been
+        // sent through the connection that we received the request through first.
+        NCCLCHECK(rasConnSendCollResp(rasConns+sock->connIdx, &msg->collReq.rootAddr, msg->collReq.rootId,
+                                      /*peers*/nullptr, /*nPeers*/0, /*data*/nullptr, /*nData*/0, /*nLegTimeouts*/0));
+      }
+      goto exit; // A duplicate (broadcasts included) is never forwarded again.
+    }
+  } // for (i)
+
+  if (msg->collReq.type >= RAS_COLL_CONNS) {
+    // Check if we're currently handling this collective request.
+    for (int i = 0; i < nRasCollectives; i++) {
+      struct rasCollective* coll = rasCollectives+i;
+      if (coll->type != RAS_MSG_NONE &&
+          memcmp(&msg->collReq.rootAddr, &coll->rootAddr, sizeof(msg->collReq.rootAddr)) == 0 &&
+          msg->collReq.rootId == coll->rootId) {
+        assert(msg->collReq.type == coll->type);
+
+        // Send an empty response so that the sender can account for it.  The non-empty response will be
+        // sent through the connection that we received the request through first.
+        NCCLCHECK(rasConnSendCollResp(rasConns+sock->connIdx, &msg->collReq.rootAddr, msg->collReq.rootId,
+                                      /*peers*/nullptr, /*nPeers*/0, /*data*/nullptr, /*nData*/0, /*nLegTimeouts*/0));
+        goto exit;
+      } // if match
+    } // for (i)
+  } // if (msg->collReq.type >= RAS_COLL_CONNS)
+
+  // Re-broadcast the message to my peers (minus the one it came from) and handle it locally.
+  NCCLCHECK(rasNetSendCollReq(&msg->collReq, rasCollDataLength(msg->collReq.type), &allDone, &collIdx, sock->connIdx));
+
+  if (msg->collReq.type >= RAS_COLL_CONNS && allDone) {
+    assert(collIdx != -1);
+    // We are a leaf process -- send the response right away.  This can probably trigger only for the case of a total
+    // of two peers, and hence just one RAS connection, or during communication issues, because normally every peer
+    // has more than one connection so there should always be _some_ other peer to forward the request to.
+    NCCLCHECK(rasCollReadyResp(rasCollectives+collIdx));
+  }
+exit:
+  return ncclSuccess;
+}
+
+// Wraps up a collective once we are finished waiting for the collective responses from other peers (i.e., either
+// there weren't any peers (unlikely), the peers sent their responses (likely), or we timed out).  A remotely-
+// initiated collective is answered, recorded in the history and freed; a local one is handed to the client code.
+static ncclResult_t rasCollReadyResp(struct rasCollective* coll) {
+  if (coll->fromConnIdx == -1) {
+    // Locally-initiated: invoke the client code again (which will release the collective, once finished).
+    NCCLCHECK(rasClientResume(coll));
+    return ncclSuccess;
+  }
+
+  // Remotely-initiated: send the accumulated response back through the connection the request came from.
+  NCCLCHECK(rasConnSendCollResp(rasConns+coll->fromConnIdx, &coll->rootAddr, coll->rootId,
+                                coll->peers, coll->nPeers, coll->data, coll->nData, coll->nLegTimeouts));
+
+  // Add the identifying info to the collective message history.
+  struct rasCollHistoryEntry* histEntry = rasCollHistory+rasCollHistNextIdx;
+  memcpy(&histEntry->rootAddr, &coll->rootAddr, sizeof(histEntry->rootAddr));
+  histEntry->rootId = coll->rootId;
+  rasCollHistNextIdx = (rasCollHistNextIdx + 1) % COLL_HISTORY_SIZE;
+  nRasCollHistory = std::min(nRasCollHistory+1, COLL_HISTORY_SIZE);
+  rasCollFree(coll);
+  return ncclSuccess;
+}
+
+// Builds a RAS_MSG_COLLRESP message for the collective identified by rootAddr/rootId and enqueues it on conn.  The
+// message carries nPeers peer addresses and, if nData > 0, nData bytes of collective-specific data after them.
+static ncclResult_t rasConnSendCollResp(struct rasConnection* conn,
+                                        const union ncclSocketAddress* rootAddr, uint64_t rootId,
+                                        const union ncclSocketAddress* peers, int nPeers,
+                                        const char* data, int nData, int nLegTimeouts) {
+  // Room for rasMsgLength(RAS_MSG_COLLRESP) plus nPeers peer addresses.
+  int totalLen = rasMsgLength(RAS_MSG_COLLRESP) + nPeers*sizeof(*peers);
+  int payloadOffset = 0;
+  if (nData > 0) {
+    // The data goes after the peers, at an offset adjusted by ALIGN_SIZE for alignof(int64_t).
+    ALIGN_SIZE(totalLen, alignof(int64_t));
+    payloadOffset = totalLen;
+    totalLen += nData;
+  }
+
+  struct rasMsg* resp = nullptr;
+  NCCLCHECK(rasMsgAlloc(&resp, totalLen));
+  resp->type = RAS_MSG_COLLRESP;
+  resp->collResp.rootId = rootId;
+  memcpy(&resp->collResp.rootAddr, rootAddr, sizeof(resp->collResp.rootAddr));
+  resp->collResp.nPeers = nPeers;
+  resp->collResp.nData = nData;
+  resp->collResp.nLegTimeouts = nLegTimeouts;
+  if (nPeers != 0)
+    memcpy(resp->collResp.peers, peers, nPeers*sizeof(*resp->collResp.peers));
+  if (nData != 0)
+    memcpy(((char*)resp)+payloadOffset, data, nData);
+
+  rasConnEnqueueMsg(conn, resp, totalLen);
+  return ncclSuccess;
+}
+
+// Handles the collective response on the receiver side.  Finds the corresponding rasCollective structure, merges
+// the data from the response into the accumulated data.  If all the responses have been accounted for, sends the
+// accumulated response back.  A response from a leg we already gave up on is merged but not counted a second time.
+ncclResult_t rasMsgHandleCollResp(struct rasMsg* msg, struct rasSocket* sock) {
+  int collIdx;
+  struct rasCollective* coll = nullptr;
+  char line[SOCKET_NAME_MAXLEN+1];
+  bool expected = false; // Set if we were still waiting for a response through this connection.
+  for (collIdx = 0; collIdx < nRasCollectives; collIdx++) {
+    coll = rasCollectives+collIdx;
+    if (coll->type != RAS_MSG_NONE &&
+        memcmp(&msg->collResp.rootAddr, &coll->rootAddr, sizeof(msg->collResp.rootAddr)) == 0 &&
+        msg->collResp.rootId == coll->rootId)
+      break;
+  }
+  if (collIdx == nRasCollectives) {
+    INFO(NCCL_RAS, "RAS failed to find a matching ongoing collective for response %s:%ld from %s!",
+         ncclSocketToString(&msg->collResp.rootAddr, line), msg->collResp.rootId,
+         ncclSocketToString(&sock->sock.addr, rasLine));
+    goto exit;
+  }
+
+  coll->nLegTimeouts += msg->collResp.nLegTimeouts;
+  assert(sock->connIdx != -1);
+  // Count the response only if its leg is still outstanding (legs we gave up on have been counted already).
+  for (int i = 0; !expected && i < coll->nFwdSent; i++) {
+    if (coll->fwdConns[i] == sock->connIdx) {
+      coll->fwdConns[i] = -1;
+      coll->nFwdRecv++;
+      expected = true;
+    }
+  }
+  if (msg->collResp.nData > 0) {
+    // Collective-specific merging of the response into locally accumulated data.
+    if (coll->type == RAS_COLL_CONNS)
+      NCCLCHECK(rasCollConnsMerge(coll, msg));
+    else if (coll->type == RAS_COLL_COMMS)
+      NCCLCHECK(rasCollCommsMerge(coll, msg));
+  }
+  // We merge the peers after merging the data, so that the data merge function can rely on peers being unchanged.
+  if (msg->collResp.nPeers > 0) {
+    NCCLCHECK(ncclRealloc(&coll->peers, coll->nPeers, coll->nPeers + msg->collResp.nPeers));
+    memcpy(coll->peers+coll->nPeers, msg->collResp.peers, msg->collResp.nPeers * sizeof(*coll->peers));
+    coll->nPeers += msg->collResp.nPeers;
+  }
+
+  // If that was the last response we were waiting for, send our response back.
+  if (expected && coll->nFwdSent == coll->nFwdRecv)
+    NCCLCHECK(rasCollReadyResp(coll));
+exit:
+  return ncclSuccess;
+}
+
+// Removes a connection from all ongoing collectives.  Called when a connection is experiencing a delay or is being
+// terminated.  Collectives received via connIdx are dropped; those awaiting its response stop waiting for it.
+void rasCollsPurgeConn(int connIdx) {
+  for (int i = 0; i < nRasCollectives; i++) {
+    struct rasCollective* coll = rasCollectives+i;
+    if (coll->type != RAS_MSG_NONE) {
+      char line[SOCKET_NAME_MAXLEN+1];
+      if (coll->fromConnIdx == connIdx) {
+        INFO(NCCL_RAS, "RAS purging collective %s:%ld because it comes from %s",
+             ncclSocketToString(&coll->rootAddr, line), coll->rootId,
+             ncclSocketToString(&rasConns[connIdx].addr, rasLine));
+        rasCollFree(coll); // The response would have gone back through connIdx, so the collective is abandoned.
+      } else {
+        for (int j = 0; j < coll->nFwdSent; j++) {
+          if (coll->fwdConns[j] == connIdx) {
+            coll->fwdConns[j] = -1;
+            coll->nFwdRecv++; // Counted as if a response had arrived...
+            coll->nLegTimeouts++; // ...but recorded as a timed-out leg.
+            INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld "
+                 "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)",
+                 ncclSocketToString(&rasConns[connIdx].addr, rasLine), ncclSocketToString(&coll->rootAddr, line),
+                 coll->rootId, coll->nFwdSent, coll->nFwdRecv, coll->nLegTimeouts);
+            if (coll->nFwdSent == coll->nFwdRecv)
+              (void)rasCollReadyResp(coll); // That was the last outstanding leg.
+            break;
+          }
+        } // for (j)
+      } // coll->fromConnIdx != connIdx
+    } // !RAS_MSG_NONE
+  } // for (i)
+}
+
+// Releases the buffers owned by a rasCollective entry and marks the entry as free (type RAS_MSG_NONE).
+void rasCollFree(struct rasCollective* coll) {
+  free(coll->data);
+  free(coll->peers);
+  free(coll->fwdConns);
+  coll->data = nullptr;
+  coll->peers = nullptr;
+  coll->fwdConns = nullptr;
+  coll->type = RAS_MSG_NONE;
+  coll->fromConnIdx = -1;
+}
+
+// Invoked from the main RAS thread loop to handle timeouts of the collectives.
+// We obviously want to have a reasonable *total* timeout that the RAS client can rely on, but we don't have strict
+// global coordination.  So we have, in effect, two timeouts: soft (5s) and hard (10s).  Soft equals the keep-alive
+// timeout.
+// When sending collective requests, we skip any connections that are experiencing delays.  After the 5s timeout, we
+// check again the status of all outstanding connections and if any is now delayed, we give up on it.
+// That works fine for directly observable delays, but if the problematic connection is further away from us, all
+// we can do is trust that the other peers will "do the right thing soon".  However, if there is a cascade of
+// problematic connections, they could still exceed the 5s total.  So after 10s we give up waiting no matter what
+// and send back whatever we have.  Unfortunately, the peer that the RAS client is connected to will in all likelihood
+// time out first, so at that point any delayed responses that eventually arrive are likely to be too late...
+void rasCollsHandleTimeouts(int64_t now, int64_t* nextWakeup) {
+  for (int collIdx = 0; collIdx < nRasCollectives; collIdx++) {
+    struct rasCollective* coll = rasCollectives+collIdx;
+    if (coll->type == RAS_MSG_NONE || coll->timeout == 0) // Skip free entries and collectives without a timeout.
+      continue;
+
+    if (now - coll->startTime > coll->timeout) {
+      // We've exceeded the leg timeout.  For all outstanding responses, check their connections.
+      if (!coll->timeoutWarned) {
+        INFO(NCCL_RAS, "RAS collective %s:%ld timeout warning (%lds) -- %d responses missing",
+             ncclSocketToString(&coll->rootAddr, rasLine), coll->rootId,
+             (now - coll->startTime) / CLOCK_UNITS_PER_SEC, coll->nFwdSent - coll->nFwdRecv);
+        coll->timeoutWarned = true;
+      }
+      for (int i = 0; i < coll->nFwdSent; i++) {
+        if (coll->fwdConns[i] != -1) {
+          struct rasConnection* conn = rasConns+coll->fwdConns[i];
+          char line[SOCKET_NAME_MAXLEN+1];
+          if (!conn->experiencingDelays && conn->sockIdx != -1) {
+            struct rasSocket* sock = rasSockets+conn->sockIdx;
+            // Ensure that the connection is fully established and operational, and that the socket hasn't been
+            // re-created during the handling of the collective (which would suggest that the request may have been
+            // lost).
+            if (sock->status == RAS_SOCK_READY && sock->createTime < coll->startTime)
+              continue; // The connection looks healthy, so keep waiting for this response.
+          }
+          // In all other cases we declare a timeout so that we can (hopefully) recover.
+          INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld "
+               "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)",
+               ncclSocketToString(&conn->addr, rasLine), ncclSocketToString(&coll->rootAddr, line),
+               coll->rootId, coll->nFwdSent, coll->nFwdRecv, coll->nLegTimeouts);
+          coll->fwdConns[i] = -1;
+          coll->nFwdRecv++;
+          coll->nLegTimeouts++;
+        } // if (coll->fwdConns[i] != -1)
+      } // for (i)
+      if (coll->nFwdSent == coll->nFwdRecv) {
+        (void)rasCollReadyResp(coll);
+      } else {
+        // At least some of the delays are *not* due to this process' connections experiencing delays, i.e., they
+        // must be due to delays at other processes.  Presumably those processes will give up waiting soon and the
+        // (incomplete) responses will arrive shortly, so we should wait a little longer.
+        if (now - coll->startTime > coll->timeout + RAS_COLLECTIVE_EXTRA_TIMEOUT) {
+          // We've exceeded even the longer timeout, which is unexpected.  Try to return whatever we have (though
+          // the originator of the collective, if it's not us, may have timed out already anyway).
+          INFO(NCCL_RAS, "RAS collective %s:%ld timeout error (%lds) -- giving up on %d missing responses",
+               ncclSocketToString(&coll->rootAddr, rasLine), coll->rootId,
+               (now - coll->startTime) / CLOCK_UNITS_PER_SEC, coll->nFwdSent - coll->nFwdRecv);
+          coll->nLegTimeouts += coll->nFwdSent - coll->nFwdRecv;
+          coll->nFwdRecv = coll->nFwdSent;
+          (void)rasCollReadyResp(coll);
+        } else {
+          *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout+RAS_COLLECTIVE_EXTRA_TIMEOUT);
+        }
+      } // coll->nFwdRecv < coll->nFwdSent
+    } else {
+      *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout); // Leg timeout is still in the future.
+    }
+  } // for (collIdx)
+}
+
+
+/////////////////////////////////////////////////////////////////////////
+// Functions related to the handling of the RAS_COLL_CONNS collective. //
+/////////////////////////////////////////////////////////////////////////
+
+// Initializes the accumulated data with just the local data for now.
+// For this particular collective, we keep some reduced statistical data (min/max/avg travel time) as well
+// as connection-specific info in case we observed a negative min travel time (which, ideally, shouldn't happen,
+// but the system clocks may not be perfectly in sync).  On success *pData holds *pNData newly allocated bytes.
+static ncclResult_t rasCollConnsInit(char** pData, int* pNData) {
+  struct rasCollConns connsData = {.travelTimeMin = INT64_MAX, .travelTimeMax = INT64_MIN};
+  struct rasCollConns* pConnsData;
+
+  // Update the statistical data first and in the process also calculate how much connection-specific space we
+  // will need.  The fill loop further down must select exactly the same connections as this one counts.
+  for (int i = 0; i < nRasConns; i++) {
+    struct rasConnection* conn = rasConns+i;
+    if (conn->inUse && conn->travelTimeCount > 0) {
+      if (connsData.travelTimeMin > conn->travelTimeMin)
+        connsData.travelTimeMin = conn->travelTimeMin;
+      if (connsData.travelTimeMax < conn->travelTimeMax)
+        connsData.travelTimeMax = conn->travelTimeMax;
+      connsData.travelTimeSum += conn->travelTimeSum;
+      connsData.travelTimeCount += conn->travelTimeCount;
+      connsData.nConns++;
+      if (conn->travelTimeMin < 0)
+        connsData.nNegativeMins++;
+    }
+  }
+
+  *pNData = sizeof(connsData) + connsData.nNegativeMins*sizeof(*connsData.negativeMins);
+  NCCLCHECK(ncclCalloc(pData, *pNData));
+  pConnsData = (struct rasCollConns*)*pData;
+  memcpy(pConnsData, &connsData, sizeof(*pConnsData));
+  if (connsData.nNegativeMins > 0) {
+    for (int i = 0, negMinsIdx = 0; i < nRasConns; i++) {
+      struct rasConnection* conn = rasConns+i;
+      if (conn->inUse && conn->travelTimeCount > 0 && conn->travelTimeMin < 0) { // Same filter as when counting.
+        struct rasCollConns::negativeMin* negativeMin = pConnsData->negativeMins+negMinsIdx;
+        memcpy(&negativeMin->source, &rasNetListeningSocket.addr, sizeof(negativeMin->source));
+        memcpy(&negativeMin->dest, &conn->addr, sizeof(negativeMin->dest));
+        negativeMin->travelTimeMin = conn->travelTimeMin;
+        negMinsIdx++;
+      }
+      assert(negMinsIdx <= connsData.nNegativeMins);
+    }
+  }
+
+  return ncclSuccess;
+}
+
+// Folds an incoming RAS_COLL_CONNS response into the locally accumulated data: the travel time statistics are
+// reduced and the sender's negative-minimum records are appended to ours.
+static ncclResult_t rasCollConnsMerge(struct rasCollective* coll, struct rasMsg* msg) {
+  // The payload follows the peers array, at an offset adjusted by ALIGN_SIZE for alignof(int64_t).
+  int payloadOffset = rasMsgLength(RAS_MSG_COLLRESP) + msg->collResp.nPeers*sizeof(*msg->collResp.peers);
+  ALIGN_SIZE(payloadOffset, alignof(int64_t));
+  const struct rasCollConns* incoming = (const struct rasCollConns*)(((char*)msg) + payloadOffset);
+  struct rasCollConns* accum = (struct rasCollConns*)coll->data;
+
+  // Reduce the statistics.
+  accum->travelTimeMin = std::min(accum->travelTimeMin, incoming->travelTimeMin);
+  accum->travelTimeMax = std::max(accum->travelTimeMax, incoming->travelTimeMax);
+  accum->travelTimeSum += incoming->travelTimeSum;
+  accum->travelTimeCount += incoming->travelTimeCount;
+  accum->nConns += incoming->nConns;
+
+  // Nothing to append unless the sender reported negative minimums.
+  if (incoming->nNegativeMins <= 0)
+    return ncclSuccess;
+
+  // Grow our buffer so that the incoming records fit right behind the data we already hold.
+  int grownSize = sizeof(*accum) +
+    (accum->nNegativeMins+incoming->nNegativeMins) * sizeof(*accum->negativeMins);
+  NCCLCHECK(ncclRealloc(&coll->data, coll->nData, grownSize));
+  // Re-read the pointer after the reallocation before touching the accumulated data again.
+  accum = (struct rasCollConns*)coll->data;
+  memcpy(coll->data+coll->nData, incoming->negativeMins,
+         incoming->nNegativeMins * sizeof(*accum->negativeMins));
+  accum->nNegativeMins += incoming->nNegativeMins;
+  coll->nData = grownSize;
+
+  return ncclSuccess;
+}
+
+
+/////////////////////////////////////////////////////////////////////////
+// Functions related to the handling of the RAS_COLL_COMMS collective. //
+/////////////////////////////////////////////////////////////////////////
+
+// Initializes the accumulated data with just the local data for now.
+// For this particular collective, we keep for every communicator information about every rank, to help identify
+// the missing ones and the discrepancies between the ones that did respond.  Holds ncclCommsMutex throughout.
+static ncclResult_t rasCollCommsInit(char** pData, int* pNData) {
+  struct rasCollComms* commsData;
+  int nComms = 0, nRanks = 0;
+  std::lock_guard<std::mutex> lock(ncclCommsMutex);
+
+  // Start by counting the communicators so that we know how much space to allocate.
+  // We also need to sort the comms array, to make the subsequent merging easier, both between the ranks (in case
+  // of multiple GPUs per process) and between the peers.
+  if (!ncclCommsSorted) {
+    qsort(ncclComms, nNcclComms, sizeof(*ncclComms), &ncclCommsCompare);
+    ncclCommsSorted = true;
+  }
+  for (int i = 0; i < nNcclComms; i++) {
+    if (ncclComms[i] == nullptr) // nullptr's are always at the end after sorting.
+      break;
+    if (i == 0) {
+      nComms = 1;
+    } else if (ncclComms[i]->commHash != ncclComms[i-1]->commHash) {
+      nComms++; // Entries sharing a commHash are adjacent and count as one communicator.
+    }
+    nRanks++;
+  }
+
+  // struct rasCollComms has nested variable-length arrays, which makes the size calculation and subsequent
+  // pointer manipulations somewhat unwieldy...
+  *pNData = sizeof(*commsData) + nComms * sizeof(*commsData->comms) + nRanks * sizeof(*commsData->comms[0].ranks);
+  NCCLCHECK(ncclCalloc(pData, *pNData));
+  commsData = (struct rasCollComms*)*pData;
+  commsData->nComms = nComms;
+
+  // comm points at the space in the accumulated data where the info about the current communicator is to be stored.
+  struct rasCollComms::comm* comm = commsData->comms;
+  for (int i = 0; i < nNcclComms; i++) {
+    struct rasCollComms::comm::rank* rank;
+    ncclResult_t asyncError;
+    if (ncclComms[i] == nullptr)
+      break;
+    if (i == 0 || ncclComms[i]->commHash != ncclComms[i-1]->commHash) {
+      if (i > 0) // Step over the previous communicator's header and the ranks stored right behind it.
+        comm = (struct rasCollComms::comm*)(((char*)(comm+1)) + comm->nRanks * sizeof(*comm->ranks));
+      comm->commHash = ncclComms[i]->commHash;
+      comm->commNRanks = ncclComms[i]->nRanks;
+      comm->nRanks = 0;
+    } else if (ncclComms[i]->nRanks != ncclComms[i-1]->nRanks) {
+      INFO(NCCL_RAS, "RAS encountered inconsistent communicator data: size %d != %d -- "
+           "possible commHash collision (0x%lx)", ncclComms[i-1]->nRanks, ncclComms[i]->nRanks, comm->commHash);
+      continue; // Short of failing, the best we can do is skip...
+    } else if (ncclComms[i]->rank == ncclComms[i-1]->rank) {
+      INFO(NCCL_RAS, "RAS encountered duplicate data for rank %d -- possible commHash collision (0x%lx)",
+           ncclComms[i]->rank, comm->commHash);
+      continue; // Short of failing, the best we can do is skip...
+    }
+    if (comm->nRanks == comm->commNRanks) {
+      INFO(NCCL_RAS,
+           "RAS encountered more ranks than the communicator size (%d) -- possible commHash collision (0x%lx)",
+           comm->commNRanks, comm->commHash);
+      continue; // Short of failing, the best we can do is skip...
+    }
+    rank = comm->ranks+comm->nRanks;
+    rank->commRank = ncclComms[i]->rank;
+    // rasNetSendCollReq initializes coll->peers[0] to our rasNetListeningSocket.addr, so peerIdx is initially
+    // always 0.  It will increase after we send this response back to the peer we got the request from.
+    rank->peerIdx = 0;
+    rank->collOpCount = ncclComms[i]->collOpCount;
+    rank->status.initState = ncclComms[i]->initState;
+    if (ncclCommGetAsyncError(ncclComms[i], &asyncError) == ncclSuccess) // Otherwise the field stays zeroed.
+      rank->status.asyncError = asyncError;
+    rank->status.finalizeCalled = (ncclComms[i]->finalizeCalled != 0);
+    rank->status.destroyFlag = (ncclComms[i]->destroyFlag != 0);
+    rank->status.abortFlag = (__atomic_load_n(ncclComms[i]->abortFlag, __ATOMIC_ACQUIRE) != 0);
+    rank->cudaDev = ncclComms[i]->cudaDev;
+    rank->nvmlDev = ncclComms[i]->nvmlDev;
+    comm->nRanks++;
+  }
+  assert(nComms == 0 || ((char*)(comm->ranks+comm->nRanks)) - (char*)commsData <= *pNData);
+
+  return ncclSuccess;
+}
+
+// Merges an incoming RAS_COLL_COMMS collective response message into the locally accumulated coll->data.
+// The merge relies on both inputs being ordered by commHash and, within a communicator, by commRank.  The result
+// is built in a newly allocated buffer that then replaces coll->data; an allocation failure is returned as is.
+static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* msg) {
+  int dataOffset = rasMsgLength(RAS_MSG_COLLRESP) + msg->collResp.nPeers*sizeof(*msg->collResp.peers);
+  ALIGN_SIZE(dataOffset, alignof(int64_t));
+  struct rasCollComms* msgData = (struct rasCollComms*)(((char*)msg) + dataOffset);
+  struct rasCollComms* collData = (struct rasCollComms*)coll->data;
+
+  if (msgData->nComms > 0) {
+    struct rasCollComms* newData = nullptr;
+
+    // Allocate the new buffer pessimistically (sized as the sum of the two old ones).
+    NCCLCHECK(ncclCalloc((char**)&newData, coll->nData + msg->collResp.nData));
+    struct rasCollComms::comm* collComm = collData->comms;
+    struct rasCollComms::comm* msgComm = msgData->comms;
+    struct rasCollComms::comm* newComm = newData->comms;
+
+    for (int collIdx = 0, msgIdx = 0; collIdx < collData->nComms || msgIdx < msgData->nComms; newData->nComms++) {
+      int cmp;
+      if (collIdx < collData->nComms && msgIdx < msgData->nComms)
+        cmp = (collComm->commHash < msgComm->commHash ? -1 : (collComm->commHash > msgComm->commHash ? 1 : 0));
+      else
+        cmp = (collIdx < collData->nComms ? -1 : 1);
+
+      if (cmp == 0 && collComm->commNRanks != msgComm->commNRanks) {
+        INFO(NCCL_RAS, "RAS encountered inconsistent communicator data: size %d != %d -- "
+             "possible commHash collision (0x%lx)", collComm->commNRanks, msgComm->commNRanks, collComm->commHash);
+        cmp = (collComm->commNRanks < msgComm->commNRanks ? -1 : 1);
+        // We try to preserve both separately, although the input data might already be messed up anyway...
+      }
+
+      if (cmp == 0) {
+        // Merge the comms.
+        newComm->commHash = collComm->commHash;
+        newComm->commNRanks = collComm->commNRanks;
+        if (collComm->nRanks + msgComm->nRanks > collComm->commNRanks) {
+          INFO(NCCL_RAS,
+               "RAS encountered more ranks (%d) than the communicator size (%d) -- possible commHash collision (0x%lx)",
+               collComm->nRanks + msgComm->nRanks, newComm->commNRanks, newComm->commHash);
+          // We'll skip the extras in the loop below.
+        }
+        // Merge the ranks.  newRankIdx counts the entries actually written, which is fewer than the sum of the
+        // inputs whenever extras or duplicates get skipped, so newComm->nRanks is set from it after the loop.
+        int newRankIdx = 0;
+        for (int collRankIdx = 0, msgRankIdx = 0;
+             collRankIdx < collComm->nRanks || msgRankIdx < msgComm->nRanks;
+             newRankIdx++) {
+          int cmpRank;
+          if (newRankIdx == newComm->commNRanks)
+            break; // Short of failing, the best we can do is skip...
+          if (collRankIdx < collComm->nRanks && msgRankIdx < msgComm->nRanks)
+            cmpRank = (collComm->ranks[collRankIdx].commRank < msgComm->ranks[msgRankIdx].commRank ? -1 :
+                       (collComm->ranks[collRankIdx].commRank > msgComm->ranks[msgRankIdx].commRank ? 1 : 0));
+          else
+            cmpRank = (collRankIdx < collComm->nRanks ? -1 : 1);
+
+          // There shouldn't be any overlaps in ranks between different sources.
+          if (cmpRank == 0) {
+            INFO(NCCL_RAS, "RAS encountered duplicate data for rank %d -- possible commHash collision (0x%lx)",
+                 collComm->ranks[collRankIdx].commRank, newComm->commHash);
+            msgRankIdx++; // Short of failing, the best we can do is skip...
+          }
+          memcpy(newComm->ranks+newRankIdx, (cmpRank <= 0 ? collComm->ranks+collRankIdx++ :
+                                             msgComm->ranks+msgRankIdx++), sizeof(*newComm->ranks));
+          if (cmpRank > 0) {
+            // peerIdx values from msgComm need to shift after merge.
+            newComm->ranks[newRankIdx].peerIdx += coll->nPeers;
+          }
+        } // for (newRankIdx)
+        newComm->nRanks = newRankIdx; // Keeps the pointer advance below consistent with what was written.
+        newComm = (struct rasCollComms::comm*)(((char*)(newComm+1)) + newComm->nRanks * sizeof(*newComm->ranks));
+        collComm = (struct rasCollComms::comm*)(((char*)(collComm+1)) + collComm->nRanks * sizeof(*collComm->ranks));
+        collIdx++;
+        msgComm = (struct rasCollComms::comm*)(((char*)(msgComm+1)) + msgComm->nRanks * sizeof(*msgComm->ranks));
+        msgIdx++;
+      } else if (cmp < 0) {
+        // Copy from collComm.
+        int commSize = sizeof(*collComm) + collComm->nRanks * sizeof(*collComm->ranks);
+        memcpy(newComm, collComm, commSize);
+        newComm = (struct rasCollComms::comm*)(((char*)(newComm)) + commSize);
+        collComm = (struct rasCollComms::comm*)(((char*)(collComm)) + commSize);
+        collIdx++;
+      } else { // cmp > 0
+        // Copy from msgComm.
+        int commSize = sizeof(*msgComm) + msgComm->nRanks * sizeof(*msgComm->ranks);
+        memcpy(newComm, msgComm, commSize);
+        for (int i = 0; i < newComm->nRanks; i++) {
+          // peerIdx values from msgComm need to shift after merge.
+          newComm->ranks[i].peerIdx += coll->nPeers;
+        }
+        newComm = (struct rasCollComms::comm*)(((char*)(newComm)) + commSize);
+        msgComm = (struct rasCollComms::comm*)(((char*)(msgComm)) + commSize);
+        msgIdx++;
+      } // cmp > 0
+    } // for (collIdx and msgIdx)
+
+    free(coll->data);
+    coll->data = (char*)newData;
+    // newComm points at the next element beyond the last one -- exactly what we need.
+    coll->nData = ((char*)newComm) - (char*)newData;
+  } // if (msgData->nComms > 0)
+
+  return ncclSuccess;
+}
+
+// Sorting callback for the ncclComms array: orders communicators by commHash and, for equal hashes, by rank.
+// nullptr entries compare greater than any communicator (and equal to each other), so they end up at the end.
+static int ncclCommsCompare(const void* p1, const void* p2) {
+  const ncclComm* lhs = *(const ncclComm* const*)p1;
+  const ncclComm* rhs = *(const ncclComm* const*)p2;
+
+  if (lhs == nullptr)
+    return (rhs == nullptr ? 0 : 1);
+  if (rhs == nullptr)
+    return -1;
+
+  if (lhs->commHash != rhs->commHash)
+    return (lhs->commHash < rhs->commHash ? -1 : 1);
+  return (lhs->rank < rhs->rank ? -1 : (lhs->rank > rhs->rank ? 1 : 0));
+}
diff --git a/src/ras/peers.cc b/src/ras/peers.cc
new file mode 100644
index 0000000000..f2692d3e17
--- /dev/null
+++ b/src/ras/peers.cc
@@ -0,0 +1,960 @@
+/*************************************************************************
+ * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NDEBUG // Comment out during development only!
+#include <cassert>
+
+#include "alloc.h"
+#include "checks.h"
+#include "comm.h"
+#include "nccl.h"
+#include "ras_internal.h"
+
+
+// All the known peer NCCL processes. The array is sorted by addr to ensure locality (within a node and hopefully
+// also within a DC).  The array may grow over time and it *includes* dead peers.
+struct rasPeerInfo* rasPeers;
+int nRasPeers;
+// Hash of the rasPeers array, for figuring out when to sync with a remote peer.
+uint64_t rasPeersHash;
+// Index of this process within the rasPeers array (-1 until known; may change over time as the array grows).
+static int myPeerIdx = -1;
+
+// Addresses of all the dead peers, sorted.  In principle we could instead have a flag in rasPeerInfo for this,
+// but we expect rasPeers to be largely static (and large at scale!) and rasDeadPeers to be fairly dynamic and
+// much smaller, so we prefer to keep the dead info separately so that we don't end up sending the possibly large
+// rasPeerInfo array around all the time.
+union ncclSocketAddress* rasDeadPeers;
+// The number of dead peers.
+int nRasDeadPeers;
+// The array size (may be larger than nRasDeadPeers).
+static int rasDeadPeersSize;
+// Hash of the rasDeadPeers array, for figuring out when to sync with a remote peer.
+uint64_t rasDeadPeersHash;
+
+static ncclResult_t rasRanksConvertToPeers(struct rasRankInit* ranks, int nranks,
+                                           struct rasPeerInfo** rankPeers, int *nRankPeers, int* newNRasPeers);
+static ncclResult_t rasPeersUpdate(struct rasPeerInfo* rankPeers, int* nRankPeers, int newNRasPeers = -1);
+
+static ncclResult_t rasNetUpdatePeers(const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers,
+                                      struct rasRankInit* ranks = nullptr, int nranks = 0, int fromConnIdx = -1);
+static ncclResult_t rasLinkPropagateUpdate(struct rasLink* link, const struct rasPeerInfo* newPeers, int nNewPeers,
+                                           bool updateDeadPeers, struct rasRankInit* ranks, int nranks,
+                                           int fromConnIdx);
+static ncclResult_t rasConnPropagateUpdate(struct rasConnection* conn, const struct rasPeerInfo* newPeers,
+                                           int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks);
+ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock);
+
+static ncclResult_t rasLinkReinitConns(struct rasLink* link);
+
+static ncclResult_t rasDeadPeersUpdate(union ncclSocketAddress* updatePeers, int* nUpdatePeers);
+static ncclResult_t getNewDeadEntry(union ncclSocketAddress** pAddr);
+
+static int rasAddrRankInitCompare(const void* k, const void* e);
+static int rasAddrPeerInfoCompare(const void* k, const void* e);
+static int rasRanksCompare(const void* e1, const void* e2);
+
+static void rasPeersDump();
+static void rasDeadPeersDump();
+static char* rasPeerDump(const struct rasPeerInfo* peer, char* result, size_t nres);
+
+
+/////////////////////////////////////////////////////////////////////////////
+// Functions related to the handling of local RAS_ADD_RANKS notifications. //
+/////////////////////////////////////////////////////////////////////////////
+
+// Handles the RAS_ADD_RANKS notification: merges the new ranks into the internal list of all RAS peers, then
+// notifies the peers over the RAS network, reconfiguring the network connections in the process.
+// The ranks array gets sorted in place along the way and is free()d before returning, on failure as well.
+ncclResult_t rasLocalHandleAddRanks(struct rasRankInit* ranks, int nranks) {
+  ncclResult_t ret = ncclSuccess;
+  struct rasPeerInfo* rankPeers = nullptr;
+  int nRankPeers = 0;
+  int newNRasPeers = 0;
+
+  INFO(NCCL_RAS, "RAS handling local addRanks request (old nRasPeers %d)", nRasPeers);
+
+  // Turn the incoming rasRankInit structures into our internal rasPeerInfo representation.
+  NCCLCHECKGOTO(rasRanksConvertToPeers(ranks, nranks, &rankPeers, &nRankPeers, &newNRasPeers), ret, fail);
+
+  // Merge them into the local rasPeers; nRankPeers is updated to the number of entries that were actually new.
+  NCCLCHECKGOTO(rasPeersUpdate(rankPeers, &nRankPeers, newNRasPeers), ret, fail);
+
+  INFO(NCCL_RAS, "RAS finished local processing of addRanks request (new nRasPeers %d, nRankPeers %d)",
+       nRasPeers, nRankPeers);
+  // Dump the peers only if something changed and the first rank carries our own address (we're the "root").
+  if (nRankPeers > 0 && memcmp(&rasNetListeningSocket.addr, &ranks[0].addr, sizeof(ranks[0].addr)) == 0)
+    rasPeersDump();
+
+  // Forward the changes through our RAS network links.
+  NCCLCHECKGOTO(rasNetUpdatePeers(rankPeers, nRankPeers, /*updateDeadPeers*/false, ranks, nranks), ret, fail);
+
+exit:
+  free(rankPeers);
+  free(ranks);
+  return ret;
+fail:
+  goto exit;
+}
+
+// Converts an array of rasRankInit structures into rasPeerInfo entries that can be merged into rasPeers.
+//   ranks, nranks -- the input ranks; the array gets sorted in place (by address and cudaDev).
+//   rankPeers     -- receives a newly allocated array of nranks elements; on failure *rankPeers is left nullptr.
+//   nRankPeers    -- receives the number of *rankPeers elements actually filled in.
+//   newNRasPeers  -- receives the size that rasPeers will need to have once the new entries are merged in.
+// Ranks with an all-zero address are skipped (in case of errors) and ranks that share an address (multiple CUDA
+// devices per process) are folded into a single entry, with one bit per device in cudaDevs/nvmlDevs.
+// Returns ncclSuccess or the error reported by the allocation.
+static ncclResult_t rasRanksConvertToPeers(struct rasRankInit* ranks, int nranks,
+                                           struct rasPeerInfo** rankPeers, int *nRankPeers, int* newNRasPeers) {
+  ncclResult_t ret = ncclSuccess;
+  int nOut = 0;    // Number of *rankPeers entries filled in so far.
+  int peerIdx = 0; // Cursor into the global rasPeers array; it only ever moves forward.
+
+  // Handy when checking for empty (in case of errors) addresses.
+  union ncclSocketAddress emptyAddr;
+  memset(&emptyAddr, '\0', sizeof(emptyAddr));
+
+  // Sort the input by address and cudaDev so that its order matches that of rasPeers.
+  qsort(ranks, nranks, sizeof(*ranks), &rasRanksCompare);
+
+  // nranks is an upper bound for the number of peers (several ranks may share one); nOut tracks the actual count.
+  *rankPeers = nullptr;
+  NCCLCHECKGOTO(ncclCalloc(rankPeers, nranks), ret, fail);
+
+  *newNRasPeers = nRasPeers;
+  for (int rankIdx = 0; rankIdx < nranks; rankIdx++) {
+    const struct rasRankInit* rank = &ranks[rankIdx];
+
+    if (memcmp(&rank->addr, &emptyAddr, sizeof(emptyAddr)) == 0) {
+      // Empty rank entry -- nothing to convert.
+      continue;
+    }
+
+    // A rank with the same address as the most recently added entry only contributes its devices to it.
+    if (nOut > 0) {
+      struct rasPeerInfo* prev = &(*rankPeers)[nOut-1];
+      if (memcmp(&prev->addr, &rank->addr, sizeof(rank->addr)) == 0) {
+        prev->cudaDevs |= (1UL << rank->cudaDev);
+        prev->nvmlDevs |= (1UL << rank->nvmlDev);
+        continue;
+      }
+    }
+
+    // Otherwise the rank starts a new entry.
+    assert(nOut < nranks);
+    struct rasPeerInfo* added = &(*rankPeers)[nOut];
+    memcpy(&added->addr, &rank->addr, sizeof(added->addr));
+    added->pid = rank->pid;
+    added->cudaDevs = (1UL << rank->cudaDev);
+    added->nvmlDevs = (1UL << rank->nvmlDev);
+    nOut++;
+
+    // Find out if rasPeers already has an entry with that address, so that the caller can know how many more
+    // entries will be needed: move the cursor to the first rasPeers entry that doesn't sort before the rank.
+    int cmp = 1;
+    while (peerIdx < nRasPeers) {
+      cmp = ncclSocketsCompare(&rank->addr, &rasPeers[peerIdx].addr);
+      if (cmp <= 0)
+        break;
+      peerIdx++;
+    }
+
+    if (peerIdx == nRasPeers || cmp < 0) {
+      // Either the rank is "greater than" all the existing peers or it goes in front of one of them; it needs
+      // a new rasPeers entry in both cases.
+      (*newNRasPeers)++;
+    } else {
+      // Duplicates (cmp == 0) between the rank array and the peers array will be merged.
+      assert(rank->pid == rasPeers[peerIdx].pid);
+    }
+  }
+  assert(peerIdx <= nRasPeers);
+  *nRankPeers = nOut;
+
+exit:
+  return ret;
+fail:
+  if (*rankPeers) {
+    free(*rankPeers);
+    *rankPeers = nullptr;
+  }
+  goto exit;
+}
+
+// Updates the rasPeers array with the new data (rankPeers).  If newNRasPeers is -1, the new array size is calculated
+// here first.  rankPeers gets updated in the process as well: entries that added no new CUDA devices are purged,
+// so as to minimize the amount of data we forward to our peers; on success, *nRankPeers is the new count.
+static ncclResult_t rasPeersUpdate(struct rasPeerInfo* rankPeers, int* nRankPeers, int newNRasPeers) {
+  ncclResult_t ret = ncclSuccess;
+  int rankPeerIdxDst;
+  int rankPeerIdx, peerIdx;
+
+  if (newNRasPeers == -1) {
+    // First calculate the new size of rasPeers.
+    newNRasPeers = nRasPeers;
+    for (rankPeerIdx = peerIdx = 0; rankPeerIdx < *nRankPeers; rankPeerIdx++) {
+      struct rasPeerInfo* rankPeer = rankPeers+rankPeerIdx;
+      struct rasPeerInfo* rasPeer = rasPeers+peerIdx;
+      int cmp = 1;
+
+      while (peerIdx < nRasPeers) {
+        cmp = ncclSocketsCompare(&rankPeer->addr, &rasPeer->addr);
+
+        if (cmp < 0) {
+          // rankPeer will go in front of rasPeer.
+          newNRasPeers++;
+          break;
+        }
+
+        peerIdx++;
+        rasPeer++;
+
+        if (cmp == 0)
+          break;
+      }
+      if (cmp > 0) // No more rasPeer entries -- rankPeer will go at the end.
+        newNRasPeers++;
+    }
+  }
+
+  // If needed, allocate a new, larger rasPeers array.
+  struct rasPeerInfo* newRasPeers; // Ends up aliasing rasPeers if no growth is needed (the merge is then in-place).
+  int myNewPeerIdx;
+  if (newNRasPeers > nRasPeers) {
+    NCCLCHECKGOTO(ncclCalloc(&newRasPeers, newNRasPeers), ret, fail);
+  } else {
+    newRasPeers = rasPeers;
+  }
+
+  // Now merge the rankPeers into newRasPeers.  In the process, modify rankPeers to become a "diff" between
+  // the old rasPeers and newRasPeers -- this will be the data structure to broadcast on the RAS network.
+  myNewPeerIdx = -1;
+  int newPeerIdx;
+  for (newPeerIdx = rankPeerIdx = peerIdx = 0; rankPeerIdx < *nRankPeers || peerIdx < nRasPeers;) {
+    struct rasPeerInfo* rankPeer = rankPeers+rankPeerIdx;
+    struct rasPeerInfo* rasPeer = rasPeers+peerIdx;
+    struct rasPeerInfo* newRasPeer = newRasPeers+newPeerIdx;
+
+    if (rankPeerIdx < *nRankPeers) {
+      if (peerIdx < nRasPeers) {
+        int cmp = ncclSocketsCompare(&rankPeer->addr, &rasPeer->addr);
+
+        if (cmp < 0) {
+          // rankPeer needs to occur before rasPeer -- that's possible only if we are adding new entries.
+          assert(newRasPeers != rasPeers);
+          // Add new entry to newRasPeers.
+          assert(newPeerIdx < newNRasPeers);
+          memcpy(newRasPeer, rankPeer, sizeof(*newRasPeer));
+          newPeerIdx++;
+          rankPeerIdx++;
+        }
+        else {
+          // cmp >= 0 -- Start by copying peer to newRasPeer, if needed.
+          if (newRasPeers != rasPeers) {
+            assert(newPeerIdx < newNRasPeers);
+            memcpy(newRasPeer, rasPeer, sizeof(*newRasPeer));
+          }
+          else { // in-place
+            assert(newRasPeer == rasPeer);
+          }
+
+          if (cmp == 0) {
+            // The address of rankPeer is the same as that of newRasPeer -- merge into it.
+            // First though calculate what GPUs from rankPeer are actually new (if any).
+            uint64_t newDevs = rankPeer->cudaDevs & ~newRasPeer->cudaDevs;
+            newRasPeer->cudaDevs |= rankPeer->cudaDevs;
+            // Update rankPeer->devs with the newly added devs only -- we'll clean it up at the end.
+            rankPeer->cudaDevs = newDevs;
+            // Repeat for nvmlDevs...
+            newDevs = rankPeer->nvmlDevs & ~newRasPeer->nvmlDevs;
+            newRasPeer->nvmlDevs |= rankPeer->nvmlDevs;
+            rankPeer->nvmlDevs = newDevs;
+            rankPeerIdx++;
+          }
+          // Given that we might've added new entries, we need to update myPeerIdx as well.
+          if (myPeerIdx == peerIdx)
+            myNewPeerIdx = newPeerIdx;
+          peerIdx++;
+          newPeerIdx++;
+        }
+      } else { // peerIdx == nRasPeers
+        // No more rasPeers -- add a new entry based on rank.
+        assert(newPeerIdx < newNRasPeers);
+        memcpy(newRasPeer, rankPeer, sizeof(*newRasPeer));
+        // If this is the first time this function is run, myPeerIdx will need to be set.  It's more work in that
+        // case as we need to compare the addresses of each peer until we find one.
+        if (myPeerIdx == -1 && memcmp(&newRasPeer->addr, &rasNetListeningSocket.addr, sizeof(newRasPeer->addr)) == 0)
+          myNewPeerIdx = newPeerIdx;
+        newPeerIdx++;
+        rankPeerIdx++;
+      }
+    } else { // rankPeerIdx == *nRankPeers
+      // No more rankPeers -- copy the rasPeer over if needed.
+      if (newRasPeers != rasPeers) {
+        assert(newPeerIdx < newNRasPeers);
+        memcpy(newRasPeer, rasPeer, sizeof(*newRasPeer));
+      }
+      else { // in-place at the end.
+        assert(newRasPeer == rasPeer);
+      }
+      if (myPeerIdx == peerIdx)
+        myNewPeerIdx = newPeerIdx;
+      peerIdx++;
+      newPeerIdx++;
+    }
+  }
+  assert(newPeerIdx == newNRasPeers);
+
+  if (newRasPeers != rasPeers) {
+    if (rasPeers)
+      free(rasPeers);
+    rasPeers = newRasPeers;
+    nRasPeers = newNRasPeers;
+    assert(myNewPeerIdx != -1);
+    myPeerIdx = myNewPeerIdx;
+  } else {
+    assert(myNewPeerIdx == myPeerIdx);
+  }
+  rasPeersHash = getHash((const char*)rasPeers, nRasPeers*sizeof(*rasPeers));
+
+  // Purge from rankPeers all entries that didn't actually contribute any new GPUs.
+  for (rankPeerIdx = rankPeerIdxDst = 0; rankPeerIdx < *nRankPeers; rankPeerIdx++) {
+    struct rasPeerInfo* rankPeer = rankPeers+rankPeerIdx;
+    if (rankPeer->cudaDevs != 0) { // Merged entries were reduced above to just their newly added devs.
+      if (rankPeerIdxDst != rankPeerIdx) {
+        memcpy(rankPeers+rankPeerIdxDst, rankPeer, sizeof(*rankPeers));
+      }
+      rankPeerIdxDst++;
+    }
+  }
+  assert(rankPeerIdxDst <= *nRankPeers);
+  *nRankPeers = rankPeerIdxDst;
+
+exit:
+  return ret;
+fail:
+  goto exit;
+}
+
+// Looks up a peer by its address using a binary search of the rasPeers array.  Returns the index of the matching
+// rasPeers entry or -1 if there is none.
+int rasPeerFind(const union ncclSocketAddress* addr) {
+  const void* found = bsearch(addr, rasPeers, nRasPeers, sizeof(*rasPeers), rasAddrPeerInfoCompare);
+  if (found == nullptr) return -1;
+  return (const struct rasPeerInfo*)found - rasPeers;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////
+// Functions related to the propagation of peers updates over the RAS network. //
+/////////////////////////////////////////////////////////////////////////////////
+
+// Propagates information about new peers through the RAS network links and then reconfigures the RAS network
+// to accommodate the newly added peers, by modifying the links and establishing new connections as needed.
+// ranks -- if provided -- lists all the peers that are already aware of this update (because they are members
+// of the new communicator being established) and thus don't need to be notified.  updateDeadPeers can be used,
+// however, to request at least the propagation of rasDeadPeers to such peers.
+// fromConnIdx -- if provided -- identifies the connection used to receive this update; there's no need to
+// propagate the update back through it.
+// If there are no new peers and no dead peers update was requested, returns right away without touching
+// the links.
+static ncclResult_t rasNetUpdatePeers(const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers,
+                                      struct rasRankInit* ranks, int nranks, int fromConnIdx) {
+  ncclResult_t ret = ncclSuccess;
+  const bool haveWork = (nNewPeers != 0 || updateDeadPeers);
+
+  if (haveWork) {
+    // Propagate the update first.  The results are deliberately ignored: errors at this stage are non-fatal
+    // since we can re-sync later around a keep-alive exchange.
+    (void)rasLinkPropagateUpdate(&rasNextLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConnIdx);
+    (void)rasLinkPropagateUpdate(&rasPrevLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConnIdx);
+
+    // Then recalculate the link peers, opening new connections if needed.
+    NCCLCHECKGOTO(rasLinkReinitConns(&rasNextLink), ret, fail);
+    NCCLCHECKGOTO(rasLinkReinitConns(&rasPrevLink), ret, fail);
+  }
+
+exit:
+  return ret;
+fail:
+  goto exit;
+}
+
+// Sends a peers update through every connection associated with the given link, except for the connection the
+// update arrived on.  See rasNetUpdatePeers for the explanation of the function arguments.  Always succeeds.
+static ncclResult_t rasLinkPropagateUpdate(struct rasLink* link, const struct rasPeerInfo* newPeers, int nNewPeers,
+                                           bool updateDeadPeers, struct rasRankInit* ranks, int nranks,
+                                           int fromConnIdx) {
+  int i = 0;
+  while (i < link->nConns) {
+    const int connIdx = link->conns[i++].connIdx;
+    // Entries with a connIdx of -1 are skipped, and so is the connection that we received this notification
+    // from in the first place (while it wouldn't loop indefinitely, it would add a needless extra exchange).
+    if (connIdx == -1 || connIdx == fromConnIdx)
+      continue;
+    // A failed propagation is not considered fatal (we will retry after a keep-alive), hence the ignored result.
+    (void)rasConnPropagateUpdate(rasConns+connIdx, newPeers, nNewPeers, updateDeadPeers, ranks, nranks);
+  }
+
+  return ncclSuccess;
+}
+
+// Sends a peers update down a single connection, provided that its socket is ready and that the peer at the other
+// end isn't already known to be aware of the update.  See rasNetUpdatePeers for the explanation of the function
+// arguments.
+static ncclResult_t rasConnPropagateUpdate(struct rasConnection* conn, const struct rasPeerInfo* newPeers,
+                                           int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks) {
+  // Nothing gets sent unless the connection has a socket in the ready state.
+  if (conn->sockIdx == -1 || rasSockets[conn->sockIdx].status != RAS_SOCK_READY) {
+    return ncclSuccess;
+  }
+
+  // If we have the rank info (and no dead peers update was requested), check if the peer on the other side of
+  // this connection has participated in the new communicator; if so, it needs no notification.
+  if (ranks != nullptr && !updateDeadPeers &&
+      bsearch(&conn->addr, ranks, nranks, sizeof(*ranks), rasAddrRankInitCompare) != nullptr) {
+    return ncclSuccess;
+  }
+
+  // It did not participate or we don't know -- we should send an update to that peer then.
+  NCCLCHECK(rasConnSendPeersUpdate(conn, newPeers, nNewPeers));
+
+  return ncclSuccess;
+}
+
+// Sends a RAS_MSG_PEERSUPDATE message, which can include both the rasPeers (preferably only the newly added peers
+// rather than the complete array, to save on the network bandwidth) and rasDeadPeers (sent in full if at all, as it's
+// assumed to be much smaller).  A part whose hash this peer already has is omitted; nothing is sent if both are.
+ncclResult_t rasConnSendPeersUpdate(struct rasConnection* conn, const struct rasPeerInfo* peers, int nPeers) {
+  struct rasMsg* msg = nullptr;
+  int msgLen;
+  int deadPeersOffset = 0;
+  int nDeadPeers;
+
+  if (conn->lastSentPeersHash == rasPeersHash || conn->lastRecvPeersHash == rasPeersHash)
+    nPeers = 0;
+  // We expect the rasDeadPeers array to be much smaller than rasPeers so if we send it, we send it in full.
+  nDeadPeers = (conn->lastSentDeadPeersHash == rasDeadPeersHash ||
+                conn->lastRecvDeadPeersHash == rasDeadPeersHash ? 0 : nRasDeadPeers);
+
+  if (nPeers == 0 && nDeadPeers == 0)
+    goto exit;
+
+  msgLen = rasMsgLength(RAS_MSG_PEERSUPDATE) + nPeers*sizeof(*peers);
+  if (nDeadPeers > 0) {
+    ALIGN_SIZE(msgLen, alignof(union ncclSocketAddress));
+    deadPeersOffset = msgLen;
+    msgLen += nDeadPeers*sizeof(*rasDeadPeers);
+  }
+
+  NCCLCHECK(rasMsgAlloc(&msg, msgLen));
+  msg->type = RAS_MSG_PEERSUPDATE;
+  msg->peersUpdate.peersHash = rasPeersHash;
+  msg->peersUpdate.nPeers = nPeers;
+  msg->peersUpdate.deadPeersHash = rasDeadPeersHash;
+  msg->peersUpdate.nDeadPeers = nDeadPeers;
+
+  // Each array is copied only if non-empty: passing memcpy() a null pointer is undefined behavior even for a
+  // zero length, and rasDeadPeers is null until allocated (peers may be null as well if the caller has none).
+  if (nPeers > 0) {
+    memcpy(msg->peersUpdate.peers, peers, nPeers * sizeof(msg->peersUpdate.peers[0]));
+    conn->lastSentPeersHash = rasPeersHash;
+  }
+  if (nDeadPeers > 0) {
+    memcpy(((char*)msg)+deadPeersOffset, rasDeadPeers, nDeadPeers * sizeof(*rasDeadPeers));
+    conn->lastSentDeadPeersHash = rasDeadPeersHash;
+  }
+
+  INFO(NCCL_RAS, "RAS sending a peersUpdate to %s (nPeers %d, nDeadPeers %d)",
+       ncclSocketToString(&conn->addr, rasLine), nPeers, nDeadPeers);
+
+  rasConnEnqueueMsg(conn, msg, msgLen);
+exit:
+  return ncclSuccess;
+}
+
+// Handles the RAS_MSG_PEERSUPDATE message on the receiver side.  The received data is merged into the local
+// rasPeers and rasDeadPeers arrays.  If the checksums of the resulting arrays don't match those from the message,
+// sends its own RAS_MSG_PEERSUPDATE back to the source, to ensure a sync.
+// Subsequently propagates the update to its own peers.
+ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) {
+  ncclResult_t ret = ncclSuccess;
+  struct rasMsg* newMsg = nullptr;
+  int newMsgLen = 0;
+  assert(sock->connIdx != -1);
+  struct rasConnection* conn = rasConns+sock->connIdx;
+  int nPeers, nDeadPeers;
+  int deadPeersOffset = 0;
+  bool updatePeers, updateDeadPeers;
+
+  INFO(NCCL_RAS, "RAS handling peersUpdate from %s (peersHash 0x%lx, deadPeersHash 0x%lx, nPeers %d, nDeadPeers %d)",
+       ncclSocketToString(&sock->sock.addr, rasLine), msg->peersUpdate.peersHash, msg->peersUpdate.deadPeersHash,
+       msg->peersUpdate.nPeers, msg->peersUpdate.nDeadPeers);
+  INFO(NCCL_RAS, "RAS my old rasPeersHash 0x%lx, rasDeadPeersHash 0x%lx, nRasPeers %d, nRasDeadPeers %d",
+       rasPeersHash, rasDeadPeersHash, nRasPeers, nRasDeadPeers);
+  conn->lastRecvPeersHash = msg->peersUpdate.peersHash;
+  conn->lastRecvDeadPeersHash = msg->peersUpdate.deadPeersHash;
+
+  // Prepare ours to send back.  We don't enqueue it right away because we want to make sure first that we need
+  // to send it.  We'll find out by comparing the hash values after the merge.
+  // We want to prepare the message pre-merge though because post-merge it will include the just received new peers,
+  // and it's pointless to send those back to where they just came from.
+  // nPeers and nDeadPeers are used primarily for message length calculations, so they have to assume the worst-case
+  // scenario (e.g., no overlap in case of nDeadPeers).
+  nPeers = (msg->peersUpdate.peersHash != rasPeersHash ? nRasPeers : 0);
+  nDeadPeers = (msg->peersUpdate.deadPeersHash != rasDeadPeersHash ? nRasDeadPeers+msg->peersUpdate.nDeadPeers : 0);
+  if (nPeers > 0 || nDeadPeers > 0) {
+    newMsgLen = rasMsgLength(RAS_MSG_PEERSUPDATE) + nPeers*sizeof(*rasPeers);
+    if (nDeadPeers > 0) {
+      ALIGN_SIZE(newMsgLen, alignof(union ncclSocketAddress));
+      newMsgLen += nDeadPeers*sizeof(*rasDeadPeers);
+    }
+    NCCLCHECKGOTO(rasMsgAlloc(&newMsg, newMsgLen), ret, fail);
+    newMsg->type = RAS_MSG_PEERSUPDATE;
+    // Note that after rasPeersUpdate below we may still decide not to send the peers.
+    memcpy(newMsg->peersUpdate.peers, rasPeers, nPeers * sizeof(newMsg->peersUpdate.peers[0]));
+    newMsg->peersUpdate.nPeers = nPeers;
+
+    if (nDeadPeers > 0) {
+      // Calculate the offset where dead peers are stored in the received message.  We do it before the peers
+      // update because it could modify msg->peersUpdate.nPeers...
+      deadPeersOffset = rasMsgLength(RAS_MSG_PEERSUPDATE) + msg->peersUpdate.nPeers * sizeof(msg->peersUpdate.peers[0]);
+      ALIGN_SIZE(deadPeersOffset, alignof(union ncclSocketAddress));
+    }
+
+    if (nPeers > 0)
+      NCCLCHECKGOTO(rasPeersUpdate(msg->peersUpdate.peers, &msg->peersUpdate.nPeers), ret, fail);
+    else
+      msg->peersUpdate.nPeers = 0;
+    if (nDeadPeers > 0)
+      NCCLCHECKGOTO(rasDeadPeersUpdate((union ncclSocketAddress*)(((char*)msg)+deadPeersOffset),
+                                       &msg->peersUpdate.nDeadPeers), ret, fail);
+    else
+      msg->peersUpdate.nDeadPeers = 0;
+
+    INFO(NCCL_RAS, "RAS finished local processing of peersUpdate "
+         "(new nRasPeers %d, nRasDeadPeers %d, nPeers %d, nDeadPeers %d)",
+         nRasPeers, nRasDeadPeers, msg->peersUpdate.nPeers, msg->peersUpdate.nDeadPeers);
+    if (msg->peersUpdate.nPeers > 0)
+      rasPeersDump();
+    if (msg->peersUpdate.nDeadPeers > 0)
+      rasDeadPeersDump();
+
+    // If post-merge the hashes are still different, send our (dead) peers back.
+    updatePeers = (conn->lastSentPeersHash != rasPeersHash && conn->lastRecvPeersHash != rasPeersHash);
+    updateDeadPeers = (conn->lastSentDeadPeersHash != rasDeadPeersHash &&
+                       conn->lastRecvDeadPeersHash != rasDeadPeersHash);
+    if (updatePeers || updateDeadPeers) {
+      newMsg->peersUpdate.peersHash = rasPeersHash;
+      newMsg->peersUpdate.deadPeersHash = rasDeadPeersHash;
+      if (updatePeers) {
+        assert(nPeers > 0);
+        conn->lastSentPeersHash = rasPeersHash;
+      } else {
+        // If hashes match, make sure that we don't send the rasPeers back.
+        newMsg->peersUpdate.nPeers = 0;
+      }
+
+      // We need to recalculate the message size from scratch now that both rasPeers and rasDeadPeers may have changed.
+      newMsgLen = rasMsgLength(RAS_MSG_PEERSUPDATE) + newMsg->peersUpdate.nPeers * sizeof(*rasPeers);
+
+      if (updateDeadPeers) {
+        assert(nRasDeadPeers > 0);
+        conn->lastSentDeadPeersHash = rasDeadPeersHash;
+
+        ALIGN_SIZE(newMsgLen, alignof(union ncclSocketAddress));
+        deadPeersOffset = newMsgLen;
+        newMsgLen += nRasDeadPeers*sizeof(*rasDeadPeers);
+
+        // Copy the post-merge array in full; nDeadPeers is just the pre-merge worst-case bound used for sizing.
+        memcpy(((char*)newMsg)+deadPeersOffset, rasDeadPeers, nRasDeadPeers * sizeof(*rasDeadPeers));
+        newMsg->peersUpdate.nDeadPeers = nRasDeadPeers;
+      } else {
+        newMsg->peersUpdate.nDeadPeers = 0;
+      }
+
+      INFO(NCCL_RAS, "RAS sending back a peersUpdate (nPeers %d, nDeadPeers %d)",
+           newMsg->peersUpdate.nPeers, newMsg->peersUpdate.nDeadPeers);
+
+      rasConnEnqueueMsg(conn, newMsg, newMsgLen);
+      newMsg = nullptr;
+    } // if (updatePeers || updateDeadPeers)
+
+    // Propagate the changes through our RAS network links.
+    NCCLCHECKGOTO(rasNetUpdatePeers(msg->peersUpdate.peers, msg->peersUpdate.nPeers, updateDeadPeers, nullptr, 0,
+                                    sock->connIdx), ret, fail);
+  }
+
+exit:
+  rasMsgFree(newMsg);
+  return ret;
+fail:
+  goto exit;
+}
+
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// Functions related to the (re-)configuration of RAS connections after a peers update. //
+//////////////////////////////////////////////////////////////////////////////////////////
+
+// Reinitializes the connection(s) of a particular link, following a peers update.
+// Adding new peers can affect the calculation of the link's primary connection and also the fallbacks.
+// The newly added peers could also shift all the existing peerIdx values, invalidating the values in rasLinkConn
+// structures, so it's better to drop it all (link->nConns is reset to 0) and recalculate from scratch.
+// We recalculate the primary peer; if an active connection to it already exists, then we're done.  If there
+// is no connection, we create one.  If a connection exists but is experiencing delays then we add a fallback and
+// the process repeats.  Returns ncclSuccess unless an allocation or rasConnCreate fails.
+// External conns are dropped from the links as well (they will be re-created via keepAlive messages as needed).
+static ncclResult_t rasLinkReinitConns(struct rasLink* link) {
+  struct rasLinkConn* linkConn;
+  struct rasConnection* conn = nullptr;
+  int newPeerIdx = myPeerIdx; // The search for the primary peer starts from our own position.
+
+  if (link->connsSize == 0) { // No conns array yet: allocate the initial one.
+    link->connsSize = RAS_INCREMENT;
+    NCCLCHECK(ncclCalloc(&link->conns, link->connsSize));
+  }
+  link->nConns = 0; // Forget all the existing entries; they are recalculated below.
+
+  // Establish a connection for this link.  We iterate as long as the connections we find are experiencing delays.
+  while (newPeerIdx != -1) {
+    if (link->nConns == link->connsSize) { // Make room for one more entry.
+      NCCLCHECK(ncclRealloc(&link->conns, link->connsSize, link->connsSize+RAS_INCREMENT));
+      link->connsSize += RAS_INCREMENT;
+    }
+    // isFallback is set only when the previous entry (the starting point of this search) was itself a fallback.
+    newPeerIdx = rasLinkCalculatePeer(link, newPeerIdx, /*isFallback*/link->nConns > 1);
+    if (newPeerIdx == -1) {
+      INFO(NCCL_RAS, "RAS link %d: no more fallbacks to add (nConns %d)", link->direction, link->nConns);
+      if (link->nConns > 0) // With no entries at all, we still record an empty (-1) primary below.
+        break;
+    }
+    linkConn = link->conns+link->nConns;
+    linkConn->peerIdx = newPeerIdx;
+    linkConn->connIdx = (newPeerIdx != -1 ? rasConnFind(&rasPeers[newPeerIdx].addr) : -1);
+    linkConn->external = false;
+
+    // If the calculated connection does not exist, then we are at the end of the chain and this is the last iteration.
+    // Depending on the circumstances, we may first need to create that connection.
+    if (linkConn->connIdx == - 1) {
+      if (link->nConns == 0) {
+        if (linkConn->peerIdx != -1) {
+          INFO(NCCL_RAS, "RAS link %d: %s primary connection with %s",
+               link->direction, (myPeerIdx < linkConn->peerIdx ? "opening new" : "calculated deferred"),
+               ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine));
+          // We try to initiate primary connections from the side with a lower address (and thus an earlier peer index)
+          // to avoid races and the creation of duplicate connections.
+          if (myPeerIdx < linkConn->peerIdx) {
+            NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->connIdx));
+          }
+          else { // If we didn't initiate the connection, start the timeout.
+            link->lastUpdatePeersTime = clockNano();
+          }
+        } // if (linkConn->peerIdx != -1)
+      } else { // link->nConns > 0
+        INFO(NCCL_RAS, "RAS link %d: opening new fallback connection %d with %s",
+             link->direction, link->nConns, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine));
+        NCCLCHECK(rasConnCreate(&rasPeers[newPeerIdx].addr, &linkConn->connIdx));
+      } // link->nConns > 0
+    } else { // linkConn->connIdx != -1
+      if (link->nConns == 0) {
+        INFO(NCCL_RAS, "RAS link %d: calculated existing primary connection with %s",
+             link->direction, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine));
+      } else {
+        INFO(NCCL_RAS, "RAS link %d: calculated existing fallback connection %d with %s",
+             link->direction, link->nConns, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine));
+      }
+    }
+    link->nConns++; // The entry is kept even if it has no connection attached.
+    if (linkConn->connIdx == -1) // Deferred primary or no peer at all: nothing further to examine.
+      break;
+    conn = rasConns+linkConn->connIdx;
+
+    // We check if the connection already went through the fallback calculation; if so, we'll need to create a new
+    // fallback in the next iteration, to ensure that RAS will keep retrying.
+    if (!conn->experiencingDelays)
+      break;
+
+    INFO(NCCL_RAS, "RAS connection experiencingDelays %d, startRetryTime %.2fs, socket status %d",
+         conn->experiencingDelays, (clockNano()-conn->startRetryTime)/1e9,
+         (conn->sockIdx == -1 ? -1 : rasSockets[conn->sockIdx].status));
+  }
+
+  return ncclSuccess;
+}
+
+// Calculates the index of the peer on the RAS network.  Can also be used to calculate the index of the next fallback
+// peer.  Returns an index into rasPeers, or -1 if the search wraps all the way around to myPeerIdx.
+// In the simplest case we want to try the "next closest" fallback, although we still need to check for and skip
+// any dead peers.
+// For fallbacks to fallbacks, we also apply a more pessimistic policy.  We skip all the remaining RAS threads that
+// are on the same node as the previous fallback (unless it's the same node that we're running on or we have strong
+// indications that the node is up).  We do that to avoid having to excessively wait iterating through, say, 8
+// processes when a whole node might be down.
+int rasLinkCalculatePeer(const struct rasLink* link, int peerIdx, bool isFallback) {
+  int newPeerIdx = (peerIdx + link->direction + nRasPeers) % nRasPeers; // Advance by direction, wrapping around.
+  do {
+    if (isFallback && !ncclSocketsSameNode(&rasPeers[peerIdx].addr, &rasNetListeningSocket.addr)) {
+      // peerIdx is a fallback and it is not running on the same node as us.
+      int tryPeerIdx = newPeerIdx;
+      int tryConnIdx = -1;
+
+      // Try to skip the remaining peers on the same node as peerIdx.  We may end up skipping over some peers that
+      // are alive, which is fine -- they will still have connectivity with the rest of the RAS network, just a
+      // little suboptimal one.
+      while (ncclSocketsSameNode(&rasPeers[tryPeerIdx].addr, &rasPeers[peerIdx].addr)) {
+        if (!rasPeerIsDead(&rasPeers[tryPeerIdx].addr)) {
+          tryConnIdx = rasConnFind(&rasPeers[tryPeerIdx].addr);
+          if (tryConnIdx != -1) {
+            struct rasConnection* tryConn = rasConns+tryConnIdx;
+            // Check if the connection is fully established and operational, i.e., if the underlying socket
+            // is ready and there's been recent communication on it.
+            if (tryConn->sockIdx != -1 && rasSockets[tryConn->sockIdx].status == RAS_SOCK_READY &&
+                !tryConn->experiencingDelays) {
+              // We convinced ourselves that the node is not down.  We don't adjust newPeerIdx in
+              // this case.  This is the only case when tryConnIdx != -1 after this loop.
+              break;
+            }
+          } // if (tryConnIdx != -1)
+        } // if (!rasPeerIsDead(&rasPeers[tryPeerIdx].addr))
+
+        tryConnIdx = -1;
+        tryPeerIdx = (tryPeerIdx + nRasPeers + link->direction) % nRasPeers;
+        if (tryPeerIdx == myPeerIdx) // Wrapped around to ourselves: no candidates are left.
+          break;
+      }
+
+      if (tryConnIdx == -1) // No healthy connection was found on that node: jump past it.
+        newPeerIdx = tryPeerIdx;
+      if (tryPeerIdx == myPeerIdx)
+        break;
+    } // if (isFallback && !ncclSocketsSameNode(&rasPeers[peerIdx].addr, &rasNetListeningSocket.addr))
+    // Whatever was decided above, never settle on a peer that is known to be dead.
+    if (rasPeerIsDead(&rasPeers[newPeerIdx].addr)) {
+      newPeerIdx = (newPeerIdx + nRasPeers + link->direction) % nRasPeers;
+    }
+    else
+      break;
+  } while (newPeerIdx != myPeerIdx);
+
+  return (newPeerIdx != myPeerIdx ? newPeerIdx : -1);
+}
+
+
+//////////////////////////////////////////////////////
+// Functions related to the handling of dead peers. //
+//////////////////////////////////////////////////////
+
+// Records addr in the local rasDeadPeers array, keeping the array sorted and rasDeadPeersHash up to date.
+// Peers that are already listed are left untouched.  Propagation, reconfiguration, etc., is up to the caller.
+ncclResult_t rasPeerDeclareDead(const union ncclSocketAddress* addr) {
+  if (rasPeerIsDead(addr))
+    return ncclSuccess; // Nothing to do: the peer is already on the list.
+
+  // Append the new entry, then restore the sort order that the bsearch-based lookup relies on.
+  union ncclSocketAddress* slot;
+  NCCLCHECK(getNewDeadEntry(&slot));
+  memcpy(slot, addr, sizeof(*slot));
+  qsort(rasDeadPeers, nRasDeadPeers, sizeof(*rasDeadPeers), &ncclSocketsCompare);
+  rasDeadPeersHash = getHash((const char*)rasDeadPeers, nRasDeadPeers*sizeof(*rasDeadPeers));
+
+  INFO(NCCL_RAS, "RAS declaring peer %s as DEAD; rasDeadPeersHash 0x%lx",
+       ncclSocketToString(addr, rasLine), rasDeadPeersHash);
+  return ncclSuccess;
+}
+
+// Invoked when an incoming RAS_MSG_PEERSUPDATE includes info on dead peers.  Merges the updatePeers array (expected
+// to be sorted the same way as rasDeadPeers) into rasDeadPeers.  Any propagation needs to be handled outside of this
+// function, though it *does* disconnect any connections with the newly dead peers.
+// On entry *nUpdatePeers is the length of updatePeers; on return it is the number of newly added dead entries.
+static ncclResult_t rasDeadPeersUpdate(union ncclSocketAddress* updatePeers, int* nUpdatePeers) {
+  static union ncclSocketAddress* newPeers = nullptr; // NOTE(review): both are assigned before use on every
+  static union ncclSocketAddress* oldPeers;           // call, so "static" looks unnecessary -- confirm.
+
+  if (*nUpdatePeers == 0)
+    return ncclSuccess;
+
+  // Pessimistically estimate the new size of rasDeadPeers.
+  int nNewPeers = nRasDeadPeers + *nUpdatePeers;
+  if (nNewPeers > rasDeadPeersSize) {
+    nNewPeers = ROUNDUP(nNewPeers, RAS_INCREMENT);
+
+    NCCLCHECK(ncclCalloc(&newPeers, nNewPeers));
+    oldPeers = rasDeadPeers;
+  } else {
+    // We don't need to allocate a new array in this case.  We just shift the existing content to the end of the
+    // array to make room in the front for merging.
+    oldPeers = rasDeadPeers+(rasDeadPeersSize-nRasDeadPeers);
+    memmove(oldPeers, rasDeadPeers, nRasDeadPeers*sizeof(*rasDeadPeers));
+    newPeers = rasDeadPeers;
+  }
+
+  // Merge updatePeers with oldPeers into newPeers.
+  int oldPeersIdx, updatePeersIdx, newPeersIdx;
+  for (oldPeersIdx = updatePeersIdx = newPeersIdx = 0; oldPeersIdx < nRasDeadPeers || updatePeersIdx < *nUpdatePeers;) {
+    int cmp;
+    if (oldPeersIdx < nRasDeadPeers && updatePeersIdx < *nUpdatePeers) {
+      cmp = ncclSocketsCompare(oldPeers+oldPeersIdx, updatePeers+updatePeersIdx);
+    } else {
+      cmp = (oldPeersIdx < nRasDeadPeers ? -1 : 1); // One input is exhausted: drain the other one.
+    }
+    // cmp <= 0: take the existing entry; cmp > 0: take the update's entry (an address we didn't have yet).
+    memmove(newPeers+newPeersIdx++, (cmp <= 0 ? oldPeers+oldPeersIdx : updatePeers+updatePeersIdx), sizeof(*newPeers));
+    if (cmp <= 0)
+      oldPeersIdx++;
+    if (cmp > 0) { // Newly dead peer: drop any connection we may have with it.
+      rasConnDisconnect(updatePeers+updatePeersIdx);
+    }
+    if (cmp >= 0) // Also consumes duplicates (cmp == 0), which are stored only once.
+      updatePeersIdx++;
+  }
+  *nUpdatePeers = newPeersIdx - nRasDeadPeers;
+  nRasDeadPeers = newPeersIdx;
+
+  if (newPeers != rasDeadPeers) { // The merge went into a freshly allocated array: swap it in.
+    free(rasDeadPeers);
+    rasDeadPeers = newPeers;
+    rasDeadPeersSize = nNewPeers;
+  }
+
+  rasDeadPeersHash = getHash((const char*)rasDeadPeers, nRasDeadPeers*sizeof(*rasDeadPeers));
+
+  return ncclSuccess;
+}
+
+// Hands out (through pAddr) a pointer to the next free entry of the rasDeadPeers array, first growing the array by
+// RAS_INCREMENT entries if it is full.  The entry is counted in nRasDeadPeers right away; the caller fills it in.
+static ncclResult_t getNewDeadEntry(union ncclSocketAddress** pAddr) {
+  if (nRasDeadPeers == rasDeadPeersSize) {
+    NCCLCHECK(ncclRealloc(&rasDeadPeers, rasDeadPeersSize, rasDeadPeersSize+RAS_INCREMENT));
+    rasDeadPeersSize += RAS_INCREMENT;
+  }
+  *pAddr = &rasDeadPeers[nRasDeadPeers++];
+  return ncclSuccess;
+}
+
+// Reports whether addr is listed in the (sorted) rasDeadPeers array; always false while the array is unallocated.
+bool rasPeerIsDead(const union ncclSocketAddress* addr) {
+  if (rasDeadPeers == nullptr) return false;
+  return bsearch(addr, rasDeadPeers, nRasDeadPeers, sizeof(*rasDeadPeers), ncclSocketsCompare) != nullptr;
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Auxiliary functions -- primarily sorting/searching callbacks, plus some debug output support. //
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// bsearch-style callback for arrays of struct rasRankInit: k points to the ncclSocketAddress being looked up, e to
+// an array element, of which only the addr field takes part in the comparison.
+static int rasAddrRankInitCompare(const void* k, const void* e) {
+  const struct rasRankInit* rankInit = static_cast<const struct rasRankInit*>(e);
+  // The key is an address already, so ncclSocketsCompare can take it as is.
+  return ncclSocketsCompare(k, &rankInit->addr);
+}
+
+// bsearch-style callback for arrays of struct rasPeerInfo: k points to the ncclSocketAddress being looked up, e to
+// an array element, of which only the addr field takes part in the comparison.
+static int rasAddrPeerInfoCompare(const void* k, const void* e) {
+  const struct rasPeerInfo* peerInfo = static_cast<const struct rasPeerInfo*>(e);
+  // The key is an address already, so ncclSocketsCompare can take it as is.
+  return ncclSocketsCompare(k, &peerInfo->addr);
+}
+
+// qsort-style callback ordering struct rasRankInit elements by addr first and by cudaDev second.
+static int rasRanksCompare(const void* e1, const void* e2) {
+  const struct rasRankInit* lhs = static_cast<const struct rasRankInit*>(e1);
+  const struct rasRankInit* rhs = static_cast<const struct rasRankInit*>(e2);
+  const int addrCmp = ncclSocketsCompare(&lhs->addr, &rhs->addr);
+  if (addrCmp != 0)
+    return addrCmp;
+  if (lhs->addr.sa.sa_family == 0) // Two empty addresses: there is nothing further to compare.
+    return 0;
+  assert(lhs->pid == rhs->pid); // Ranks sharing an address are expected to share the pid as well.
+  const int devCmp = (lhs->cudaDev > rhs->cudaDev) - (lhs->cudaDev < rhs->cudaDev);
+  assert(devCmp != 0); // There should be no complete duplicates within the rank array.
+  return devCmp;
+}
+
+// qsort/bsearch callback for ncclSocketAddress.  The ordering is: address family (IPv4 before IPv6, empty addresses
+// last), then the address, then the port.  The fields are not laid out in that order in memory, so each of them is
+// compared separately; memcmp is fine for the individual fields since they are in the network byte order.
+int ncclSocketsCompare(const void* p1, const void* p2) {
+  const union ncclSocketAddress* lhs = static_cast<const union ncclSocketAddress*>(p1);
+  const union ncclSocketAddress* rhs = static_cast<const union ncclSocketAddress*>(p2);
+  const int lhsFamily = lhs->sa.sa_family;
+  const int rhsFamily = rhs->sa.sa_family;
+  if (lhsFamily != rhsFamily) {
+    // Put empty addresses at the end (not that it matters...); at most one of the two can be empty here.
+    if (lhsFamily <= 0 || rhsFamily <= 0)
+      return (lhsFamily > 0 ? -1 : 1);
+    // AF_INET (2) is less than AF_INET6 (10).
+    return (lhsFamily < rhsFamily ? -1 : 1);
+  }
+
+  int result = 0; // Two empty addresses are equal...
+  switch (lhsFamily) {
+    case AF_INET:
+      result = memcmp(&lhs->sin.sin_addr, &rhs->sin.sin_addr, sizeof(lhs->sin.sin_addr));
+      if (result == 0)
+        result = memcmp(&lhs->sin.sin_port, &rhs->sin.sin_port, sizeof(lhs->sin.sin_port));
+      break;
+    case AF_INET6:
+      result = memcmp(&lhs->sin6.sin6_addr, &rhs->sin6.sin6_addr, sizeof(lhs->sin6.sin6_addr));
+      if (result == 0)
+        result = memcmp(&lhs->sin6.sin6_port, &rhs->sin6.sin6_port, sizeof(lhs->sin6.sin6_port));
+      break;
+    default:
+      assert(lhsFamily == 0); // The only remaining valid case are empty addresses.
+  }
+  return result;
+}
+
+// Tells whether two socket addresses belong to the same node (strictly speaking, to the same network interface of
+// one node): the address families and the IP addresses must match, whereas the ports are not looked at.
+bool ncclSocketsSameNode(const union ncclSocketAddress* a1, const union ncclSocketAddress* a2) {
+  if (a1->sa.sa_family != a2->sa.sa_family)
+    return false;
+  switch (a1->sa.sa_family) {
+    case AF_INET:
+      return memcmp(&a1->sin.sin_addr, &a2->sin.sin_addr, sizeof(a1->sin.sin_addr)) == 0;
+    case AF_INET6:
+      return memcmp(&a1->sin6.sin6_addr, &a2->sin6.sin6_addr, sizeof(a1->sin6.sin6_addr)) == 0;
+    default:
+      return true; // Two empty addresses are equal...
+  }
+}
+
+// Debug output routine: logs each rasPeers entry (flagging our own), followed by rasPeersHash if there were any.
+static void rasPeersDump() {
+  for (int idx = 0; idx < nRasPeers; idx++) {
+    const char* selfTag = (idx == myPeerIdx ? " [this process]" : "");
+    INFO(NCCL_RAS, "RAS peer %d: %s%s", idx, rasPeerDump(&rasPeers[idx], rasLine, sizeof(rasLine)), selfTag);
+  }
+  if (nRasPeers > 0)
+    INFO(NCCL_RAS, "RAS peersHash 0x%lx", rasPeersHash);
+}
+
+// Debug output routine: logs every rasDeadPeers entry (with full peer info if it is still found in rasPeers).
+static void rasDeadPeersDump() {
+  for (int idx = 0; idx < nRasDeadPeers; idx++) {
+    union ncclSocketAddress* deadAddr = &rasDeadPeers[idx];
+    const int peerIdx = rasPeerFind(deadAddr);
+    INFO(NCCL_RAS, "RAS dead peer %d: %s", idx, (peerIdx < 0 ? ncclSocketToString(deadAddr, rasLine) :
+                                                 rasPeerDump(&rasPeers[peerIdx], rasLine, sizeof(rasLine))));
+  }
+  if (nRasDeadPeers > 0)
+    INFO(NCCL_RAS, "RAS deadPeersHash 0x%lx", rasDeadPeersHash);
+}
+
+// Debug output routine: formats the address, pid, and GPU list of a rasPeers element into result; returns result.
+static char* rasPeerDump(const struct rasPeerInfo* peer, char* result, size_t nres) {
+  char addrStr[SOCKET_NAME_MAXLEN+1], gpusStr[1024];
+  const char* plural = (__builtin_popcountll(peer->cudaDevs) > 1 ? "s" : "");
+  snprintf(result, nres, "socket %s, pid %d, GPU%s %s", ncclSocketToString(&peer->addr, addrStr), peer->pid, plural,
+           rasGpuDevsToString(peer->cudaDevs, peer->nvmlDevs, gpusStr, sizeof(gpusStr)));
+  return result;
+}
diff --git a/src/ras/ras.cc b/src/ras/ras.cc
new file mode 100644
index 0000000000..4905d7a69c
--- /dev/null
+++ b/src/ras/ras.cc
@@ -0,0 +1,668 @@
+/*************************************************************************
+ * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NDEBUG // Comment out during development only!
+#include <cassert>
+#include <cstddef>
+#include <mutex>
+#include <poll.h>
+#include <unistd.h>
+
+#include "alloc.h"
+#include "checks.h"
+#include "comm.h"
+#include "nccl.h"
+#include "utils.h"
+#include "ras_internal.h"
+
+// Type of a notification from a local NCCL thread (see rasLocalHandle for the dispatch).
+typedef enum {
+  RAS_ADD_RANKS = 0, // Sent by ncclRasAddRanks; carries the addRanks payload.
+  RAS_TERMINATE = 1  // Sent by ncclRasCommFini once rasInitRefCount drops to zero; carries no payload.
+} rasNotificationType;
+
+// Used for communication from local NCCL threads to the RAS thread (passed by value through rasNotificationPipe).
+struct rasNotification {
+  rasNotificationType type; // Determines which member of the union (if any) is meaningful.
+  union {
+    struct {
+      struct rasRankInit* ranks; // Only the pointer travels through the pipe, not the array itself.
+      int nranks;
+    } addRanks; // Payload of RAS_ADD_RANKS.
+  };
+}; // Kept within PIPE_BUF, presumably so that each write to the pipe is atomic -- confirm.
+static_assert(sizeof(struct rasNotification) <= PIPE_BUF, "The rasNotification structure is too large");
+
+// These ensure that we get only one RAS port/thread per process.
+static std::mutex rasInitMutex;
+static bool rasInitialized = false; // Set by ncclRasCommInit once the RAS thread has been created.
+static int rasInitRefCount = 0; // Incremented by ncclRasCommInit, decremented by ncclRasCommFini.
+
+// The RAS network listening socket of this RAS thread (random port).
+struct ncclSocket rasNetListeningSocket;
+
+static pthread_t rasThread;
+
+// Used for communication from regular NCCL threads to the RAS thread.
+static std::mutex rasNotificationMutex;
+static int rasNotificationPipe[2] = {-1, -1}; // [0] is read by rasLocalHandle, [1] written by rasLocalNotify.
+
+// Data for the main poll() in the RAS thread.
+struct pollfd* rasPfds;
+static int nRasPfds;
+
+// We use it all over the place; no point in wasting the stack...
+char rasLine[SOCKET_NAME_MAXLEN+1];
+
+// An array holding the addresses of all NCCL communicators.  Modified by the NCCL threads (hence the mutex), read by
+// the RAS thread.
+std::mutex ncclCommsMutex;
+struct ncclComm** ncclComms = nullptr;
+int nNcclComms = 0; // Number of slots in ncclComms; unused slots hold nullptr.
+bool ncclCommsSorted = false; // Whether the array is currently sorted. We sort by the comms' commHash and rank.
+
+static ncclResult_t rasLocalNotify(const struct rasNotification* msg);
+static ncclResult_t rasLocalHandle();
+static void rasLocalHandleTerminate();
+
+static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSocket* sock);
+static ncclResult_t rasMsgHandleConnInitAck(const struct rasMsg* msg, struct rasSocket* sock);
+static ncclResult_t rasNetSendNack(struct rasSocket* sock);
+
+static void* rasThreadMain(void*);
+
+NCCL_PARAM(RasTimeoutFactor, "RAS_TIMEOUT_FACTOR", 1); // NOTE(review): presumably scales the RAS timeouts -- confirm.
+
+//////////////////////////////////////////////////
+// Functions invoked from regular NCCL threads. //
+//////////////////////////////////////////////////
+
+// Invoked by regular NCCL threads on every comm initialization.  This is the first function to call.
+// The myRank structure should be passed with the addr element initialized to the IP address of the bootstrap
+// network interface to use.  On a successful return, the address will be updated with the port number of the
+// RAS network listening socket.  The first successful call also starts the (single, per-process) RAS thread.
+ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank) {
+  ncclResult_t ret = ncclSuccess;
+  if (!rasInitialized) {
+    std::lock_guard<std::mutex> lock(rasInitMutex);
+    if (!rasInitialized) { // Re-check under the lock: another thread may have completed the initialization.
+      union ncclSocketAddress addr;
+      memcpy(&addr, &myRank->addr, sizeof(addr));
+      (addr.sa.sa_family == AF_INET ? addr.sin.sin_port : addr.sin6.sin6_port) = htons(0); // Port 0: any port.
+      NCCLCHECKGOTO(ncclSocketInit(&rasNetListeningSocket, &addr, NCCL_SOCKET_MAGIC, ncclSocketTypeRasNetwork,
+                                   /*abortFlag*/nullptr, /*asyncFlag*/1), ret, fail);
+      NCCLCHECKGOTO(ncclSocketListen(&rasNetListeningSocket), ret, fail);
+      INFO(NCCL_RAS, "RAS network listening socket at %s",
+           ncclSocketToString(&rasNetListeningSocket.addr, rasLine));
+      (void)rasClientInitSocket(); // The result is explicitly discarded.
+
+      SYSCHECKGOTO(pipe(rasNotificationPipe), "pipe", ret, fail);
+
+      PTHREADCHECKGOTO(pthread_create(&rasThread, nullptr, &rasThreadMain, nullptr), "pthread_create", ret, fail);
+      ncclSetThreadName(rasThread, "NCCL RAS");
+      (void)pthread_detach(rasThread);
+
+      rasInitialized = true;
+    }
+  }
+  ncclAtomicRefCountIncrement(&rasInitRefCount);
+
+  {
+    // Store comm in the first free slot of ncclComms, enlarging the array if there is none.
+    std::lock_guard<std::mutex> lock(ncclCommsMutex);
+    int i;
+    for (i = 0; i < nNcclComms; i++) {
+      if (ncclComms[i] == nullptr)
+        break;
+    }
+    if (i == nNcclComms) {
+      NCCLCHECK(ncclRealloc(&ncclComms, nNcclComms, nNcclComms+RAS_INCREMENT*8));
+      nNcclComms += RAS_INCREMENT*8;
+    }
+    ncclComms[i] = comm;
+    ncclCommsSorted = false;
+  }
+
+  if (myRank != nullptr)
+    memcpy(&myRank->addr, &rasNetListeningSocket.addr, sizeof(myRank->addr));
+
+exit:
+  return ret;
+fail:
+  // The descriptors start out as -1: close only those that pipe() opened and reset them so they can't be closed twice.
+  for (int end = 1; end >= 0; end--) {
+    if (rasNotificationPipe[end] == -1) continue;
+    (void)close(rasNotificationPipe[end]);
+    rasNotificationPipe[end] = -1;
+  }
+  (void)close(rasClientListeningSocket);
+  (void)ncclSocketClose(&rasNetListeningSocket);
+  goto exit;
+}
+
+// Invoked by regular NCCL threads on every comm termination.  Removes comm from ncclComms and, when the last
+// reference to RAS goes away, sends RAS_TERMINATE to the RAS thread.  Does nothing if RAS was never initialized.
+ncclResult_t ncclRasCommFini(const struct ncclComm* comm) {
+  if (!rasInitialized) return ncclSuccess;
+  {
+    std::lock_guard<std::mutex> lock(ncclCommsMutex);
+    int slot = 0;
+    while (slot < nNcclComms && ncclComms[slot] != comm)
+      slot++;
+    if (slot < nNcclComms) { // Found it: release the slot so that ncclRasCommInit can reuse it.
+      ncclComms[slot] = nullptr;
+      ncclCommsSorted = false;
+    }
+  }
+  if (ncclAtomicRefCountDecrement(&rasInitRefCount) != 0)
+    return ncclSuccess;
+  struct rasNotification msg;
+  msg.type = RAS_TERMINATE;
+  NCCLCHECK(rasLocalNotify(&msg));
+  return ncclSuccess;
+}
+
+// Invoked by regular NCCL threads on every (non-split) comm initialization.  Hands the info on all the ranks within
+// the communicator over to the RAS thread; only the ranks pointer is forwarded, the array is not copied here.
+ncclResult_t ncclRasAddRanks(struct rasRankInit* ranks, int nranks) {
+  struct rasNotification notification;
+  notification.addRanks.nranks = nranks;
+  notification.addRanks.ranks = ranks;
+  notification.type = RAS_ADD_RANKS;
+  NCCLCHECK(rasLocalNotify(&notification));
+  return ncclSuccess;
+}
+
+// Internal function running on regular NCCL threads -- asynchronously notifies the RAS thread by writing *msg to
+// rasNotificationPipe.  Succeeds without doing anything if RAS has not been initialized.
+static ncclResult_t rasLocalNotify(const struct rasNotification* msg) {
+  if (!rasInitialized)
+    return ncclSuccess;
+
+  // Serialize the writers so that notifications from multiple user threads can't get interleaved (it may not be
+  // strictly required, but it won't hurt)...
+  std::lock_guard<std::mutex> lock(rasNotificationMutex);
+  for (size_t sent = 0; sent < sizeof(*msg);) { // Keep going after a short write.
+    ssize_t written;
+    SYSCHECK(written = write(rasNotificationPipe[1], (char*)msg + sent, sizeof(*msg) - sent), "write");
+    sent += written;
+  }
+  return ncclSuccess;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////
+// Functions related to the handling of local notifications from NCCL threads. //
+/////////////////////////////////////////////////////////////////////////////////
+
+// Handles asynchronous local notifications arriving from regular NCCL threads: reads one complete rasNotification
+// from rasNotificationPipe (EOF yields ncclSystemError) and dispatches it by type (unknown: ncclInternalError).
+static ncclResult_t rasLocalHandle() {
+  struct rasNotification msg;
+  for (size_t got = 0; got < sizeof(msg);) { // Loop, as a single read() may return only part of the structure.
+    ssize_t nread;
+    SYSCHECK(nread = read(rasNotificationPipe[0], (char*)&msg + got, sizeof(msg) - got), "read");
+    if (nread == 0) // EOF
+      return ncclSystemError;
+    got += nread;
+  }
+  switch (msg.type) {
+    case RAS_ADD_RANKS:
+      NCCLCHECK(rasLocalHandleAddRanks(msg.addRanks.ranks, msg.addRanks.nranks));
+      break;
+    case RAS_TERMINATE:
+      rasLocalHandleTerminate();
+      break;
+    default:
+      WARN("RAS received unknown notification type %d", msg.type);
+      return ncclInternalError;
+  }
+  return ncclSuccess;
+}
+
+// Handles local RAS_TERMINATE notification (sent by ncclRasCommFini once rasInitRefCount drops to zero).
+static void rasLocalHandleTerminate() {
+  INFO(NCCL_RAS, "RAS handling local termination request");
+  // Beyond logging the request, we don't do anything for now.
+}
+
+
+////////////////////////////////////////////////
+// Generic functions related to RAS messages. //
+////////////////////////////////////////////////
+
+// Allocates a RAS message of msgLen bytes for sending and returns it through msg.  The allocation actually covers
+// an encapsulating rasMsgMeta structure, with local metadata stored in front of the message (*msg points inside it).
+// Must use rasMsgFree to free.
+ncclResult_t rasMsgAlloc(struct rasMsg** msg, size_t msgLen) {
+  const size_t totalLen = offsetof(struct rasMsgMeta, msg) + msgLen; // Metadata header plus the message proper.
+  struct rasMsgMeta* meta = nullptr;
+  NCCLCHECK(ncclCalloc((char**)&meta, totalLen));
+  *msg = &meta->msg;
+  // coverity[leaked_storage:FALSE] => rasMsgFree is used to free it
+  return ncclSuccess;
+}
+
+// Frees a message obtained from rasMsgAlloc (nullptr is accepted and ignored).  I.e., it should be used for sent
+// messages, not for received ones, as the latter are not embedded in a rasMsgMeta structure.
+void rasMsgFree(struct rasMsg* msg) {
+  if (msg == nullptr)
+    return;
+  // Step back from the message to the start of the enclosing rasMsgMeta, which is what was actually allocated.
+  free((char*)msg - offsetof(struct rasMsgMeta, msg));
+}
+
+// Enqueues a message (allocated with rasMsgAlloc) for sending down a RAS connection -- at the head of the send
+// queue if front is set, at its tail otherwise -- and requests POLLOUT if the socket can take the message now.
+void rasConnEnqueueMsg(struct rasConnection* conn, struct rasMsg* msg, size_t msgLen, bool front) {
+  // The metadata is stored right in front of the message.
+  struct rasMsgMeta* meta = (struct rasMsgMeta*)((char*)msg - offsetof(struct rasMsgMeta, msg));
+  // Null if the connection currently has no socket.
+  struct rasSocket* sock = (conn->sockIdx == -1 ? nullptr : rasSockets+conn->sockIdx);
+
+  meta->enqueueTime = clockNano();
+  meta->offset = 0;
+  meta->length = msgLen;
+
+  if (front)
+    ncclIntruQueueEnqueueFront(&conn->sendQ, meta);
+  else
+    ncclIntruQueueEnqueue(&conn->sendQ, meta);
+
+  // While the handshake is in progress only the connInit message may go out; everything else needs a ready socket.
+  if (sock != nullptr && (sock->status == RAS_SOCK_READY ||
+                          (sock->status == RAS_SOCK_HANDSHAKE && msg->type == RAS_MSG_CONNINIT))) {
+    rasPfds[sock->pfd].events |= POLLOUT;
+    return;
+  }
+
+  // It's not a bug, unless it's for things like keep-alive messages...
+  INFO(NCCL_RAS, "RAS enqueued message type %d on a non-ready connection with %s "
+       "(experiencingDelays %d, startRetryTime %.2fs, socket status %d)",
+       msg->type, ncclSocketToString(&conn->addr, rasLine),
+       conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0),
+       (sock == nullptr ? -1 : sock->status));
+}
+
+// Attempts to send the queued RAS messages to another RAS thread; *allSent reports if nothing sendable is left.
+ncclResult_t rasConnSendMsg(struct rasConnection* conn, int* closed, bool* allSent) {
+  struct ncclSocket* sock = &rasSockets[conn->sockIdx].sock; // Assumes conn->sockIdx != -1 -- confirm at callers.
+  struct rasMsgMeta* meta;
+  *closed = 0;
+  while ((meta = ncclIntruQueueHead(&conn->sendQ)) != nullptr) {
+    if (rasSockets[conn->sockIdx].status == RAS_SOCK_HANDSHAKE && meta->msg.type != RAS_MSG_CONNINIT) {
+      // We don't send anything beyond the handshake at this point.
+      meta = nullptr; // Makes *allSent true below, even though the message stays in the queue.
+      break;
+    }
+    if (meta->offset < sizeof(meta->length)) {
+      // Send the length of the message.
+      NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, &meta->length, sizeof(meta->length), &meta->offset, closed));
+      if (*closed) // NOTE: *allSent is left unset on this path.
+        return ncclSuccess;
+      if (meta->offset < sizeof(meta->length))
+        break;
+    }
+    // Send the body of the message.  meta->offset also counts the length prefix, hence the adjusted base and size.
+    NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, ((char*)&meta->msg)-sizeof(meta->length),
+                                 meta->length+sizeof(meta->length), &meta->offset, closed));
+    if (*closed) // NOTE: *allSent is left unset on this path, too.
+      return ncclSuccess;
+    if (meta->offset < meta->length+sizeof(meta->length)) // Partial send: leave the message at the queue head.
+      break;
+    ncclIntruQueueDequeue(&conn->sendQ);
+    free(meta); // The message is fully sent; this releases the rasMsgAlloc allocation.
+  }
+
+  *allSent = !meta; // meta is non-null only if a partially sent message remains at the head of the queue.
+
+  return ncclSuccess;
+}
+
+// Attempts to receive a message through a RAS socket; *msg is set only once a whole message has arrived.
+ncclResult_t rasMsgRecv(struct rasSocket* sock, struct rasMsg** msg, int* closed) {
+  *closed = 0;
+  if (sock->recvOffset < sizeof(sock->recvLength)) {
+    // Receive the length of the message.
+    NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &sock->sock, &sock->recvLength, sizeof(sock->recvLength),
+                                 &sock->recvOffset, closed));
+    if (*closed || sock->recvOffset < sizeof(sock->recvLength))
+      return ncclSuccess;
+    NCCLCHECK(ncclCalloc((char**)&sock->recvMsg, sock->recvLength)); // NOTE(review): peer-supplied, unchecked.
+  }
+  // Receive the body of the message.  recvOffset also counts the length prefix, hence the adjusted base and size.
+  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &sock->sock, ((char*)sock->recvMsg)-sizeof(sock->recvLength),
+                               sock->recvLength+sizeof(sock->recvLength), &sock->recvOffset, closed));
+  if (*closed || sock->recvOffset < sock->recvLength+sizeof(sock->recvLength))
+    return ncclSuccess;
+
+  *msg = sock->recvMsg; // The buffer is handed over; the socket no longer references it.
+  sock->recvMsg = nullptr;
+  sock->recvOffset = sock->recvLength = 0; // Reset the state for the next message.
+
+  return ncclSuccess;
+}
+
+
+//////////////////////////////////////////////////////////////////
+// Functions related to the handling of specific message types. //
+//////////////////////////////////////////////////////////////////
+
+// Invoked from the main RAS thread to dispatch incoming messages to the handler matching their type.
+ncclResult_t rasMsgHandle(struct rasMsg* msg, struct rasSocket* sock) {
+  switch (msg->type) {
+    case RAS_MSG_CONNINIT:
+      NCCLCHECK(rasMsgHandleConnInit(msg, sock)); break;
+    case RAS_MSG_CONNINITACK:
+      NCCLCHECK(rasMsgHandleConnInitAck(msg, sock)); break;
+    case RAS_MSG_KEEPALIVE:
+      NCCLCHECK(rasMsgHandleKeepAlive(msg, sock)); break;
+    case RAS_MSG_PEERSUPDATE:
+      NCCLCHECK(rasMsgHandlePeersUpdate(msg, sock)); break;
+    case RAS_MSG_COLLREQ:
+      NCCLCHECK(rasMsgHandleCollReq(msg, sock)); break;
+    case RAS_MSG_COLLRESP:
+      NCCLCHECK(rasMsgHandleCollResp(msg, sock)); break;
+    default:
+      WARN("RAS received unknown message type (%d) from %s", msg->type, ncclSocketToString(&sock->sock.addr, rasLine));
+      return ncclInternalError;
+  }
+  return ncclSuccess;
+}
+
+// Handles connInit, the first handshake message on a RAS socket (sent by the connecting side, handled on accept).
+static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSocket* sock) {
+  ncclResult_t ret = ncclSuccess;
+  struct rasConnection* conn = nullptr;
+  int connIdx, peerIdx;
+  struct rasMsg* newMsg = nullptr;
+  int newMsgLen;
+  char line[SOCKET_NAME_MAXLEN+1]; // Second buffer: rasLine is already taken within the same INFO call below.
+
+  INFO(NCCL_RAS, "RAS handling connInit from %s (version %d, listeningAddr %s, peersHash 0x%lx, deadPeersHash 0x%lx)",
+       ncclSocketToString(&sock->sock.addr, rasLine), msg->connInit.ncclVersion,
+       ncclSocketToString(&msg->connInit.listeningAddr, line), msg->connInit.peersHash, msg->connInit.deadPeersHash);
+
+  if (msg->connInit.ncclVersion != NCCL_VERSION_CODE) {
+    // Close any such sockets immediately!  This is basically unrecoverable...
+    WARN("NCCL version mismatch with remote peer %s (local: %d, remote %d)",
+         ncclSocketToString(&sock->sock.addr, rasLine), NCCL_VERSION_CODE, msg->connInit.ncclVersion);
+    rasNetSendNack(sock);
+    rasSocketTerminate(sock, /*finalize*/true);
+    ret = ncclInvalidUsage;
+    goto exit;
+  }
+
+  if (rasPeerIsDead(&msg->connInit.listeningAddr)) {
+    // A peer long declared dead is suddenly alive again?!
+    INFO(NCCL_RAS, "RAS connection from peer %s that is considered dead!",
+         ncclSocketToString(&msg->connInit.listeningAddr, rasLine));
+    rasNetSendNack(sock);
+    rasSocketTerminate(sock, /*finalize*/true);
+    goto exit; // The socket is rejected, but ret stays ncclSuccess on this path.
+  }
+
+  // Check for any existing connection with that RAS thread (could happen due to a network issue, or possibly a race).
+  connIdx = rasConnFind(&msg->connInit.listeningAddr);
+  if (connIdx != -1) {
+    conn = rasConns+connIdx;
+
+    INFO(NCCL_RAS,
+         "RAS found a matching existing connection (sendQ %sempty, experiencingDelays %d, startRetryTime %.2fs)",
+         (ncclIntruQueueEmpty(&conn->sendQ) ? "" : "not "),
+         conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0));
+
+    if (conn->sockIdx != -1) {
+      struct rasSocket* connSock = rasSockets+conn->sockIdx;
+      INFO(NCCL_RAS, "RAS found an alternative existing socket (status %d, createTime %.2fs)",
+           connSock->status, (clockNano()-connSock->createTime)/1e9);
+      // In general we prefer to keep the newer connection, but "newer" can be a relative term: we may have
+      // a race where both sides attempt to establish a connection at roughly the same time, so the other side's
+      // incoming connection ends up looking newer than the locally-initiated one -- for *both* of them.
+      // If each side closed the "old" one, both would end up being closed.
+      // As we normally try to initiate connections from the side with a lower address (precisely to avoid such
+      // situations), we'll follow the same logic here: the "lower" side will reject the new connection (as it
+      // came from the "wrong" side), whereas the "higher" side will keep the new one (as it came from the correct
+      // side) and terminate the old one (that it presumably just opened).
+      if (ncclSocketsCompare(&rasNetListeningSocket.addr, &conn->addr) < 0) {
+        INFO(NCCL_RAS, "RAS terminating the new socket");
+        rasSocketTerminate(sock, /*finalize*/true);
+        goto exit;
+      } else {
+        INFO(NCCL_RAS, "RAS keeping the new socket and terminating the existing one");
+        rasSocketTerminate(connSock);
+      }
+    }
+  }
+  if (!conn) {
+    NCCLCHECK(getNewConnEntry(&conn));
+    memcpy(&conn->addr, &msg->connInit.listeningAddr, sizeof(conn->addr));
+    connIdx = conn - rasConns;
+  }
+  // The connInit is accepted: mark the socket ready, then bind the socket and the connection to each other.
+  sock->status = RAS_SOCK_READY;
+  // rasConnResume will reset any experiencingDelays, startRetryTime, etc.
+
+  conn->sockIdx = sock-rasSockets;
+  sock->connIdx = connIdx;
+  memcpy(&sock->sock.addr, &msg->connInit.listeningAddr, sizeof(sock->sock.addr));
+
+  // Make sure that the connection is part of the right links forming the RAS network.  At this point we only
+  // update the expected (non-external) connections; external ones will be added during keep-alive handling.
+  peerIdx = rasPeerFind(&conn->addr);
+  // Note: it's possible for peerIdx to be -1 at this point if, due to races, the connInit arrives before
+  // the peers update.
+  if (peerIdx != -1) {
+    (void)rasLinkUpdateConn(&rasNextLink, connIdx, peerIdx);
+    (void)rasLinkUpdateConn(&rasPrevLink, connIdx, peerIdx);
+  }
+
+  // Send a confirmation to the server that requested the connection (so that the resilience code can mark
+  // the connection as live).
+  newMsgLen = rasMsgLength(RAS_MSG_CONNINITACK);
+  NCCLCHECK(rasMsgAlloc(&newMsg, newMsgLen));
+  newMsg->type = RAS_MSG_CONNINITACK;
+  newMsg->connInitAck.nack = 0;
+  rasConnEnqueueMsg(conn, newMsg, newMsgLen, /*front*/true);
+  // Record the hashes the peer just reported as its latest known ones.
+  conn->lastRecvPeersHash = msg->connInit.peersHash;
+  conn->lastRecvDeadPeersHash = msg->connInit.deadPeersHash;
+
+  if (msg->connInit.peersHash != rasPeersHash || msg->connInit.deadPeersHash != rasDeadPeersHash) {
+    // Send my rasPeers and request the same in return.
+    INFO(NCCL_RAS, "RAS connInit hash mismatch (my peersHash 0x%lx, deadPeersHash 0x%lx); sending my (dead) peers",
+         rasPeersHash, rasDeadPeersHash);
+    NCCLCHECK(rasConnSendPeersUpdate(conn, rasPeers, nRasPeers));
+  }
+exit:
+  return ret;
+}
+
+// Handles connInitAck, the second handshake message on a RAS socket (the accepting side's reply to connInit).
+static ncclResult_t rasMsgHandleConnInitAck(const struct rasMsg* msg, struct rasSocket* sock) {
+  INFO(NCCL_RAS, "RAS handling connInitAck from %s (nack %d)",
+       ncclSocketToString(&sock->sock.addr, rasLine), msg->connInitAck.nack);
+
+  if (!msg->connInitAck.nack) {
+    // Positive acknowledgment: the socket is now usable.
+    sock->status = RAS_SOCK_READY;
+    // rasConnResume will reset any experiencingDelays, startRetryTime, etc.
+  } else {
+    // The remote peer refuses to talk to us; declaring it dead is the simplest way to stop further attempts.
+    // rasConnDisconnect will terminate the rasSocket, so work on a private copy of the address rather than
+    // on sock->sock.addr itself.
+    union ncclSocketAddress peerAddr;
+    memcpy(&peerAddr, &sock->sock.addr, sizeof(peerAddr));
+    rasConnDisconnect(&peerAddr);
+    (void)rasPeerDeclareDead(&peerAddr);
+  }
+
+  return ncclSuccess;
+}
+
+// Handles the deadPeer broadcast; *pDone is set to true when the news is already known and needs no re-broadcast.
+void rasMsgHandleBCDeadPeer(const struct rasCollRequest* req, bool* pDone) {
+  INFO(NCCL_RAS, "RAS handling deadPeer (addr %s)", ncclSocketToString(&req->deadPeer.addr, rasLine));
+
+  const bool alreadyDead = rasPeerIsDead(&req->deadPeer.addr);
+  if (alreadyDead) {
+    // No point in re-broadcasting what's already known.
+    INFO(NCCL_RAS, "RAS already knew it was dead");
+  } else {
+    rasConnDisconnect(&req->deadPeer.addr);
+    (void)rasPeerDeclareDead(&req->deadPeer.addr);
+  }
+  *pDone = alreadyDead;
+}
+
+// Best-effort, immediate send of a NACK connInitAck over sock (the message length first, then the message),
+// bypassing our usual message queuing and polling convention: the socket is about to be terminated, so the
+// regular path can't be used.  Acceptable because this is invoked only for newly opened connections and the
+// message is tiny.  A closed socket or a partial send is not an error; only ncclSocketProgress failures are.
+static ncclResult_t rasNetSendNack(struct rasSocket* sock) {
+  struct rasMsg msg;
+  int length = rasMsgLength(RAS_MSG_CONNINITACK);
+  int closed = 0;
+  int offset = 0;
+
+  INFO(NCCL_RAS, "RAS sending NACK to %s", ncclSocketToString(&sock->sock.addr, rasLine));
+
+  memset(&msg, '\0', sizeof(msg)); // The sent length covers struct padding; don't leak stack bytes on the wire.
+  msg.type = RAS_MSG_CONNINITACK;
+  msg.connInitAck.nack = 1;
+  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &sock->sock, &length, sizeof(length), &offset, &closed));
+  if (closed || offset < (int)sizeof(length))
+    return ncclSuccess;
+  offset = 0;
+  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &sock->sock, &msg, length, &offset, &closed));
+  // We are closing this socket anyway -- it doesn't matter to us if we succeeded or not.
+
+  return ncclSuccess;
+}
+
+
+/////////////////////////////////////////////////////////////////
+// Functions related to the main event loop of the RAS thread. //
+/////////////////////////////////////////////////////////////////
+
+// Main function of the RAS thread: sets up the initial poll entries, then runs the event loop indefinitely.
+static void* rasThreadMain(void*) {
+  ncclResult_t ret = ncclSuccess; // Unused.
+  int pfd;
+  int rasNetListeningSocketFd;
+
+  INFO(NCCL_RAS, "RAS thread started");
+
+  // Initialize the global pollfd with the file descriptors we already have (the pipe and the listening socket).
+  NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail);
+  rasPfds[pfd].fd = rasNotificationPipe[0];
+  rasPfds[pfd].events = POLLIN;
+
+  NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail);
+  NCCLCHECKGOTO(ncclSocketGetFd(&rasNetListeningSocket, &rasNetListeningSocketFd), ret, fail);
+  rasPfds[pfd].fd = rasNetListeningSocketFd;
+  rasPfds[pfd].events = POLLIN;
+
+  NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail);
+  rasPfds[pfd].fd = rasClientListeningSocket;
+  rasPfds[pfd].events = POLLIN;
+
+  // Main event loop of the RAS thread.
+  for (int64_t nextWakeup=0;;) {
+    int timeout, nEvents;
+    int64_t now = clockNano();
+    if (nextWakeup > 0) {
+      // The "1" below helps avoid round-downs and especially zeroes.
+      if (nextWakeup > now)
+        timeout = (nextWakeup - now) / (CLOCK_UNITS_PER_SEC / 1000) + 1;
+      else
+        timeout = 1;
+    } else {
+      timeout = 1000; // 1 second.
+    }
+
+    nEvents = poll(rasPfds, nRasPfds, timeout);
+    // Default next wakeup: one second from now.  The timeout handlers below are handed &nextWakeup as well.
+    nextWakeup = clockNano()+CLOCK_UNITS_PER_SEC;
+    if (nEvents == -1 && errno != EINTR)
+      INFO(NCCL_RAS, "RAS continuing in spite of an unexpected error from poll: %s", strerror(errno));
+
+    // Handle any poll-related events.
+    for (int pollIdx = 0; pollIdx < nRasPfds && nEvents > 0; pollIdx++) {
+      if (rasPfds[pollIdx].revents) {
+        nEvents--;
+        if (rasPfds[pollIdx].fd == rasNotificationPipe[0]) {
+          (void)rasLocalHandle();
+        } else if (rasPfds[pollIdx].fd == rasNetListeningSocketFd) {
+          (void)rasNetAcceptNewSocket();
+        } else if (rasPfds[pollIdx].fd == rasClientListeningSocket) {
+          (void)rasClientAcceptNewSocket();
+        } else {
+          // Check if it's one of the RAS sockets.
+          int sockIdx;
+          for (sockIdx = 0; sockIdx < nRasSockets; sockIdx++) {
+            struct rasSocket* sock = rasSockets+sockIdx;
+            if (sock->status != RAS_SOCK_CLOSED && rasPfds[pollIdx].fd == sock->sock.fd) {
+              rasSockEventLoop(sockIdx, pollIdx);
+              break;
+            }
+          } // for (sockIdx)
+
+          if (sockIdx == nRasSockets) {
+            // Try a client socket instead.
+            for (int clientIdx = 0; clientIdx < nRasClients; clientIdx++) {
+              struct rasClient* client = rasClients+clientIdx;
+              if (client->status != RAS_CLIENT_CLOSED && rasPfds[pollIdx].fd == client->sock) {
+                rasClientEventLoop(clientIdx, pollIdx);
+                break;
+              }
+            } // for (clientIdx)
+          } // if (sockIdx == nRasSockets)
+        } // dynamic fds
+      } // if (revents)
+    } // for (pollIdx)
+
+    now = clockNano();
+
+    rasSocksHandleTimeouts(now, &nextWakeup);
+
+    rasConnsHandleTimeouts(now, &nextWakeup);
+
+    rasNetHandleTimeouts(now, &nextWakeup);
+
+    rasCollsHandleTimeouts(now, &nextWakeup);
+  } // for (;;)
+  // Reached only via NCCLCHECKGOTO during the setup above; the event loop itself has no exit path.
+fail:
+  WARN("fatal error - RAS thread terminating");
+  std::lock_guard<std::mutex> lock(rasInitMutex);
+  (void)close(rasNotificationPipe[1]);
+  (void)close(rasNotificationPipe[0]);
+  (void)close(rasClientListeningSocket);
+  (void)ncclSocketClose(&rasNetListeningSocket);
+  rasInitialized = false;
+  return nullptr;
+}
+
+// Finds the first free slot (fd == -1) in the rasPfds array, growing the array by RAS_INCREMENT if it is full.
+ncclResult_t rasGetNewPollEntry(int* index) {
+  int slot = 0;
+  while (slot < nRasPfds && rasPfds[slot].fd != -1)
+    slot++;
+
+  if (slot == nRasPfds) {
+    NCCLCHECK(ncclRealloc(&rasPfds, nRasPfds, nRasPfds+RAS_INCREMENT));
+    nRasPfds += RAS_INCREMENT;
+    for (int fresh = slot; fresh < nRasPfds; fresh++)
+      rasPfds[fresh].fd = -1;
+  }
+
+  memset(rasPfds+slot, '\0', sizeof(*rasPfds));
+  rasPfds[slot].fd = -1;
+
+  *index = slot;
+  return ncclSuccess;
+}
diff --git a/src/ras/ras_internal.h b/src/ras/ras_internal.h
new file mode 100644
index 0000000000..68cac0b44b
--- /dev/null
+++ b/src/ras/ras_internal.h
@@ -0,0 +1,512 @@
+/*************************************************************************
+ * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_RAS_INTERNAL_H_
+#define NCCL_RAS_INTERNAL_H_
+
+#define NCCL_RAS_CLIENT_PORT 28028
+#define NCCL_RAS_CLIENT_PROTOCOL 2
+
+#define RAS_COLLECTIVE_LEG_TIMEOUT_SEC 5
+#define RAS_COLLECTIVE_EXTRA_TIMEOUT_SEC RAS_COLLECTIVE_LEG_TIMEOUT_SEC
+
+// End of the client section; everything below is meant for the NCCL threads only.
+#ifndef NCCL_RAS_CLIENT
+
+#include <mutex>
+
+#include "nccl.h"
+#include "ras.h"
+#include "socket.h"
+#include "utils.h"
+
+// Type of a RAS network or client message.
+typedef enum {
+  RAS_MSG_CONNINIT = 1, // First handshake message, sent by the connecting side.
+  RAS_MSG_CONNINITACK = 2, // Handshake response from the accepting side (possibly a NACK).
+  RAS_MSG_KEEPALIVE = 3,
+  RAS_MSG_PEERSUPDATE = 4,
+  RAS_MSG_COLLREQ = 5, // Collective request (payload: struct rasCollRequest).
+  RAS_MSG_COLLRESP = 6, // Collective response (payload: struct rasCollResponse).
+} rasMsgType;
+
+// Type of a RAS network collective message.
+typedef enum {
+  RAS_MSG_NONE = 0,
+  RAS_BC_DEADPEER = 1,
+  // Broadcast operations above this line; collective operations below (1000 is the demarcation line).
+  RAS_COLL_CONNS = 1001, // Collect data about all RAS connections.
+  RAS_COLL_COMMS = 1002, // Collect data about all communicators.
+} rasCollectiveType;
+
+// Payload of a collective request message (RAS_MSG_COLLREQ).
+struct rasCollRequest {
+  union ncclSocketAddress rootAddr;
+  uint64_t rootId;
+
+  int64_t timeout;
+  rasCollectiveType type; // Determines which member of the union below applies (see rasCollDataLength).
+  union {
+    struct {
+      union ncclSocketAddress addr; // Address of the peer being declared dead.
+    } deadPeer; // RAS_BC_DEADPEER.
+    struct {
+    } conns; // RAS_COLL_CONNS: no request-specific data.
+    struct {
+    } comms; // RAS_COLL_COMMS: no request-specific data.
+  };
+};
+
+// Payload of a collective response message (RAS_MSG_COLLRESP).
+struct rasCollResponse {
+  union ncclSocketAddress rootAddr;
+  uint64_t rootId;
+
+  int nLegTimeouts; // If >0, indicates incomplete data.
+  int nPeers;
+  int nData; // Size of data in bytes.
+  union ncclSocketAddress peers[0]; // Variable length.
+  // The peers array is followed by:
+  //alignas(int64_t) char data[0]; // Variable length, collective-dependent.
+};
+
+// Describes a peer NCCL process.  Every RAS thread keeps an (identical) array of them, one entry for each
+// NCCL process.
+struct rasPeerInfo {
+  union ncclSocketAddress addr;
+  pid_t pid;
+  uint64_t cudaDevs; // Bitmask.  Conveniently, NCCL_MAX_LOCAL_RANKS == 64.
+  uint64_t nvmlDevs; // Same, but not affected by CUDA_VISIBLE_DEVICES.
+};
+
+// Describes a RAS message.  Every message is preceded by a (32-bit) message length.  All data in the host
+// byte order.  Depending on the message type, the length of the message will vary.
+struct rasMsg {
+  rasMsgType type;
+  union {
+    struct {
+      int ncclVersion;
+      union ncclSocketAddress listeningAddr;
+      uint64_t peersHash;
+      uint64_t deadPeersHash;
+    } connInit; // Sent by the connecting side as the first message.
+    struct {
+      int nack; // If non-0, we should stop trying to reconnect.
+    } connInitAck; // Response from the accepting side to the above.
+    struct {
+      uint64_t peersHash;
+      uint64_t deadPeersHash;
+      int linkMask; // What links at the destination peer should the connection be part of
+                    // (bit 0: nextLink; bit 1: prevLink).
+      struct timespec realTime; // Wallclock time at the source, for statistical purposes (in principle there's
+                                // no guarantee that the nodes have synchronized clocks so we can't really rely
+                                // on it for anything important).
+      int nack; // If non-0, it means that this message is a response to an unexpected keepAlive message.
+    } keepAlive;
+    struct {
+      uint64_t peersHash;
+      uint64_t deadPeersHash;
+      int nPeers;
+      int nDeadPeers;
+      struct rasPeerInfo peers[0]; // Variable length.
+      // The peers array is followed by the following:
+      //union ncclSocketAddress deadPeers[0]; // Variable length.
+    } peersUpdate;
+    struct {
+      int protocol; // Protocol version, sent to the client.
+    } clientInit;
+    struct {
+      int nData;
+      char data[0]; // Variable length.
+    } clientDump;
+    struct rasCollRequest collReq; // Variable length.
+    struct rasCollResponse collResp; // Variable length.
+  };
+};
+
+// Returns the size of the collective portion of a collective request message (0 for RAS_MSG_NONE/unknown types).
+static inline size_t rasCollDataLength(rasCollectiveType type) {
+  struct rasCollRequest* data; // Never dereferenced: used only as an operand of sizeof.
+  switch (type) {
+    case RAS_BC_DEADPEER:
+      return offsetof(struct rasCollRequest, deadPeer) + sizeof(data->deadPeer);
+    case RAS_COLL_CONNS: // NOTE(review): conns/comms are empty structs; in C++ their sizeof is 1, not 0 -- confirm.
+      return offsetof(struct rasCollRequest, conns) + sizeof(data->conns);
+    case RAS_COLL_COMMS:
+      return offsetof(struct rasCollRequest, comms) + sizeof(data->comms);
+    case RAS_MSG_NONE:
+      return 0;
+  };
+  return 0; // Unrecognized collective type.
+}
+
+// Returns the size of a message of a particular type (excluding any variable-length trailing data).
+static inline size_t rasMsgLength(rasMsgType type, rasCollectiveType collType = RAS_MSG_NONE) {
+  struct rasMsg* msg; // Never dereferenced: used only as an operand of sizeof.
+  switch (type) {
+    case RAS_MSG_CONNINIT:
+      return offsetof(struct rasMsg, connInit) + sizeof(msg->connInit);
+    case RAS_MSG_CONNINITACK:
+      return offsetof(struct rasMsg, connInitAck) + sizeof(msg->connInitAck);
+    case RAS_MSG_KEEPALIVE:
+      return offsetof(struct rasMsg, keepAlive) + sizeof(msg->keepAlive);
+    case RAS_MSG_PEERSUPDATE:
+      return offsetof(struct rasMsg, peersUpdate) + sizeof(msg->peersUpdate);
+    case RAS_MSG_COLLREQ: // The only case that depends on collType.
+      return offsetof(struct rasMsg, collReq) + rasCollDataLength(collType);
+    case RAS_MSG_COLLRESP:
+      return offsetof(struct rasMsg, collResp) + sizeof(msg->collResp);
+  };
+  return 0; // Unrecognized message type.
+}
+
+// How much to enlarge any RAS array by if we run out of space.
+#define RAS_INCREMENT 4
+
+// Our clock has nanosecond resolution.
+#define CLOCK_UNITS_PER_SEC 1000000000L
+
+// Keep-alive messages are sent no sooner than a second after the last message was sent down a particular connection.
+#define RAS_KEEPALIVE_INTERVAL (1*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// If no message arrives in 5 seconds via a particular connection that uses keep-alive messages, generate a warning
+// and try alternative connections.
+#define RAS_KEEPALIVE_TIMEOUT_WARN (5*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// Abort a socket that uses keep-alive messages if no message arrives in 20 seconds.
+// We will try to re-establish communication via that connection (until RAS_PEER_DEAD_TIMEOUT).
+#define RAS_KEEPALIVE_TIMEOUT_ERROR RAS_STUCK_TIMEOUT
+
+// Retry connecting on failing sockets (ECONNREFUSED, etc.) once a second.
+#define RAS_CONNECT_RETRY (1*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// If we can't connect in 5 seconds, we generate a warning and try alternative connections.
+#define RAS_CONNECT_WARN RAS_KEEPALIVE_TIMEOUT_WARN
+
+// Abort a busy socket (one we are trying to send on, or one that was being established) if there's been
+// no sign of progress in 20 seconds.  We will try to re-establish communication (up to RAS_PEER_DEAD_TIMEOUT).
+#define RAS_STUCK_TIMEOUT (20*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// Terminate ad-hoc connections that have not been used in 60 seconds.
+#define RAS_IDLE_TIMEOUT (60*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// If the socket is closed by peer within 5 seconds from the idle timeout, do not attempt to re-establish.
+#define RAS_IDLE_GRACE_PERIOD (5*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// Declare a peer as dead and don't retry communicating with it if we couldn't reach it for 60 seconds.
+#define RAS_PEER_DEAD_TIMEOUT (60*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// Abort a leg of a collective operation if the response takes more than 5 seconds to arrive *and* one of the
+// connections experiences delays.
+#define RAS_COLLECTIVE_LEG_TIMEOUT (RAS_COLLECTIVE_LEG_TIMEOUT_SEC*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// Abort a whole collective operation after at most RAS_COLLECTIVE_LEG_TIMEOUT+RAS_COLLECTIVE_EXTRA_TIMEOUT (10s).
+#define RAS_COLLECTIVE_EXTRA_TIMEOUT (RAS_COLLECTIVE_EXTRA_TIMEOUT_SEC*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// Structure used for tracking the progress of sending a RAS message.
+struct rasMsgMeta {
+  struct rasMsgMeta* next;
+  int64_t enqueueTime;
+  int offset; // Progress sending the message (including the message size itself (an int, which is sent first)).
+  int length; // Length of the message (*excluding* the message size).
+  struct rasMsg msg; // Variable length.
+};
+
+// Describes an ongoing collective RAS operation (apart from broadcasts, which don't need a response).
+// For every collective operation, each participating RAS thread will create its own.
+struct rasCollective {
+  union ncclSocketAddress rootAddr;
+  uint64_t rootId;
+
+  rasCollectiveType type;
+
+  int64_t timeout;
+  bool timeoutWarned;
+
+  int64_t startTime; // For timeout calculations.
+  int fromConnIdx; // The connection we received the request from.
+
+  int* fwdConns; // Indices of the connections we forwarded the request to; replaced by -1 as the responses arrive.
+  int nFwdSent; // Count of the above (local process only).
+  int nFwdRecv; // Count of the responses received or timeouts (local process only).
+
+  int nLegTimeouts; // Collective (from this process and the responses we received).
+
+  union ncclSocketAddress* peers; // Collective (from this process and the responses we received).
+  int nPeers;
+
+  char* data; // Collective (from this process and the responses we received).
+  int nData;
+};
+
+// Collective data in RAS_COLL_CONNS responses.
+struct rasCollConns {
+  int64_t travelTimeMin;
+  int64_t travelTimeMax;
+  int64_t travelTimeSum;
+  int64_t travelTimeCount;
+  int nConns;
+  int nNegativeMins;
+  struct negativeMin {
+    union ncclSocketAddress source;
+    union ncclSocketAddress dest;
+    int64_t travelTimeMin;
+  } negativeMins[0]; // Variable length.
+};
+
+// Collective data in RAS_COLL_COMMS responses.
+struct rasCollComms {
+  int nComms;
+  struct comm {
+    uint64_t commHash;
+    int commNRanks;
+    int nRanks; // number of elements in the array below, *not* in the communicator.
+    struct rank {
+      int commRank;
+      int peerIdx; // Index within rasCollective->peers, *not* rasPeers.
+      uint64_t collOpCount;
+      struct {
+        ncclResult_t initState:4;
+        ncclResult_t asyncError:4;
+        bool finalizeCalled:1;
+        bool destroyFlag:1;
+        bool abortFlag:1;
+      } status;
+      char cudaDev;
+      char nvmlDev;
+    } ranks[0]; // Variable length. Sorted by commRank.  Optimized for 1 GPU/process.
+  } comms[0]; // Variable length. Sorted by commHash.
+};
+
+// Holds data needed to keep track of a connection belonging to a RAS network link (either the primary one
+// or one of the fallbacks).
+struct rasLinkConn {
+  int peerIdx; // Index in the rasPeers array of the peer this entry describes.  Could be -1 (an entry initiated
+               // by an as of yet unknown peer -- should be a temporary situation that resolves via peer updates).
+  int connIdx; // Index in the rasConns array of the connection to the above peer.  Could be -1 (a placeholder
+               // for a connection to be started by the remote peer).
+  bool external; // true if the entry exists only due to an external request (requested by a remote peer, most
+                 // likely as part of fault recovery).  Such connections are kept as fallbacks even if there's a
+                 // valid primary connection, in order to ensure that keep-alive messages are sent.
+};
+
+// Describes a link that forms the backbone of the RAS network.  Links focus on direction (previous/next in
+// case of 1-D topology) rather than a particular destination.  They are implemented using rasConnections, but
+// they are persistent through the life of the RAS threads, whereas rasConnections can be terminated if the RAS
+// network is reconfigured or a peer dies.
+struct rasLink {
+  int direction; // 1 for nextLink, -1 for prevLink.
+
+  // Index 0 is the primary connection; any additional ones are fallbacks (that get created if we are having
+  // problems with the primary connection).  The elements are de-facto ordered (highest-preference ones have
+  // the lowest indices).
+  struct rasLinkConn* conns;
+  int nConns;
+  int connsSize; // Array size; could be larger than nConns.
+
+  // Keep track of a timeout in case we did not create a connection during the last peers update (because we expect
+  // the peer on the other side to do so) but that peer failed to initiate.
+  int64_t lastUpdatePeersTime;
+};
+
+// Describes a connection to another peer on the RAS network.  It is meant to be more persistent than a volatile
+// socket (described by the rasSocket structure), which can be affected by transient network issues.
+struct rasConnection {
+  bool inUse;
+
+  union ncclSocketAddress addr;
+
+  // Index of the current rasSocket in the rasSockets array.  Note that multiple rasSocket entries may point back
+  // to a single entry here, for sockets that are in the process of being terminated and re-established.
+  // We use indices, not pointers, because the arrays holding these structures can be re-alloced at run time.
+  // -1 if there is no such socket.
+  int sockIdx;
+
+  // We keep the rasPeersHash of remote connections to minimize the number of needless exchanges.
+  // There is a subtle difference in the meaning of lastSentPeersHash and lastRecvPeersHash.
+  // lastSentPeersHash stores *our* rasPeersHash from the time we last sent a peers *update* through this connection
+  // (which is different than sending just the hash, like we do in KEEPALIVE, etc.).
+  // lastRecvPeersHash stores the latest known rasPeersHash of the peer (received via KEEPALIVE, etc.).
+  uint64_t lastSentPeersHash;
+  uint64_t lastRecvPeersHash;
+
+  // Same but for rasDeadPeersHash.
+  uint64_t lastSentDeadPeersHash;
+  uint64_t lastRecvDeadPeersHash;
+
+  // Queue of messages to send.
+  struct ncclIntruQueue<struct rasMsgMeta, &rasMsgMeta::next> sendQ;
+
+  // Used for keeping track of timeouts that may extend beyond the lifetime of a socket.
+  // The timeout starts when the connection is being created (and is turned off when the initialization is completed
+  // successfully) or when we detect a problem, such as a socket timeout (in the latter case, we may need to
+  // retroactively calculate the start time).
+  // A value of 0 indicates that they are not currently in use.
+  int64_t startRetryTime;
+  int64_t lastRetryTime;
+
+  bool experiencingDelays; // A flag indicating that the connection is currently subject to RAS_KEEPALIVE_TIMEOUT_WARN
+                           // or RAS_CONNECT_WARN timeout.  If set, the warnings have been issued and the fallbacks
+                           // have been initiated if needed.
+  bool linkFlag; // Used within rasNet* calls to mark whether this connection was already handled when iterating over
+                 // multiple links (since a connection can belong to more than one link).
+  // The below four fields are for statistical purposes only.
+  int64_t travelTimeMin;
+  int64_t travelTimeMax;
+  int64_t travelTimeSum;
+  int64_t travelTimeCount;
+};
+
+// Status of a RAS socket.
+typedef enum {
+  RAS_SOCK_CLOSED = 0,
+  RAS_SOCK_CONNECTING = 1,
+  RAS_SOCK_HANDSHAKE = 2,
+  RAS_SOCK_READY = 3,
+  RAS_SOCK_TERMINATING = 4
+} rasSocketStatus;
+
+// Describes a socket implementing communication between two peers.
+struct rasSocket {
+  struct ncclSocket sock;
+
+  rasSocketStatus status;
+
+  int pfd; // Index in the rasPfds array.
+
+  // Index of the corresponding entry in the rasConns array.
+  // We use indices, not pointers, because the arrays holding these structures can be re-alloced at run time.
+  // -1 if there is no connection (normal condition on the accept side before the connInit message).
+  int connIdx;
+
+  int64_t createTime;
+  int64_t lastSendTime;
+  int64_t lastRecvTime;
+
+  // Data on the message currently being received.
+  int recvOffset;
+  int recvLength;
+  struct rasMsg* recvMsg;
+};
+
+// Status of a RAS client.
+typedef enum {
+  RAS_CLIENT_CLOSED = 0,
+  RAS_CLIENT_CONNECTED = 1,
+  RAS_CLIENT_INIT = 2,
+  RAS_CLIENT_CONNS = 3,
+  RAS_CLIENT_COMMS = 4,
+  RAS_CLIENT_FINISHED = 99
+} rasClientStatus;
+
+// Describes a RAS client.
+struct rasClient {
+  int sock;
+
+  rasClientStatus status;
+
+  int pfd; // Index in the rasPfds array.
+
+  char recvBuffer[1024];
+  int recvOffset;
+
+  // Queue of messages to send.
+  struct ncclIntruQueue<struct rasMsgMeta, &rasMsgMeta::next> sendQ;
+
+  int verbose;
+  int64_t timeout;
+
+  // State stored during asynchronous operations such as collectives.
+  int collIdx; // Index to the ongoing rasCollective.
+};
+
+
+// ras.cc
+extern struct pollfd* rasPfds;
+extern struct ncclSocket rasNetListeningSocket;
+extern std::mutex ncclCommsMutex;
+extern struct ncclComm** ncclComms;
+extern int nNcclComms;
+extern  bool ncclCommsSorted;
+extern char rasLine[SOCKET_NAME_MAXLEN+1];
+
+int64_t ncclParamRasTimeoutFactor();
+ncclResult_t rasMsgAlloc(struct rasMsg** msg, size_t msgLen);
+void rasMsgFree(struct rasMsg* msg);
+void rasConnEnqueueMsg(struct rasConnection* conn, struct rasMsg* msg, size_t msgLen, bool front = false);
+ncclResult_t rasConnSendMsg(struct rasConnection* conn, int* closed, bool* allSent);
+ncclResult_t rasMsgRecv(struct rasSocket* sock, struct rasMsg** msg, int* closed);
+ncclResult_t rasMsgHandle(struct rasMsg* msg, struct rasSocket* sock);
+void rasMsgHandleBCDeadPeer(const struct rasCollRequest* req, bool* pDone);
+ncclResult_t rasGetNewPollEntry(int* index);
+
+
+// rasnet.cc
+extern struct rasLink rasNextLink, rasPrevLink;
+extern struct rasConnection* rasConns;
+extern int nRasConns;
+extern struct rasSocket *rasSockets;
+extern int nRasSockets;
+
+ncclResult_t getNewConnEntry(struct rasConnection** pConn);
+ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, int* pConnIdx);
+int rasConnFind(const union ncclSocketAddress* addr);
+void rasConnsHandleTimeouts(int64_t now, int64_t* nextWakeup);
+void rasConnDisconnect(const union ncclSocketAddress* addr);
+ncclResult_t rasNetAcceptNewSocket();
+void rasSocksHandleTimeouts(int64_t now, int64_t* nextWakeup);
+void rasSocketTerminate(struct rasSocket* sock, bool finalize = false, uint64_t startRetryOffset = 0,
+                        bool retry = true);
+void rasSockEventLoop(int sockIdx, int pollIdx);
+void rasNetHandleTimeouts(int64_t now, int64_t* nextWakeup);
+ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* sock);
+ncclResult_t rasLinkUpdateConn(struct rasLink* link, int connIdx, int peerIdx, bool external = false,
+                               bool insert = false, bool pretend = false, int* pLinkIdx = nullptr);
+
+// peers.cc
+extern struct rasPeerInfo* rasPeers;
+extern int nRasPeers;
+extern uint64_t rasPeersHash;
+extern union ncclSocketAddress* rasDeadPeers;
+extern int nRasDeadPeers;
+extern uint64_t rasDeadPeersHash;
+
+ncclResult_t rasLocalHandleAddRanks(struct rasRankInit* ranks, int nranks);
+int rasPeerFind(const union ncclSocketAddress* addr);
+ncclResult_t rasConnSendPeersUpdate(struct rasConnection* conn, const struct rasPeerInfo* peers, int nPeers);
+ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock);
+int rasLinkCalculatePeer(const struct rasLink* link, int peerIdx, bool isFallback = false);
+ncclResult_t rasPeerDeclareDead(const union ncclSocketAddress* addr);
+bool rasPeerIsDead(const union ncclSocketAddress* addr);
+int ncclSocketsCompare(const void* p1, const void* p2);
+bool ncclSocketsSameNode(const union ncclSocketAddress* a1, const union ncclSocketAddress* a2);
+
+
+// collectives.cc
+extern struct rasCollective* rasCollectives;
+
+void rasCollReqInit(struct rasCollRequest* req);
+ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen, bool* pAllDone = nullptr,
+                               int* pCollIdx = nullptr, int fromConnIdx = -1);
+ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock);
+ncclResult_t rasMsgHandleCollResp(struct rasMsg* msg, struct rasSocket* sock);
+void rasCollsPurgeConn(int connIdx);
+void rasCollFree(struct rasCollective* coll);
+void rasCollsHandleTimeouts(int64_t now, int64_t* nextWakeup);
+
+// client_support.cc
+extern int rasClientListeningSocket;
+extern struct rasClient* rasClients;
+extern int nRasClients;
+ncclResult_t rasClientInitSocket();
+ncclResult_t rasClientAcceptNewSocket();
+ncclResult_t rasClientResume(struct rasCollective* coll);
+void rasClientEventLoop(int clientIdx, int pollIdx);
+const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, size_t size);
+
+#endif // !NCCL_RAS_CLIENT
+
+#endif // !NCCL_RAS_INTERNAL_H_
diff --git a/src/ras/rasnet.cc b/src/ras/rasnet.cc
new file mode 100644
index 0000000000..441ad192c0
--- /dev/null
+++ b/src/ras/rasnet.cc
@@ -0,0 +1,1189 @@
+/*************************************************************************
+ * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NDEBUG // Comment out during development only!
+#include <cassert>
+
+#include "ras_internal.h"
+
+// Links forming the backbone of the RAS network (currently a ring).
+struct rasLink rasNextLink = {1}, rasPrevLink = {-1}; // NOTE(review): +1/-1 is presumably the ring direction.
+
+// Connections on the RAS network.  Entries are handed out by getNewConnEntry; those with inUse unset are free.
+struct rasConnection* rasConns;
+int nRasConns; // Allocated size of rasConns, counting the free entries.
+
+// Sockets implementing the RAS network.  Entries in RAS_SOCK_CLOSED state are free (see getNewSockEntry).
+struct rasSocket *rasSockets;
+int nRasSockets; // Allocated size of rasSockets, counting the free entries.
+
+// Magic file descriptor number for when we want poll() to ignore an entry.  Anything negative would do, but
+// I didn't want to use -1 because it has a special meaning for us (e.g. rasSocketTerminate stores it on close).
+#define POLL_FD_IGNORE -2
+
+static void rasConnOpen(struct rasConnection* conn);
+static ncclResult_t rasConnPrepare(struct rasConnection* conn);
+static void rasConnTerminate(struct rasConnection* conn);
+
+static ncclResult_t getNewSockEntry(struct rasSocket** pSock);
+
+static ncclResult_t rasLinkHandleNetTimeouts(struct rasLink* link, int64_t now, int64_t* nextWakeup);
+static void rasConnHandleNetTimeouts(int connIdx, int64_t now, int64_t* nextWakeup);
+static void rasConnSendKeepAlive(struct rasConnection* conn, bool nack = false);
+
+static ncclResult_t rasLinkAddFallback(struct rasLink* link, int connIdx);
+static void rasConnResume(struct rasConnection* conn);
+static void rasLinkSanitizeFallbacks(struct rasLink* link);
+static void rasLinkDropConn(struct rasLink* link, int connIdx, int linkIdx = -1); // linkIdx is optional.
+static int rasLinkFindConn(const struct rasLink* link, int connIdx);
+
+
+///////////////////////////////////////////////
+// Functions related to the RAS connections. //
+///////////////////////////////////////////////
+
+// Hands out a free slot of the rasConns array (growing the array by RAS_INCREMENT entries when every slot is
+// taken) and returns it, reset to its initial state and marked as in use, via *pConn.
+ncclResult_t getNewConnEntry(struct rasConnection** pConn) {
+  // Look for the first slot that is not in use.
+  int slot = 0;
+  while (slot < nRasConns && rasConns[slot].inUse)
+    slot++;
+  if (slot == nRasConns) {
+    // No free slot -- enlarge the array; the first of the newly added entries will be used.
+    NCCLCHECK(ncclRealloc(&rasConns, nRasConns, nRasConns+RAS_INCREMENT));
+    nRasConns += RAS_INCREMENT;
+  }
+
+  struct rasConnection* entry = rasConns+slot;
+  memset(entry, '\0', sizeof(*entry));
+  ncclIntruQueueConstruct(&entry->sendQ);
+  entry->travelTimeMax = INT64_MIN;
+  entry->travelTimeMin = INT64_MAX;
+  entry->sockIdx = -1;
+  entry->inUse = true;
+  *pConn = entry;
+  return ncclSuccess;
+}
+
+// Makes sure that a RAS network connection to the peer at addr exists.  Three cases are possible:
+// - an entry for the peer exists and has a socket associated with it: nothing is changed;
+// - an entry exists but has no socket: we try to open one;
+// - no entry exists: one is allocated, its retry timer is started, and we try to open a socket for it.
+// The index of the peer's rasConns entry is stored in *pConnIdx if pConnIdx is non-null.  A failure to open
+// the socket is not reported here (rasConnOpen returns nothing); only a failure to allocate the entry is,
+// in which case *pConnIdx is left untouched.
+ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, int* pConnIdx) {
+  // Reuse the entry for this peer if we already have one.
+  int idx = rasConnFind(addr);
+  struct rasConnection* entry = nullptr;
+  if (idx != -1)
+    entry = rasConns+idx;
+
+  if (entry == nullptr) {
+    // No entry yet -- allocate and address one.
+    NCCLCHECK(getNewConnEntry(&entry));
+    memcpy(&entry->addr, addr, sizeof(entry->addr));
+    // This is a brand new connection, so the connection-level timeout starts ticking now.
+    entry->startRetryTime = clockNano();
+    idx = entry - rasConns;
+  }
+
+  // Whatever the case, the caller gets the index of the entry.
+  if (pConnIdx)
+    *pConnIdx = idx;
+
+  // An entry that already has a socket associated with it needs nothing more;
+  // otherwise try to open one.
+  if (entry->sockIdx == -1)
+    rasConnOpen(entry);
+
+  return ncclSuccess;
+}
+
+// Starts an asynchronous socket connection to conn's peer; on success ties the new socket to conn and a poll entry.
+static void rasConnOpen(struct rasConnection* conn) {
+  ncclResult_t ret; // Required by the NCCLCHECKGOTO calls; its value is never read.
+  struct rasSocket* sock;
+  bool closeSocketOnFail = false;
+  int ready; // Output of ncclSocketReady; deliberately not examined (see the comment on sock->status below).
+
+  NCCLCHECKGOTO(getNewSockEntry(&sock), ret, fail);
+  NCCLCHECKGOTO(ncclSocketInit(&sock->sock, &conn->addr, NCCL_SOCKET_MAGIC, ncclSocketTypeRasNetwork, nullptr,
+                               /*asyncFlag*/1, /*customRetry*/1), ret, fail);
+  closeSocketOnFail = true; // From now on a failure needs to close the socket.
+  NCCLCHECKGOTO(ncclSocketConnect(&sock->sock), ret, fail);
+  NCCLCHECKGOTO(ncclSocketReady(&sock->sock, &ready), ret, fail);
+
+  NCCLCHECKGOTO(rasGetNewPollEntry(&sock->pfd), ret, fail);
+
+  // We delay the initialization of sockIdx, connIdx and status until this point so that in case of failures
+  // we don't need to clean them up.
+  conn->sockIdx = sock-rasSockets;
+  sock->connIdx = conn-rasConns;
+  rasPfds[sock->pfd].fd = sock->sock.fd;
+
+  // We ignore the possibly ready status of the socket at this point and consider it CONNECTING because
+  // there are other things we want to do before sending the CONNINIT, such as adding the connection to
+  // the network links, etc.
+  sock->status = RAS_SOCK_CONNECTING;
+  rasPfds[sock->pfd].events = (POLLIN | POLLOUT);
+  if (sock->sock.state == ncclSocketStateConnecting)
+    rasPfds[sock->pfd].fd = POLL_FD_IGNORE; // Don't poll on this socket before connect().
+
+exit:
+  conn->lastRetryTime = clockNano(); // Recorded on success and on failure alike.
+  // We deliberately ignore ret as this function will be retried later if needed.
+  return;
+fail:
+  if (closeSocketOnFail)
+    (void)ncclSocketClose(&sock->sock);
+  goto exit;
+}
+
+// Queues the RAS_MSG_CONNINIT message, the first one sent to a peer over a freshly connected socket.
+static ncclResult_t rasConnPrepare(struct rasConnection* conn) {
+  const int initLen = rasMsgLength(RAS_MSG_CONNINIT);
+  struct rasMsg* initMsg = nullptr;
+  NCCLCHECK(rasMsgAlloc(&initMsg, initLen));
+
+  // The message carries our listening address and the NCCL version (to ensure that users aren't mixing
+  // things up), plus the hashes of our current peers and dead peers.
+  initMsg->type = RAS_MSG_CONNINIT;
+  initMsg->connInit.deadPeersHash = rasDeadPeersHash;
+  initMsg->connInit.peersHash = rasPeersHash;
+  // lastSent[Dead]PeersHash stay as they are since the peers themselves are not being sent here.
+  memcpy(&initMsg->connInit.listeningAddr, &rasNetListeningSocket.addr, sizeof(initMsg->connInit.listeningAddr));
+  initMsg->connInit.ncclVersion = NCCL_VERSION_CODE;
+
+  // The message goes to the front of the send queue.
+  rasConnEnqueueMsg(conn, initMsg, initLen, /*front*/true);
+  // The initialization will be completed in rasMsgHandleConnInitAck, once the other side responds.
+  return ncclSuccess;
+}
+
+// Returns the index of the in-use rasConns entry whose address is byte-for-byte equal to *addr, or -1 if none.
+int rasConnFind(const union ncclSocketAddress* addr) {
+  // rasConns isn't kept sorted (with so many indices referring to it that would be a massive hassle), so we
+  // have to make do with a linear scan instead of a binary search.
+  int found = -1;
+  for (int idx = 0; found == -1 && idx < nRasConns; idx++) {
+    const struct rasConnection* entry = rasConns+idx;
+    if (entry->inUse && memcmp(&entry->addr, addr, sizeof(entry->addr)) == 0)
+      found = idx;
+  }
+  return found;
+}
+
+// Handles any connection-related timeouts.  Many timeouts affect the underlying sockets and thus have been handled
+// in the socket timeout handler earlier by terminating the problematic sockets.  If a socket connection doesn't
+// exist or needs to be re-established (due to having just been terminated), we handle that here.
+// This is also where we declare peers as dead, etc.
+// Invoked from the main RAS event loop; *nextWakeup is lowered to the earliest deadline that is still pending.
+void rasConnsHandleTimeouts(int64_t now, int64_t* nextWakeup) {
+  for (int connIdx = 0; connIdx < nRasConns; connIdx++) {
+    struct rasConnection* conn = rasConns+connIdx;
+
+    if (!conn->inUse)
+      continue;
+
+    if (conn->sockIdx != -1) {
+      struct rasSocket* sock = rasSockets+conn->sockIdx;
+      bool sockTerminated = false; // Set if the connect-retry check below finalizes the socket.
+
+      // Retry the socket connections that have been refused.
+      if (sock->status == RAS_SOCK_CONNECTING && sock->sock.state == ncclSocketStateConnecting) {
+        if (now - sock->lastSendTime > RAS_CONNECT_RETRY) {
+          int ready;
+          if (ncclSocketReady(&sock->sock, &ready) != ncclSuccess) {
+            INFO(NCCL_RAS, "Unexpected error from ncclSocketReady; terminating the socket connection with %s",
+                 ncclSocketToString(&sock->sock.addr, rasLine));
+            rasSocketTerminate(sock, /*finalize*/true);
+            // We will retry below in the same loop.
+            sockTerminated = true;
+          } else {
+            // We update lastSendTime even if !ready because we need it up-to-date for timeout calculations.
+            sock->lastSendTime = clockNano();
+            if (!ready && sock->sock.state == ncclSocketStateConnecting)
+              *nextWakeup = std::min(*nextWakeup, sock->lastSendTime+RAS_CONNECT_RETRY);
+            else
+              rasPfds[sock->pfd].fd = sock->sock.fd; // Enable the handling via the main loop.
+          } // if (ncclSocketReady)
+        } else {
+          *nextWakeup = std::min(*nextWakeup, sock->lastSendTime+RAS_CONNECT_RETRY);
+        }
+      } // if (sock->status == RAS_SOCK_CONNECTING && sock->sock.state == ncclSocketStateConnecting)
+
+      // For connections that have data to send but that we've been unable to send a message on for a while,
+      // consider their sockets lost and terminate them.
+      if (!sockTerminated && !ncclIntruQueueEmpty(&conn->sendQ) && sock->status == RAS_SOCK_READY) {
+        if (now - std::max(sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime) > RAS_STUCK_TIMEOUT) {
+          INFO(NCCL_RAS, "RAS send stuck timeout error (%lds) on socket connection with %s",
+               (now - std::max(sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime)) /
+               CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine));
+          rasSocketTerminate(sock, /*finalize*/false, RAS_STUCK_TIMEOUT);
+          // We will retry below in the same loop.
+        } else {
+          *nextWakeup = std::min(*nextWakeup, std::max(sock->lastSendTime,
+                                                       ncclIntruQueueHead(&conn->sendQ)->enqueueTime)+RAS_STUCK_TIMEOUT);
+        }
+      } // if (!sockTerminated && !ncclIntruQueueEmpty(&conn->sendQ) && sock->status == RAS_SOCK_READY)
+    } // if (conn->sockIdx != -1)
+
+    // For connections that are being (re-)established, irrespective of whether there's a valid socket associated
+    // with them (conn->sockIdx != -1), we need to check if any connection-level timeout has expired.
+    if (conn->startRetryTime) {
+      // If we've been trying to open a connection for too long (60s), give up and mark the peer as dead
+      // so that we don't try again.
+      if (now - conn->startRetryTime > RAS_PEER_DEAD_TIMEOUT) {
+        struct rasCollRequest bCast;
+        INFO(NCCL_RAS, "RAS connect retry timeout (%lds) on socket connection with %s",
+             (now-conn->startRetryTime)/CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine));
+
+        // Broadcast the info about a dead peer to everybody.  This will handle it locally as well, including
+        // declaring the peer dead and terminating the connection.
+        rasCollReqInit(&bCast);
+        bCast.type = RAS_BC_DEADPEER;
+        memcpy(&bCast.deadPeer.addr, &conn->addr, sizeof(bCast.deadPeer.addr));
+        (void)rasNetSendCollReq(&bCast, rasCollDataLength(RAS_BC_DEADPEER));
+
+        continue; // Nothing more to do for this connection (see the comment above the broadcast).
+      } else {
+        *nextWakeup = std::min(*nextWakeup, conn->startRetryTime+RAS_PEER_DEAD_TIMEOUT);
+      }
+
+      // RAS_STUCK_TIMEOUT has already been handled in the socket function (we'll pick it up later via
+      // the conn->sockIdx == -1 test).
+
+      // We print warnings after the same time as with keep-alive (5s), and we pessimistically immediately try
+      // to establish fallback connections.
+      if (now - conn->startRetryTime > RAS_CONNECT_WARN) {
+        if (!conn->experiencingDelays) {
+          INFO(NCCL_RAS, "RAS connect timeout warning (%lds) on socket connection with %s",
+               (now-conn->startRetryTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine));
+
+          // See if the connection was meant to be a part of a RAS link and if so, try to initiate fallback
+          // connection(s).  At this point, it's mostly just a precaution; we will continue trying to establish
+          // the primary connection until RAS_PEER_DEAD_TIMEOUT expires.
+          conn->experiencingDelays = true;
+          (void)rasLinkAddFallback(&rasNextLink, connIdx);
+          (void)rasLinkAddFallback(&rasPrevLink, connIdx);
+          // rasConns may have been reallocated by the above calls.
+          conn = rasConns+connIdx;
+
+          // Stop collectives from waiting for a response over it.
+          rasCollsPurgeConn(connIdx);
+        } // if (!conn->experiencingDelays)
+      } else {
+        *nextWakeup = std::min(*nextWakeup, conn->startRetryTime+RAS_CONNECT_WARN);
+      }
+
+      // If a socket was terminated (or never opened, due to some error), try to open it now.
+      // We retry once a second.
+      if (conn->sockIdx == -1) {
+        if (now - conn->lastRetryTime > RAS_CONNECT_RETRY) {
+          INFO(NCCL_RAS, "RAS trying to reconnect with %s (experiencingDelays %d, startRetryTime %.2fs)",
+               ncclSocketToString(&conn->addr, rasLine), conn->experiencingDelays,
+               (conn->startRetryTime ? (now-conn->startRetryTime)/1e9 : 0.0));
+          rasConnOpen(conn);
+        }
+        if (conn->sockIdx == -1)
+          *nextWakeup = std::min(*nextWakeup, conn->lastRetryTime+RAS_CONNECT_RETRY);
+      }
+    } // if (conn->startRetryTime)
+  } // for (connIdx)
+}
+
+// Terminates our connection to the given peer, if we have one.  The connection is taken out of the RAS links
+// after fallbacks for it have been initiated if necessary.  Typically invoked just before declaring a peer dead.
+void rasConnDisconnect(const union ncclSocketAddress* addr) {
+  const int idx = rasConnFind(addr);
+  if (idx == -1)
+    return; // No connection to this peer -- nothing to do.
+  // Line up the fallbacks on both links first, then drop the connection from both.
+  (void)rasLinkAddFallback(&rasNextLink, idx);
+  (void)rasLinkAddFallback(&rasPrevLink, idx);
+  rasLinkDropConn(&rasNextLink, idx);
+  rasLinkDropConn(&rasPrevLink, idx);
+  rasConnTerminate(rasConns+idx);
+}
+
+// Tears down a connection: finalizes every socket still referring to it, purges it from the ongoing
+// collectives, discards its unsent messages, and finally releases its rasConns entry for reuse.
+static void rasConnTerminate(struct rasConnection* conn) {
+  const int idx = conn - rasConns;
+
+  // No live socket may keep pointing at this entry.
+  for (int sockIdx = 0; sockIdx < nRasSockets; sockIdx++) {
+    struct rasSocket* candidate = rasSockets+sockIdx;
+    if (candidate->status == RAS_SOCK_CLOSED || candidate->connIdx != idx)
+      continue;
+    rasSocketTerminate(candidate, /*finalize*/true);
+  }
+  // Nor may any ongoing collective keep waiting on it.
+  rasCollsPurgeConn(idx);
+
+  // Whatever is still queued for sending will never go out.
+  struct rasMsgMeta* pending;
+  while ((pending = ncclIntruQueueTryDequeue(&conn->sendQ)) != nullptr)
+    free(pending);
+  INFO(NCCL_RAS, "RAS terminating a connection with %s", ncclSocketToString(&conn->addr, rasLine));
+  conn->sockIdx = -1; // Finalizing the sockets above should have reset it already; this is just to be extra sure.
+  conn->inUse = false;
+}
+
+
+///////////////////////////////////////////
+// Functions related to the RAS sockets. //
+///////////////////////////////////////////
+
+// Accepts a new RAS network socket connection.  The socket is not usable until after the handshake, as a
+// corresponding rasConnection can't be established without knowing the peer's address.
+// Returns ncclSuccess or the error of the first step that failed (the socket, if initialized, is then closed).
+ncclResult_t rasNetAcceptNewSocket() {
+  ncclResult_t ret = ncclSuccess;
+  struct rasSocket* sock;
+  int ready; // Output of ncclSocketReady; not examined here.
+  bool socketInitialized = false;
+  NCCLCHECKGOTO(getNewSockEntry(&sock), ret, fail);
+
+  NCCLCHECKGOTO(ncclSocketInit(&sock->sock, nullptr, NCCL_SOCKET_MAGIC, ncclSocketTypeRasNetwork, nullptr,
+                               /*asyncFlag*/1), ret, fail);
+  socketInitialized = true;
+  NCCLCHECKGOTO(ncclSocketAccept(&sock->sock, &rasNetListeningSocket), ret, fail);
+  NCCLCHECKGOTO(ncclSocketReady(&sock->sock, &ready), ret, fail);
+
+  if (sock->sock.fd != -1) {
+    NCCLCHECKGOTO(rasGetNewPollEntry(&sock->pfd), ret, fail);
+    rasPfds[sock->pfd].fd = sock->sock.fd;
+    rasPfds[sock->pfd].events = POLLIN; // Initially we'll just wait for a handshake from the other side.  This also
+                                        // helps the code tell the sides apart.
+    sock->status = RAS_SOCK_CONNECTING;
+    INFO(NCCL_RAS, "RAS new incoming socket connection from %s", ncclSocketToString(&sock->sock.addr, rasLine));
+  }
+
+exit:
+  return ret;
+fail:
+  if (socketInitialized)
+    (void)ncclSocketClose(&sock->sock); // Best effort: a failure here must not replace the error kept in ret.
+  goto exit;
+}
+
+// Hands out a free slot of the rasSockets array (one in RAS_SOCK_CLOSED state), enlarging the array by
+// RAS_INCREMENT entries if there is none.  The slot is returned via *pSock, reset and with fresh timestamps.
+static ncclResult_t getNewSockEntry(struct rasSocket** pSock) {
+  int slot = 0;
+  while (slot < nRasSockets && rasSockets[slot].status != RAS_SOCK_CLOSED)
+    slot++;
+  if (slot == nRasSockets) {
+    // Every slot is taken -- grow the array; the first of the newly added entries will be used.
+    NCCLCHECK(ncclRealloc(&rasSockets, nRasSockets, nRasSockets+RAS_INCREMENT));
+    nRasSockets += RAS_INCREMENT;
+  }
+
+  struct rasSocket* entry = rasSockets+slot;
+  memset(entry, '\0', sizeof(*entry));
+  entry->connIdx = -1;
+  entry->pfd = -1;
+  // All three timestamps start out at the time of creation.
+  entry->createTime = entry->lastSendTime = entry->lastRecvTime = clockNano();
+  *pSock = entry;
+  return ncclSuccess;
+}
+
+// Invoked from the main RAS event loop to handle RAS socket timeouts; lowers *nextWakeup to the next deadline.
+void rasSocksHandleTimeouts(int64_t now, int64_t* nextWakeup) {
+  for (int sockIdx = 0; sockIdx < nRasSockets; sockIdx++) {
+    struct rasSocket* sock = rasSockets+sockIdx;
+
+    if (sock->status == RAS_SOCK_CLOSED)
+      continue;
+
+    // For socket connections that are still being established, give up on the ones that take too long to initialize.
+    if (sock->status == RAS_SOCK_CONNECTING || sock->status == RAS_SOCK_HANDSHAKE) {
+      if (now - sock->createTime > RAS_STUCK_TIMEOUT) {
+        if (sock->connIdx == -1) {
+          INFO(NCCL_RAS, "RAS init timeout error (%lds) on incoming socket connection from %s",
+               (now-sock->createTime)/CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine));
+        } else {
+          struct rasConnection* conn = rasConns+sock->connIdx;
+          INFO(NCCL_RAS, "RAS init timeout error (%lds) on socket connection with %s "
+               "(experiencingDelays %d, startRetryTime %.2fs, socket status %d)",
+               (now-sock->createTime)/CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine),
+               conn->experiencingDelays, (conn->startRetryTime ? (now-conn->startRetryTime)/1e9 : 0.0),
+               sock->status);
+        }
+        rasSocketTerminate(sock, /*finalize*/true);
+        // We may retry later.
+        continue;
+      } else {
+        *nextWakeup = std::min(*nextWakeup, sock->createTime+RAS_STUCK_TIMEOUT);
+      }
+    } // if (sock->status == RAS_SOCK_CONNECTING || sock->status == RAS_SOCK_HANDSHAKE)
+
+    // For sockets that are being terminated, force finalization of the ones that haven't made progress in too long.
+    if (sock->status == RAS_SOCK_TERMINATING) {
+      if (now - std::max(sock->lastSendTime, sock->lastRecvTime) > RAS_STUCK_TIMEOUT) {
+        INFO(NCCL_RAS, "RAS termination stuck timeout error (%lds) on socket connection with %s",
+             (now-std::max(sock->lastSendTime, sock->lastRecvTime)) / CLOCK_UNITS_PER_SEC,
+             ncclSocketToString(&sock->sock.addr, rasLine));
+        rasSocketTerminate(sock, /*finalize*/true);
+        // This socket is presumably already being re-established, if needed.
+        continue;
+      } else {
+        *nextWakeup = std::min(*nextWakeup, std::max(sock->lastSendTime, sock->lastRecvTime)+RAS_STUCK_TIMEOUT);
+      }
+    } // if (sock->status == RAS_SOCK_TERMINATING)
+
+    // Terminate sockets that haven't been used in a good while.  In principle this shouldn't trigger for anything
+    // important due to shorter timeouts on RAS network connections, but in case of weird situations like process
+    // suspend, rasSocketTerminate will do additional checking.
+    if (sock->status == RAS_SOCK_READY) {
+      if (now - std::max(sock->lastSendTime, sock->lastRecvTime) > RAS_IDLE_TIMEOUT) {
+        INFO(NCCL_RAS, "RAS idle timeout (%lds) on socket connection with %s",
+             (now - std::max(sock->lastSendTime, sock->lastRecvTime)) / CLOCK_UNITS_PER_SEC,
+             ncclSocketToString(&sock->sock.addr, rasLine));
+        rasSocketTerminate(sock, /*finalize*/false, /*startRetryOffset*/0, /*retry*/false);
+        // The RAS network timeout handler will terminate the conn it was associated with, if any.
+        continue;
+      } else {
+        *nextWakeup = std::min(*nextWakeup, std::max(sock->lastSendTime, sock->lastRecvTime)+RAS_IDLE_TIMEOUT);
+      }
+    } // if (sock->status == RAS_SOCK_READY)
+  } // for (sockIdx)
+}
+
+// Handles the termination of a RAS socket.
+// We try to do it in stages for established sockets (in READY state), unless finalize is set.  We shut down just
+// the sending side for them and change their state to TERMINATING, so that we can still receive data that may be
+// in the buffers; once we get an EOF when receiving data, we finalize the termination.  Not fully established
+// sockets are terminated immediately as there's no useful data to extract.  See below for startRetryOffset/retry.
+void rasSocketTerminate(struct rasSocket* sock, bool finalize, uint64_t startRetryOffset, bool retry) {
+  assert(sock->status != RAS_SOCK_CLOSED); // NOTE: compiled out, as this file defines NDEBUG.
+  if (sock->connIdx != -1) {
+    struct rasConnection* conn = rasConns+sock->connIdx;
+    // If the sockIdx of the connection points back to us, it means that we are the current socket of this
+    // connection, so we have additional work to do before we can terminate it.
+    if (conn->sockIdx == sock-rasSockets) {
+      // Reset it to indicate there's no valid socket associated with that connection anymore.
+      conn->sockIdx = -1;
+
+      // Don't attempt to retry on sockets that have been unused for so long that the remote peer probably
+      // deliberately closed them.  Make an exception for sockets that are part of the RAS network links.
+      if ((retry &&
+           clockNano() - std::max(sock->lastSendTime, sock->lastRecvTime) < RAS_IDLE_TIMEOUT - RAS_IDLE_GRACE_PERIOD) ||
+          rasLinkFindConn(&rasNextLink, sock->connIdx) != -1 || rasLinkFindConn(&rasPrevLink, sock->connIdx) != -1) {
+        // For connections that were fine until now, the connection-level timeout starts at termination, and possibly
+        // even earlier, depending on what event triggered the termination -- if it was another timeout expiring, then
+        // we need to include that timeout as well.
+        if (conn->startRetryTime == 0) {
+          conn->startRetryTime = conn->lastRetryTime = clockNano() - startRetryOffset;
+        }
+
+        // We also filter through the sendQ, eliminating any messages that won't need to be sent when the socket
+        // connection is re-established (that's essentially the server init and keep-alives).
+        // As ncclIntruQueue can't be iterated, we transfer the content in bulk to a temporary and then filter the
+        // messages as we move them back one-by-one.
+        struct ncclIntruQueue<struct rasMsgMeta, &rasMsgMeta::next> sendQTmp;
+        ncclIntruQueueConstruct(&sendQTmp);
+        ncclIntruQueueTransfer(&sendQTmp, &conn->sendQ);
+        while (struct rasMsgMeta* meta = ncclIntruQueueTryDequeue(&sendQTmp)) {
+          if (meta->msg.type != RAS_MSG_CONNINIT && meta->msg.type != RAS_MSG_CONNINITACK &&
+              meta->msg.type != RAS_MSG_KEEPALIVE) {
+            if (meta->offset != 0) {
+              // Reset the progress of any partially-sent messages (they will need to be resent from the beginning;
+              // in principle that could apply to the first message only).
+              meta->offset = 0;
+            }
+            ncclIntruQueueEnqueue(&conn->sendQ, meta);
+          } else { // RAS_MSG_CONNINIT || RAS_MSG_CONNINITACK || RAS_MSG_KEEPALIVE
+            free(meta);
+          }
+        } // while (meta)
+      } // if (retry on a recently used socket || the connection is part of a RAS link)
+
+      // Stop collectives from waiting for a response over this connection.
+      rasCollsPurgeConn(sock->connIdx);
+    } // if (conn->sockIdx == sock-rasSockets)
+  } // if (sock->connIdx != -1)
+
+  if (sock->status != RAS_SOCK_CONNECTING && sock->connIdx != -1 && !finalize && (rasPfds[sock->pfd].events & POLLIN)) {
+    if (sock->status != RAS_SOCK_TERMINATING) {
+      // The receiving side is still open -- close just the sending side.
+      (void)ncclSocketShutdown(&sock->sock, SHUT_WR);
+      rasPfds[sock->pfd].events &= ~POLLOUT; // Nothing more to send.
+      // The timeout for this socket starts ticking now...
+      sock->lastSendTime = clockNano();
+      sock->status = RAS_SOCK_TERMINATING;
+    }
+    // Else it must be in RAS_SOCK_TERMINATING state already -- in that case we do nothing here and instead
+    // we wait for an EOF on the receiving side or for a timeout.
+  } else {
+    // Either the caller requested finalization or we cannot receive on it.
+    (void)ncclSocketClose(&sock->sock);
+    sock->status = RAS_SOCK_CLOSED;
+    rasPfds[sock->pfd].fd = -1;
+    rasPfds[sock->pfd].events = rasPfds[sock->pfd].revents = 0;
+    sock->pfd = sock->connIdx = -1;
+    sock->recvOffset = sock->recvLength = 0;
+    free(sock->recvMsg);
+    sock->recvMsg = nullptr;
+  }
+}
+
+// Handles a ready socket FD from the main event loop: advances the connection setup, or sends/receives messages.
+void rasSockEventLoop(int sockIdx, int pollIdx) {
+  struct rasSocket* sock = rasSockets+sockIdx;
+
+  if (sock->status == RAS_SOCK_CONNECTING) {
+    int ready;
+    // Socket is not yet fully established. Continue the OS or NCCL-level handshake.
+    if (ncclSocketReady(&sock->sock, &ready) != ncclSuccess) {
+      INFO(NCCL_RAS, "RAS unexpected error from ncclSocketReady; terminating the socket connection with %s",
+           ncclSocketToString(&sock->sock.addr, rasLine));
+      rasSocketTerminate(sock);
+      // We may retry further down.
+    } else {
+      if (ready) {
+        // We can tell the connect-side based on what events is set to.
+        bool connectSide = (rasPfds[pollIdx].events & POLLOUT);
+        (connectSide ? sock->lastSendTime : sock->lastRecvTime) = clockNano();
+        sock->status = RAS_SOCK_HANDSHAKE;
+        if (connectSide) {
+          assert(sock->connIdx != -1); // NOTE: compiled out, as this file defines NDEBUG.
+          if (rasConns[sock->connIdx].sockIdx == sockIdx) {
+            if (rasConnPrepare(rasConns+sock->connIdx) != ncclSuccess) {
+              INFO(NCCL_RAS, "RAS unexpected error from rasConnPrepare; terminating the socket connection with %s",
+                   ncclSocketToString(&sock->sock.addr, rasLine));
+              rasSocketTerminate(sock);
+              // We may retry further down.
+            }
+          } else {
+            // The connection this socket is associated with no longer considers it to be the current one.
+            // This could possibly happen due to a race condition.  Simply terminate it.
+            INFO(NCCL_RAS, "RAS connected with %s via a socket that's no longer current!",
+                 ncclSocketToString(&sock->sock.addr, rasLine));
+            rasSocketTerminate(sock);
+          }
+        } // if (connectSide)
+      } else { // !ready
+        if (sock->sock.state == ncclSocketStateConnecting)
+          rasPfds[sock->pfd].fd = POLL_FD_IGNORE; // Don't poll on this socket before connect().
+      }
+    } // if (ncclSocketReady)
+  } else { // RAS_SOCK_HANDSHAKE || RAS_SOCK_READY || RAS_SOCK_TERMINATING.
+    // The extra test for TERMINATING is there to take care of a race when the handling of one socket
+    // results in another socket being terminated, but one that already has revents waiting from poll.
+    if (sock->status != RAS_SOCK_TERMINATING && (rasPfds[pollIdx].revents & POLLOUT)) {
+      int closed = 0;
+      bool allSent = false;
+      assert(sock->connIdx != -1);
+      struct rasConnection* conn = rasConns+sock->connIdx;
+      assert(conn->sockIdx == sockIdx);
+      if (rasConnSendMsg(conn, &closed, &allSent) != ncclSuccess) {
+        INFO(NCCL_RAS, "RAS unexpected error from rasConnSendMsg; terminating the socket connection with %s",
+             ncclSocketToString(&sock->sock.addr, rasLine));
+        rasSocketTerminate(sock);
+        // We may retry further down.
+      } else if (closed) {
+        INFO(NCCL_RAS, "RAS socket connection with %s closed by peer on send; terminating it",
+             ncclSocketToString(&sock->sock.addr, rasLine));
+        rasSocketTerminate(sock);
+        // We may retry further down.
+      } else {
+        sock->lastSendTime = clockNano();
+        if (allSent)
+          rasPfds[sock->pfd].events &= ~POLLOUT; // Nothing more to send for now.
+      }
+    }
+    if (rasPfds[pollIdx].revents & POLLIN) {
+      struct rasMsg* msg;
+      do { // Keep receiving for as long as rasMsgRecv hands us a message.
+        int closed = 0;
+        msg = nullptr;
+        if (rasMsgRecv(sock, &msg, &closed) != ncclSuccess) {
+          INFO(NCCL_RAS, "RAS unexpected error from rasMsgRecv; terminating the socket connection with %s",
+               ncclSocketToString(&sock->sock.addr, rasLine));
+          rasSocketTerminate(sock, /*finalize*/true);
+          // We may retry further down.
+        } else if (closed) {
+          const char* socketType; // Only used to make the log message below more informative.
+          if (sock->connIdx == -1)
+            socketType = "incoming";
+          else if (rasConns[sock->connIdx].sockIdx != sockIdx)
+            socketType = "old";
+          else if (sock->status == RAS_SOCK_HANDSHAKE)
+            socketType = "new";
+          else
+            socketType = "current";
+          INFO(NCCL_RAS, "RAS %s socket connection with %s closed by peer on receive; terminating it",
+               socketType, ncclSocketToString(&sock->sock.addr, rasLine));
+          rasSocketTerminate(sock, /*finalize*/true);
+          // We may retry further down.
+        } else {
+          sock->lastRecvTime = clockNano();
+          if (msg) {
+            (void)rasMsgHandle(msg, sock);
+            free(msg);
+            // Message handlers can terminate a socket in certain cases; we need to check for
+            // that here so that we don't try to receive from a closed socket.
+            // No handlers are currently believed to create new sockets but better to be safe than sorry
+            // and re-init the sock variable.
+            sock = rasSockets+sockIdx;
+            if (sock->status == RAS_SOCK_CLOSED)
+              break;
+          }
+          if (sock->connIdx != -1) {
+            struct rasConnection* conn = rasConns+sock->connIdx;
+            if (conn->sockIdx == sockIdx && (conn->startRetryTime || conn->experiencingDelays))
+              rasConnResume(conn);
+          }
+        }
+      } while (msg);
+    } // if (POLLIN)
+  } // RAS_SOCK_HANDSHAKE || RAS_SOCK_READY || RAS_SOCK_TERMINATING
+}
+
+
+////////////////////////////////////////////////////////////////
+// Functions related to the handling of RAS network timeouts. //
+////////////////////////////////////////////////////////////////
+
+// Invoked from the main RAS event loop to handle the timeouts of the RAS network links and to garbage-collect
+// connections that are not part of any link.
+void rasNetHandleTimeouts(int64_t now, int64_t* nextWakeup) {
+  // Since a connection can belong to more than one link, a per-connection flag makes sure each one gets its
+  // timeouts handled only once.  After both links are processed, the flag also tells us which connections
+  // belong to a link and which don't.
+  for (int idx = 0; idx < nRasConns; idx++)
+    rasConns[idx].linkFlag = false;
+
+  (void)rasLinkHandleNetTimeouts(&rasNextLink, now, nextWakeup);
+  (void)rasLinkHandleNetTimeouts(&rasPrevLink, now, nextWakeup);
+
+  for (int idx = 0; idx < nRasConns; idx++) {
+    struct rasConnection* candidate = rasConns+idx;
+    if (!candidate->inUse || candidate->linkFlag)
+      continue;
+    // Not part of any link: terminate it if it has neither a socket nor anything left to send.
+    const bool idle = (candidate->sockIdx == -1 && ncclIntruQueueEmpty(&candidate->sendQ));
+    if (idle)
+      rasConnTerminate(candidate);
+  }
+}
+
+// Checks for and handles timeouts at the link level; primarily the keep-alives for link connections.
+static ncclResult_t rasLinkHandleNetTimeouts(struct rasLink* link, int64_t now, int64_t* nextWakeup) {
+  for (int i = 0; i < link->nConns; i++) {
+    struct rasLinkConn* linkConn = link->conns+i;
+    if (linkConn->connIdx != -1) {
+      if (!rasConns[linkConn->connIdx].linkFlag) {
+        rasConnHandleNetTimeouts(linkConn->connIdx, now, nextWakeup);
+        // rasConns may have been reallocated by the above call, which is why we don't have a conn variable here.
+        // For the same reason we re-init linkConn.
+        linkConn = link->conns+i;
+        rasConns[linkConn->connIdx].linkFlag = true;
+      }
+    } else if (i == 0 && link->lastUpdatePeersTime != 0) {
+      // This triggers when rasLinkReinitConns didn't create the primary connection because we have a higher address
+      // than the peer.  If that peer fails to initiate within RAS_CONNECT_WARN, we need to take action.
+      if (now - link->lastUpdatePeersTime > RAS_CONNECT_WARN) {
+        INFO(NCCL_RAS, "RAS peer connect timeout warning (%lds) on socket connection from %s",
+             (now-link->lastUpdatePeersTime) / CLOCK_UNITS_PER_SEC,
+             ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine));
+        NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->connIdx)); // Returns early on failure.
+        if (linkConn->connIdx != -1) {
+          rasConns[linkConn->connIdx].linkFlag = true;
+        }
+        // We used to connect to the first fallback but I think trying to connect to the calculated primary first
+        // in this case is more intuitive.
+        //(void)rasLinkTryFallback(link, -1);
+        link->lastUpdatePeersTime = 0;
+      } else {
+        *nextWakeup = std::min(*nextWakeup, link->lastUpdatePeersTime+RAS_CONNECT_WARN);
+      }
+    } // else if (i == 0 && link->lastUpdatePeersTime != 0)
+  } // for (i)
+
+  return ncclSuccess;
+}
+
+// Sends keep-alives over a link connection as needed and reacts to the peer going quiet (a warning, then an error).
+static void rasConnHandleNetTimeouts(int connIdx, int64_t now, int64_t* nextWakeup) { // connIdx indexes rasConns.
+  struct rasConnection* conn = rasConns+connIdx;
+  if (conn->sockIdx != -1) {
+    struct rasSocket* sock = rasSockets+conn->sockIdx;
+
+    if (sock->status == RAS_SOCK_READY) { // Sockets in any other state are not handled here.
+      // Send a regular keep-alive message if we haven't sent anything in a while and we don't have anything queued.
+      if (ncclIntruQueueEmpty(&conn->sendQ)) {
+        if (now - sock->lastSendTime > RAS_KEEPALIVE_INTERVAL) {
+          rasConnSendKeepAlive(conn);
+        } else {
+          *nextWakeup = std::min(*nextWakeup, sock->lastSendTime+RAS_KEEPALIVE_INTERVAL);
+        }
+      }
+
+      // For short timeouts print a warning but also pessimistically immediately try to establish fallback connections.
+      if (now - sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_WARN) {
+        if (!conn->experiencingDelays) { // Warn and add fallbacks just once; rasConnResume clears the flag again.
+          INFO(NCCL_RAS, "RAS keep-alive timeout warning (%lds) on socket connection with %s",
+               (now-sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine));
+
+          // At this point, it's mostly just a precaution; we will continue with the primary connection until
+          // RAS_PEER_DEAD_TIMEOUT expires.
+          conn->experiencingDelays = true;
+          (void)rasLinkAddFallback(&rasNextLink, connIdx);
+          (void)rasLinkAddFallback(&rasPrevLink, connIdx);
+          // rasConns and rasSockets may have been reallocated by the above calls.
+          conn = rasConns+connIdx;
+          sock = rasSockets+conn->sockIdx;
+
+          // Stop collectives from waiting for a response over it.
+          rasCollsPurgeConn(connIdx);
+        }
+      } else {
+        *nextWakeup = std::min(*nextWakeup, sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_WARN);
+      }
+
+      // For long timeouts we need to act: the socket gets terminated.
+      if (now - sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_ERROR) {
+        INFO(NCCL_RAS, "RAS keep-alive timeout error (%lds) on socket connection with %s",
+             (now-sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine));
+        rasSocketTerminate(sock, /*finalize*/true, RAS_KEEPALIVE_TIMEOUT_ERROR);
+        *nextWakeup = now; // Retry will be in the next iteration of the main loop so ensure we don't wait.
+      } else {
+        *nextWakeup = std::min(*nextWakeup, sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_ERROR);
+      }
+    } // if (sock->status == RAS_SOCK_READY)
+  } // if (conn->sockIdx != -1)
+}
+
+// Allocates, fills in and enqueues a keep-alive message for the peer at the other end of conn.
+static void rasConnSendKeepAlive(struct rasConnection* conn, bool nack) {
+  struct rasMsg* kaMsg = nullptr;
+  int kaLen = rasMsgLength(RAS_MSG_KEEPALIVE);
+  if (rasMsgAlloc(&kaMsg, kaLen) != ncclSuccess)
+    return; // No message is sent if the allocation fails.
+
+  kaMsg->type = RAS_MSG_KEEPALIVE;
+  kaMsg->keepAlive.peersHash = rasPeersHash;
+  kaMsg->keepAlive.deadPeersHash = rasDeadPeersHash;
+  kaMsg->keepAlive.nack = (nack ? 1 : 0);
+
+  // Set a linkMask bit for each of our links in which this connection is a non-external member.
+  int nextIdx = rasLinkFindConn(&rasNextLink, conn-rasConns);
+  if (nextIdx != -1 && !rasNextLink.conns[nextIdx].external)
+    kaMsg->keepAlive.linkMask |= 2; // Our rasNextLink should be the peer's rasPrevLink.
+  int prevIdx = rasLinkFindConn(&rasPrevLink, conn-rasConns);
+  if (prevIdx != -1 && !rasPrevLink.conns[prevIdx].external)
+    kaMsg->keepAlive.linkMask |= 1; // Our rasPrevLink should be the peer's rasNextLink.
+
+  (void)clock_gettime(CLOCK_REALTIME, &kaMsg->keepAlive.realTime);
+  rasConnEnqueueMsg(conn, kaMsg, kaLen);
+}
+
+// Handles an incoming keep-alive: updates link membership and travel-time stats, and checks the peers hashes.
+ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* sock) {
+  struct timespec currentTime;
+  int64_t travelTime; // In nanoseconds; derived from the sender's and our own CLOCK_REALTIME readings.
+  int peerIdx;
+
+  assert(sock->connIdx != -1);
+  struct rasConnection* conn = rasConns+sock->connIdx;
+  SYSCHECK(clock_gettime(CLOCK_REALTIME, &currentTime), "clock_gettime");
+  travelTime = (currentTime.tv_sec-msg->keepAlive.realTime.tv_sec)*1000*1000*1000 +
+    (currentTime.tv_nsec-msg->keepAlive.realTime.tv_nsec);
+
+  if (msg->keepAlive.peersHash != conn->lastRecvPeersHash) { // Remember the hashes last received from this peer.
+    conn->lastRecvPeersHash = msg->keepAlive.peersHash;
+  }
+  if (msg->keepAlive.deadPeersHash != conn->lastRecvDeadPeersHash) {
+    conn->lastRecvDeadPeersHash = msg->keepAlive.deadPeersHash;
+  }
+
+  // Make sure that the connection is part of the appropriate links forming the RAS network.  In particular, this
+  // will add any externally-requested connections to the appropriate links (or remove existing ones, if no longer
+  // needed).
+  peerIdx = rasPeerFind(&conn->addr);
+  // Note: it's possible for peerIdx to be -1 at this point if, due to races, the keepAlive arrives before
+  // the peers update.
+  (void)rasLinkUpdateConn(&rasNextLink, (msg->keepAlive.linkMask & 1) ? sock->connIdx : -1, peerIdx, /*external*/true);
+  (void)rasLinkUpdateConn(&rasPrevLink, (msg->keepAlive.linkMask & 2) ? sock->connIdx : -1, peerIdx, /*external*/true);
+
+  // If the keep-alive message is from a peer that doesn't actually need this connection (i.e., for that peer the
+  // connection is just an external fallback), we should check if *we* still need it.  It might be that we don't,
+  // and because we stopped sending the keep-alives, our peer doesn't know about it.  rasLinkUpdateConn calls above
+  // will have wiped any external fallbacks, so anything that remains must be needed.
+  if (!msg->keepAlive.nack && msg->keepAlive.linkMask == 0) {
+    if (rasLinkFindConn(&rasNextLink, sock->connIdx) == -1 && rasLinkFindConn(&rasPrevLink, sock->connIdx) == -1) {
+      // We don't need this connection either.  Notify the peer about it.  To avoid an infinite loop, we set the
+      // special nack flag in the message to distinguish it from regular keep-alives.
+      rasConnSendKeepAlive(conn, /*nack*/true);
+    }
+  }
+
+  if (conn->travelTimeMin > travelTime) // Maintain the min/max/sum/count travel-time statistics of the connection.
+    conn->travelTimeMin = travelTime;
+  if (conn->travelTimeMax < travelTime)
+    conn->travelTimeMax = travelTime;
+  conn->travelTimeSum += travelTime;
+  conn->travelTimeCount++;
+
+  if (msg->keepAlive.peersHash != rasPeersHash || msg->keepAlive.deadPeersHash != rasDeadPeersHash) {
+    // This could happen due to a short-lived race condition between the peers propagation
+    // process and the periodic keep-alive messages (perhaps we'll see it regularly at scale?).
+    // Just in case there's some unforeseen problem with the peers propagation though, exchange with the
+    // remote to get everybody in sync.
+    INFO(NCCL_RAS, "RAS keepAlive hash mismatch from %s (peersHash 0x%lx, deadPeersHash 0x%lx)",
+         ncclSocketToString(&sock->sock.addr, rasLine), msg->keepAlive.peersHash, msg->keepAlive.deadPeersHash);
+    INFO(NCCL_RAS, "RAS my peersHash 0x%lx, deadPeersHash 0x%lx", rasPeersHash, rasDeadPeersHash);
+    NCCLCHECK(rasConnSendPeersUpdate(conn, rasPeers, nRasPeers)); // Send our complete peers array to the remote.
+  }
+  return ncclSuccess;
+}
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Functions related to the RAS links and recovery from connection failures. //
+///////////////////////////////////////////////////////////////////////////////
+
+// Checks if the connection (that we just detected some problem with) is part of the RAS link and if so,
+// tries to initiate a(nother) fallback connection if needed.
+// External connections are generally ignored by this whole process: in particular, we don't add fallbacks for
+// timing out external connections.  However, we will use an active external connection if it would be a better
+// option than whatever we can come up with.
+static ncclResult_t rasLinkAddFallback(struct rasLink* link, int connIdx) {
+  int peerIdx = -1; // Peer of connIdx's entry in this link (-1 until found).
+  int linkIdx = -1; // Index of connIdx's entry in link->conns (-1 until found); reused for new entries below.
+  int firstExtLinkIdx = -1; // Index of the first external entry that is not experiencing delays (-1 if none).
+  int newPeerIdx; // Peer of the fallback candidate currently under consideration.
+
+  // First check if the connection is part of this link.  In the process also check if any of the link's connections
+  // might be active -- if so, there's no need to initiate any more fallbacks and we can bail out.
+  for (int i = 0; i < link->nConns; i++) {
+    struct rasLinkConn* linkConn = link->conns+i;
+
+    if (linkConn->peerIdx == -1) {
+      // Such elements are always at the very end of the array and we can't use them so we can just as well break.
+      break;
+    }
+
+    // Check for any other connection that might be a viable fallback (basically, anything that is not experiencing
+    // delays).
+    if (linkConn->connIdx != -1 && linkConn->connIdx != connIdx) {
+      struct rasConnection* conn = rasConns+linkConn->connIdx;
+      if (!conn->experiencingDelays) {
+        if (!linkConn->external)
+          goto exit; // We don't need to do anything if there's a non-external connection.
+        else if (linkConn->peerIdx != -1) { // Always true at this point, given the break at the top of the loop.
+          // Record the location of the first potentially viable external connection in the chain; we may prefer it
+          // over anything we can come up with.
+          if (firstExtLinkIdx == -1)
+            firstExtLinkIdx = i;
+          if (linkIdx != -1)
+            break; // Break out of the loop if we already have all the data we might need.
+        } // linkConn->external && linkConn->peerIdx != -1
+      } // if (!conn->experiencingDelays)
+    } // if (linkConn->connIdx != -1)
+
+    if (linkConn->connIdx == connIdx) {
+      if (linkConn->external)
+        goto exit; // We don't add fallbacks for external connections...
+      peerIdx = linkConn->peerIdx;
+      linkIdx = i;
+      // We are not breaking out of the loop here because we want to check for active connections on *all* potentially
+      // viable elements (in particular, there could be some external ones beyond this one).
+    }
+  }
+
+  if (linkIdx == -1)
+    goto exit; // The connection is not part of this link, so there is nothing to do.
+
+  // We found an existing element so the connection is part of the link.  No existing non-external connections of this
+  // link are active, so a fallback is needed.
+  assert(peerIdx != -1);
+  newPeerIdx = rasLinkCalculatePeer(link, peerIdx, /*isFallback*/linkIdx > 0);
+  // In principle we want to add (at most) one fallback.  However, if the found fallback connection already exists
+  // and is also experiencing delays, we need to keep iterating.
+  while (newPeerIdx != -1) {
+    int newConnIdx = rasConnFind(&rasPeers[newPeerIdx].addr); // -1 if no connection to that peer exists yet.
+    // If we previously found a potential external fallback connection, check if it's better than what we just found.
+    if (firstExtLinkIdx != -1) {
+      linkIdx = -1;
+      // Calculate the index that the newly found fallback would have (pretend mode).
+      NCCLCHECK(rasLinkUpdateConn(link, newConnIdx, newPeerIdx, /*external*/false, /*insert*/true, /*pretend*/true,
+                                  &linkIdx));
+      assert(linkIdx != -1);
+      if (firstExtLinkIdx < linkIdx) {
+        // The external connection *is* better -- use it as a fallback instead and be done.
+        link->conns[firstExtLinkIdx].external = false;
+        goto exit;
+      }
+    }
+    NCCLCHECK(rasLinkUpdateConn(link, newConnIdx, newPeerIdx, /*external*/false, /*insert*/true, /*pretend*/false,
+                                &linkIdx));
+    if (firstExtLinkIdx != -1 && linkIdx <= firstExtLinkIdx)
+      firstExtLinkIdx++; // Adjust if we inserted a new conn at a lower index.
+
+    INFO(NCCL_RAS, "RAS link %d: %s fallback connection %d with %s",
+         link->direction, (newConnIdx == -1 ? "opening new" : "calculated existing"),
+         linkIdx, ncclSocketToString(&rasPeers[newPeerIdx].addr, rasLine));
+    // Note that we don't follow here our convention of "lower address is the one establishing connections" --
+    // that convention is for optimizing regular operations, but we don't want to take chances during fault
+    // recovery. It may temporarily result in duplicate connections, but we have a mechanism to deal with those.
+    if (newConnIdx == -1)
+      NCCLCHECK(rasConnCreate(&rasPeers[newPeerIdx].addr, &link->conns[linkIdx].connIdx));
+
+    struct rasConnection* conn = rasConns+link->conns[linkIdx].connIdx; // Derived only after rasConnCreate.
+    // If the fallback connection is also experiencing delays, we need to keep trying.
+    if (!conn->experiencingDelays)
+      break;
+    INFO(NCCL_RAS, "RAS connection experiencingDelays %d, startRetryTime %.2fs, socket status %d",
+         conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0),
+         (conn->sockIdx == -1 ? -1 : rasSockets[conn->sockIdx].status));
+
+    newPeerIdx = rasLinkCalculatePeer(link, newPeerIdx, /*isFallback*/true); // Move on to the next candidate.
+  }
+  if (newPeerIdx == -1)
+      INFO(NCCL_RAS, "RAS link %d: no more fallbacks to add (nConns %d)", link->direction, link->nConns);
+exit:
+  return ncclSuccess;
+}
+
+// Called when a message arrives over a connection that was just activated or had been experiencing delays.
+// Resets the connection's delay and retry state and drops the link fallbacks that are no longer needed.
+static void rasConnResume(struct rasConnection* conn) {
+  // There is nothing to resume unless the connection has a socket in the ready state.
+  if (conn->sockIdx == -1 || rasSockets[conn->sockIdx].status != RAS_SOCK_READY)
+    return;
+
+  INFO(NCCL_RAS, "RAS %s connection with %s (sendQ %sempty, experiencingDelays %d, startRetryTime %.2fs)",
+       (conn->experiencingDelays && conn->startRetryTime == 0 ? "recovered" : "established"),
+       ncclSocketToString(&conn->addr, rasLine), (ncclIntruQueueEmpty(&conn->sendQ) ? "" : "not "),
+       conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0));
+
+  conn->startRetryTime = conn->lastRetryTime = 0;
+  conn->experiencingDelays = false;
+  rasLinkSanitizeFallbacks(&rasNextLink);
+  rasLinkSanitizeFallbacks(&rasPrevLink);
+  // If messages are still queued, ask poll to report when the socket becomes writable.
+  if (!ncclIntruQueueEmpty(&conn->sendQ))
+    rasPfds[rasSockets[conn->sockIdx].pfd].events |= POLLOUT;
+}
+
+// Purges all fallbacks of a link once its primary connection is ready and no longer experiencing delays.
+static void rasLinkSanitizeFallbacks(struct rasLink* link) {
+  if (link->nConns <= 0 || link->conns[0].connIdx == -1)
+    return; // No primary connection to speak of.
+  struct rasConnection* primary = rasConns+link->conns[0].connIdx;
+  if (primary->sockIdx == -1 || rasSockets[primary->sockIdx].status != RAS_SOCK_READY || primary->experiencingDelays)
+    return; // The primary is not (yet) in good shape, so the fallbacks stay.
+
+  // The primary is good: log and drop every fallback (external ones get recreated via the keepAlive messages).
+  for (int fallbackIdx = 1; fallbackIdx < link->nConns; fallbackIdx++) {
+    INFO(NCCL_RAS, "RAS link %d: dropping %sfallback connection %d with %s",
+         link->direction, (link->conns[fallbackIdx].external ? "external " : ""), fallbackIdx,
+         ncclSocketToString(&rasConns[link->conns[fallbackIdx].connIdx].addr, rasLine));
+  }
+  link->nConns = 1;
+  link->lastUpdatePeersTime = 0;
+}
+
+// Attempts to drop a connection from a link.  linkIdx is the entry's index in link->conns, or -1 to look it up.
+static void rasLinkDropConn(struct rasLink* link, int connIdx, int linkIdx) {
+  if (linkIdx == -1)
+    linkIdx = rasLinkFindConn(link, connIdx);
+  if (linkIdx != -1) { // Nothing happens if the connection is not part of the link.
+    if (linkIdx == 0) {
+      INFO(NCCL_RAS, "RAS link %d: dropping primary connection with %s",
+           link->direction, ncclSocketToString(&rasConns[connIdx].addr, rasLine));
+    } else {
+      INFO(NCCL_RAS, "RAS link %d: dropping %sfallback connection %d with %s",
+           link->direction, (link->conns[linkIdx].external ? "external " : ""), linkIdx,
+           ncclSocketToString(&rasConns[connIdx].addr, rasLine));
+    }
+    memmove(link->conns+linkIdx, link->conns+linkIdx+1, (link->nConns-(linkIdx+1))*sizeof(*link->conns));
+    if (link->nConns > 1)
+      link->nConns--;
+    else { // The only entry was dropped: nConns stays at 1 and entry 0 is reset to an empty placeholder.
+      link->conns[0].peerIdx = link->conns[0].connIdx = -1;
+    }
+
+    if (linkIdx == 0) { // The primary was dropped, so the former first fallback (if any) is at index 0 now.
+      // First ensure that the conn becoming the primary is not marked as external (we don't want to lose it if
+      // the remote peer loses interest in it).
+      link->conns[0].external = false;
+      if (link->conns[0].connIdx != -1) {
+        INFO(NCCL_RAS, "RAS link %d: former fallback connection 1 with %s is the new primary",
+             link->direction, ncclSocketToString(&rasConns[link->conns[0].connIdx].addr, rasLine));
+      }
+      rasLinkSanitizeFallbacks(link); // Drops the remaining fallbacks if the new primary is already in good shape.
+    }
+  }
+}
+
+// Looks for connIdx among the entries of the link.
+// Returns the index of the first matching entry, or -1 if the connection is not a member of this link.
+static int rasLinkFindConn(const struct rasLink* link, int connIdx) {
+  int linkIdx = 0;
+  // Advance until we hit a matching entry or run out of entries.
+  while (linkIdx < link->nConns && link->conns[linkIdx].connIdx != connIdx)
+    linkIdx++;
+  return (linkIdx < link->nConns ? linkIdx : -1);
+}
+
+// Note: the behavior of this function has become super-complex and so it should be considered for refactoring.
+// Searches for and updates an entry in a RAS network link.  The conns array is de-facto sorted by peerIdx: it is
+// ordered by preference, though peerIdx values can wrap around (given the ring/torus topology) and they can also
+// be -1 (the latter are stored at the end).
+// external provides an updated value for the entry's external field.  A false value, if requested, is always set;
+// a true value, however, is only set if a new entry is added (external == true implies insert), i.e., if an entry
+// already exists and the function is invoked with external == true, the new value will be ignored.
+// If insert is set, it will, if necessary, insert a new entry if one is not already there.
+// If pretend is set, it will not modify the array and will just set *pLinkIdx as appropriate.
+// pLinkIdx is a pointer to an (optional) result where the index of the added/updated entry is stored.
+// -1 can be passed as peerIdx if unknown (possible in case of race conditions, and only if external).
+// -1 can be passed as connIdx if unknown or, if insert is *not* set, to indicate that the entry is to be removed
+// (the entry's external must match the argument external for it to be removed).
+ncclResult_t rasLinkUpdateConn(struct rasLink* link, int connIdx, int peerIdx, bool external, bool insert,
+                               bool pretend, int* pLinkIdx) {
+  int i, oldLinkIdx = -1;
+
+  if (external && connIdx != -1)
+    insert = true;
+
+  if (connIdx != -1) {
+    // Start by checking if we already have an element with this connIdx.
+    oldLinkIdx = rasLinkFindConn(link, connIdx);
+    if (oldLinkIdx != -1) {
+      struct rasLinkConn* linkConn = link->conns+oldLinkIdx;
+      if (linkConn->peerIdx != -1)
+        assert(linkConn->peerIdx == peerIdx);
+
+      if (linkConn->peerIdx == peerIdx) {
+        if (!external && !pretend)
+          linkConn->external = false; // Ensure that external is cleared if so requested.
+        if (pLinkIdx)
+          *pLinkIdx = oldLinkIdx;
+        goto exit; // Nothing more to do if both connIdx and peerIdx are up to date.
+      }
+
+      // Otherwise (linkConn->peerIdx == -1 && peerIdx != -1) we have a conn that, due to -1 peerIdx, is in a wrong
+      // place in the array -- we need to find the right spot.  linkConn->peerIdx == -1 can only happen for external
+      // connections.
+      assert(external);
+    }
+  }
+
+  if (peerIdx != -1) {
+    // Search for the right spot in the conns array.
+    for (i = 0; i < link->nConns; i++) {
+      struct rasLinkConn* linkConn = link->conns+i;
+      if (peerIdx != -1 && linkConn->peerIdx == peerIdx) {
+        // The exact conn element already exists.
+        if (connIdx == -1 && !insert) {
+          // Drop the connection from the link.
+          if (linkConn->external == external) {
+            if (!pretend)
+              rasLinkDropConn(link, linkConn->connIdx, i);
+            else if (pLinkIdx)
+              *pLinkIdx = i;
+          }
+        } else { // connIdx != -1 || insert
+          if (!pretend) {
+            if (linkConn->connIdx != -1)
+              assert(linkConn->connIdx == connIdx);
+            else
+              linkConn->connIdx = connIdx;
+            if (!external)
+              linkConn->external = false; // Ensure that external is cleared if so requested.
+            if (i == 0) {
+              // We received a connection from the remote peer that matches the primary connection we've been
+              // waiting for.
+              rasLinkSanitizeFallbacks(link);
+            }
+          } // if (!pretend)
+          if (pLinkIdx)
+            *pLinkIdx = i;
+        } // connIdx != -1 || insert
+
+        goto exit;
+      } // if (peerIdx != -1 && linkConn->peerIdx == peerIdx)
+      if (!insert)
+        continue;
+      // Ensure that the i-1 index is also valid.
+      if (i == 0)
+        continue;
+      // Conns with peerIdx == -1 are stored at the end, so anything else needs to go before them.
+      if (peerIdx != -1 && linkConn->peerIdx == -1)
+        break;
+      // Detect a roll-over and handle it specially.
+      if (link->direction * (link->conns[i-1].peerIdx - linkConn->peerIdx) > 0) {
+        if (link->direction * (peerIdx - link->conns[i-1].peerIdx) > 0 ||
+            link->direction * (peerIdx - linkConn->peerIdx) < 0)
+          break;
+      } else { // Regular, monotonic case with the peerIdx value between two existing elements.
+        if (link->direction * (peerIdx - link->conns[i-1].peerIdx) > 0 &&
+            link->direction * (peerIdx - linkConn->peerIdx) < 0)
+          break;
+      }
+    } // for (i)
+  } else {
+    // If peerIdx == -1, insert the new element at the very end.  This can only happen for external connections.
+    assert(external && oldLinkIdx == -1);
+    i = link->nConns;
+  }
+  if (!insert)
+    goto exit;
+
+  // i holds the index at which to insert a new element.
+  if (pretend) {
+    if (pLinkIdx)
+      *pLinkIdx = i;
+    goto exit;
+  }
+
+  if (oldLinkIdx == -1) {
+    struct rasLinkConn* linkConn;
+    if (link->nConns == link->connsSize) {
+      NCCLCHECK(ncclRealloc(&link->conns, link->connsSize, link->connsSize+RAS_INCREMENT));
+      link->connsSize += RAS_INCREMENT;
+    }
+    linkConn = link->conns+i;
+    // Shift existing conns with indices >= i to make room for the new one.
+    memmove(linkConn+1, linkConn, (link->nConns-i)*sizeof(*link->conns));
+    linkConn->peerIdx = peerIdx;
+    linkConn->connIdx = connIdx;
+    linkConn->external = external;
+    if (external) {
+      INFO(NCCL_RAS, "RAS link %d: adding external fallback connection %d with %s", link->direction, i,
+           ncclSocketToString((connIdx != -1 ? &rasConns[connIdx].addr : &rasPeers[peerIdx].addr), rasLine));
+    }
+    link->nConns++;
+  }
+  else { // oldLinkIdx > -1
+    // We already have the conn, we just need to move it to a new spot.
+    struct rasLinkConn* linkConn = link->conns+i;
+    assert(i <= oldLinkIdx); // We can only get here if linkConn->peerIdx == -1 && peerIdx != -1.
+    if (i != oldLinkIdx) {
+      struct rasLinkConn tmp;
+      struct rasLinkConn* linkConnNext = link->conns+i+1; // Just to silence the compiler.
+      // Move the conn from index oldLinkIdx to a (lower) index i, shifting the conns in [i, oldLinkIdx) up by one.
+      memcpy(&tmp, link->conns+oldLinkIdx, sizeof(tmp));
+      memmove(linkConnNext, linkConn, (oldLinkIdx-i)*sizeof(*linkConn));
+      memcpy(linkConn, &tmp, sizeof(*linkConn));
+    }
+    linkConn->peerIdx = peerIdx; // The peer is known now; record it, or the entry keeps -1 despite its new position.
+    if (!external)
+      linkConn->external = false; // Ensure that external is cleared if so requested.
+  } // oldLinkIdx > -1
+  if (pLinkIdx)
+    *pLinkIdx = i;
+exit:
+  return ncclSuccess;
+}
diff --git a/src/register.cc b/src/register.cc
deleted file mode 100644
index c4ca4b4a0c..0000000000
--- a/src/register.cc
+++ /dev/null
@@ -1,204 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "argcheck.h" // Need some checks here since we access comm
-#include "nccl.h"
-#include "comm.h"
-#include "net.h"
-#include "register.h"
-#include "transport.h"
-
-ncclResult_t ncclNetDeregister(struct ncclComm* comm, struct ncclReg* reg) {
-  struct ncclRegCache* cache = &comm->regCache;
-  ncclDebugNoWarn = NCCL_NET;
-  for (int d=0; d<reg->nDevs; d++) {
-    if (reg->handles[d] != NULL) NCCLCHECK(comm->ncclNet->deregMr(cache->sComms[reg->devs[d]], reg->handles[d]));
-  }
-  reg->nDevs = 0;
-  free(reg->handles);
-  reg->handles = NULL;
-  ncclDebugNoWarn = 0;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclNetRegister(struct ncclComm* comm, void* addr, size_t size, struct ncclReg* reg) {
-  struct ncclRegCache* cache = &comm->regCache;
-  int netCount = 0;
-  if (comm->topo != NULL) NCCLCHECK(ncclTopoGetNetCount(comm->topo, &netCount));
-  if (netCount == 0) return ncclSuccess;
-
-  ncclResult_t ret = ncclSuccess;
-
-  // Find local devices for p2p operations
-  for (int c=0; c<comm->p2pnChannels; c++) {
-    int dev;
-    if (ncclTopoGetLocalNet(comm->topo, comm->rank, c, NULL, &dev) != ncclSuccess) goto end; // No local net
-    ncclNetProperties_t props;
-    NCCLCHECKGOTO(comm->ncclNet->getProperties(dev, &props), ret, end);
-    if (props.regIsGlobal == 0) { // We need to be sure all NICs support global registration.
-      reg->nDevs = 0;
-      break;
-    }
-    int found = 0;
-    for (int d=0; d<reg->nDevs; d++) if (reg->devs[d] == dev) found = 1;
-    if (!found) reg->devs[reg->nDevs++] = dev;
-  }
-
-  NCCLCHECKGOTO(ncclCalloc(&reg->handles, reg->nDevs), ret, end);
-
-  ncclDebugNoWarn = NCCL_NET;
-  for (int d=0; d<reg->nDevs; d++) {
-    int dev = reg->devs[d];
-    reg->handles[d] = NULL;
-
-    if (cache->sComms[dev] == NULL) {
-      // Create a loopback network comm object for that device to register the buffers.
-      void *lComm = NULL;
-      ncclNetHandle_t netHandle;
-      bool connected = false;
-      NCCLCHECKGOTO(comm->ncclNet->listen(dev, &netHandle, &lComm), ret, end);
-      while (!connected) {
-        if (*comm->abortFlag) {
-          goto end;
-        }
-        if (cache->sComms[dev] == NULL)
-          NCCLCHECKGOTO(comm->ncclNet->connect(dev, &netHandle, cache->sComms+dev, NULL), ret, end);
-        if (cache->rComms[dev] == NULL)
-          NCCLCHECKGOTO(comm->ncclNet->accept(lComm, cache->rComms+dev, NULL), ret, end);
-        connected = (cache->rComms[dev] != NULL) && (cache->sComms[dev] != NULL);
-      }
-      NCCLCHECK(comm->ncclNet->closeListen(lComm));
-    }
-    if (comm->ncclNet->regMr(cache->sComms[dev], addr, size, NCCL_PTR_CUDA, reg->handles+d) != ncclSuccess) {
-      reg->handles[d] = NULL;
-      NCCLCHECK(ncclNetDeregister(comm, reg));
-      reg->nDevs = 0;
-      goto end;
-    }
-  }
-end:
-  INFO(NCCL_INIT, "Register ptr %p size %ld on %d net devices", addr, size, reg->nDevs);
-  ncclDebugNoWarn = 0;
-  if (ret != ncclSuccess) NCCLCHECK(ncclNetDeregister(comm, reg));
-  return ret;
-}
-
-ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg) {
-  struct ncclRegCache* cache = &comm->regCache;
-  uintptr_t pageSize = cache->pageSize;
-  uintptr_t addr = (uintptr_t)data & -pageSize;
-  size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
-
-  *reg = NULL;
-  for (int slot=0; /*true*/; slot++) {
-    if (slot == cache->population || addr < cache->slots[slot]->addr) return ncclSuccess;
-    if ((addr >= cache->slots[slot]->addr) &&
-        ((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) {
-      *reg = cache->slots[slot];
-      return ncclSuccess;
-    }
-  }
-}
-NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1);
-
-ncclResult_t ncclRegister(struct ncclComm* comm, void* data, size_t size, void** handle) {
-  if (!ncclParamLocalRegister()) {
-    *handle = NULL;
-    return ncclSuccess;
-  }
-  INFO(NCCL_REG, "register comm %p buffer %p size %zi", comm, data, size);
-  struct ncclRegCache* cache = &comm->regCache;
-  uintptr_t pageSize = cache->pageSize;
-  uintptr_t addr = (uintptr_t)data & -pageSize;
-  size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
-  for (int slot=0; /*true*/; slot++) {
-    if ((slot == cache->population) || (addr < cache->slots[slot]->addr)) {
-      if (cache->population == cache->capacity) { // must grow cache
-        cache->capacity = cache->capacity < 32 ? 32 : 2*cache->capacity;
-        NCCLCHECK(ncclRealloc(&cache->slots, cache->population, cache->capacity));
-      }
-      memmove(cache->slots+slot+1, cache->slots+slot, (cache->population-slot)*sizeof(struct ncclReg*));
-      NCCLCHECK(ncclCalloc(cache->slots+slot, 1));
-      struct ncclReg* regSlot = cache->slots[slot];
-      regSlot->addr = addr;
-      regSlot->pages = pages;
-      regSlot->refs = 1;
-      NCCLCHECK(ncclNetRegister(comm, (void*)addr, pages*pageSize, regSlot));
-      regSlot->state |= NET_REG_COMPLETE;
-      cache->population += 1;
-      *handle = regSlot;
-      return ncclSuccess;
-    } else if ((addr >= cache->slots[slot]->addr) &&
-        ((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) {
-      cache->slots[slot]->refs++;
-      *handle = cache->slots[slot];
-      return ncclSuccess;
-    }
-  }
-}
-
-ncclResult_t ncclRegCleanup(struct ncclComm* comm) {
-  struct ncclRegCache* cache = &comm->regCache;
-  for (int i=0; i<cache->population; i++) {
-    INFO(NCCL_INIT, "Cleanup buffer %p pages %lx", (void*)cache->slots[i]->addr, cache->slots[i]->pages);
-    NCCLCHECK(ncclNetDeregister(comm, cache->slots[i]));
-    if (cache->slots[i]->state & NVLS_REG_COMPLETE) NCCLCHECK(ncclNvlsDeregBuffer(&cache->slots[i]->mcHandle, cache->slots[i]->regAddr, cache->slots[i]->dev, cache->slots[i]->regSize));
-    free(cache->slots[i]);
-  }
-  free(cache->slots);
-  for (int d=0; d<MAXCHANNELS; d++) {
-    if (cache->sComms[d]) NCCLCHECK(comm->ncclNet->closeSend(cache->sComms[d]));
-    if (cache->rComms[d]) NCCLCHECK(comm->ncclNet->closeRecv(cache->rComms[d]));
-  }
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle);
-ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) {
-  NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm"));
-  if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(buff, comm, "buff", "ncclCommRegister"));
-  NCCLCHECK(ncclRegister(comm, buff, size, handle));
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle);
-ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) {
-  NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm"));
-  struct ncclReg* reg = (struct ncclReg*)handle;
-  struct ncclRegCache* cache = &comm->regCache;
-  int slot;
-  int saveDev;
-  if (handle == NULL) goto exit;
-  CUDACHECK(cudaGetDevice(&saveDev));
-  CUDACHECK(cudaSetDevice(comm->cudaDev));
-  for (slot=0; slot<cache->population && cache->slots[slot] != reg; slot++);
-  if (slot == cache->population) {
-    WARN("Deregister: Could not find handle");
-    return ncclInvalidUsage;
-  }
-  if (--reg->refs) return ncclSuccess;
-  NCCLCHECK(ncclNetDeregister(comm, reg));
-  if (reg->state & NVLS_REG_COMPLETE) {
-    NCCLCHECK(ncclNvlsDeregBuffer(&reg->mcHandle, reg->regAddr, reg->dev, reg->regSize));
-    reg->regAddr = (CUdeviceptr)NULL;
-  }
-  if (reg->state & COLLNET_REG_COMPLETE) {
-    NCCLCHECK(ncclCollnetDeregBuffer(comm, reg->collnetProxyconn, reg->collnetHandle));
-  }
-  if (reg->state & IPC_REG_COMPLETE) {
-    for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; ++i)
-      if (reg->ipcInfos[i])
-        NCCLCHECK(ncclIpcDeregBuffer(comm, reg->ipcInfos[i]));
-    if (reg->regIpcAddrs.hostPeerRmtAddrs) free(reg->regIpcAddrs.hostPeerRmtAddrs);
-    if (reg->regIpcAddrs.devPeerRmtAddrs) NCCLCHECK(ncclCudaFree(reg->regIpcAddrs.devPeerRmtAddrs));
-  }
-  free(reg);
-  memmove(cache->slots+slot, cache->slots+slot+1, (cache->population-slot-1)*sizeof(struct ncclReg*));
-  cache->population -= 1;
-  CUDACHECK(cudaSetDevice(saveDev));
-exit:
-  return ncclSuccess;
-}
diff --git a/src/register/coll_reg.cc b/src/register/coll_reg.cc
new file mode 100644
index 0000000000..4282dc9c8c
--- /dev/null
+++ b/src/register/coll_reg.cc
@@ -0,0 +1,446 @@
+#include "register.h"
+#include "transport.h"
+#include "enqueue.h"
+
+static ncclResult_t registerCheckP2PConnection(struct ncclComm* comm, struct ncclConnector* conn, struct ncclTopoGraph* graph, int peer, bool* needReg) {
+  // Tells the caller, through *needReg, whether buffers shared with `peer` over `conn` should be
+  // registered. If the canConnect query fails, NCCLCHECK returns that error and *needReg is not written.
+  if (conn->connected) {
+    // Connection already set up: only the P2P read/write flags ask for registration;
+    // anything else is treated as a network connection.
+    *needReg = (conn->conn.flags & (NCCL_P2P_READ | NCCL_P2P_WRITE)) != 0;
+    return ncclSuccess;
+  }
+
+  // Not connected yet: query the transport at index 0 of ncclTransports for whether this
+  // rank can connect to the peer, and register exactly when it can.
+  // NOTE(review): index 0 is presumably the P2P transport — confirm against ncclTransports.
+  int reachable = 0;
+  struct ncclPeerInfo* remote = &comm->peerInfo[peer];
+  struct ncclPeerInfo* self = &comm->peerInfo[comm->rank];
+  NCCLCHECK(ncclTransports[0]->canConnect(&reachable, comm, graph, self, remote));
+
+  *needReg = (reachable != 0);
+  return ncclSuccess;
+}
+
+// Attempts buffer registration for a collective that uses NCCL_ALGO_NVLS or NCCL_ALGO_NVLS_TREE.
+// Graph registration is tried first (persistent plans only), then local registration; for multi-node
+// NVLS the receive buffer is additionally registered with CollNet. Helper return codes are not
+// inspected: when nothing registers, info->regBufType stays NCCL_REGULAR_BUFFER. Returns ncclSuccess.
+ncclResult_t ncclRegisterCollNvlsBuffers(
+    struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS],
+    struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, bool* regNeedConnect) {
+  ncclResult_t result = ncclSuccess;
+
+  info->regBufType = NCCL_REGULAR_BUFFER;
+  *regNeedConnect = true;
+  if (!(ncclParamLocalRegister() || (comm->planner.persistent && ncclParamGraphRegister()))) goto exit;
+#if CUDART_VERSION >= 11030
+  if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) {
+    if (!comm->nvlsRegSupport || info->opDev.op == ncclDevPreMulSum) goto exit;
+    int nvlsReged = 0;
+    int collnetReged = 0;
+    const void *sendbuff = info->sendbuff;
+    void *recvbuff = info->recvbuff;
+    void *recvHandle = NULL, *sendHandle = NULL;
+    if (info->func == ncclFuncAllGather) sendbuff = NULL;
+    if (info->func == ncclFuncReduceScatter) recvbuff = NULL;
+    size_t elementSize = ncclTypeSize(info->datatype);
+    size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count);
+    size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count);
+
+    /* first try graph registration. */
+    if (comm->planner.persistent && ncclParamGraphRegister()) {
+      ncclNvlsGraphRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, &nvlsReged, outRegBufSend, outRegBufRecv, cleanupQueue, &info->nCleanupQueueElts);
+    }
+
+    if (nvlsReged == 0 && ncclParamLocalRegister()) {
+      ncclNvlsLocalRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, &nvlsReged, outRegBufSend, outRegBufRecv);
+    }
+
+    if (nvlsReged && comm->nNodes > 1 && info->algorithm == NCCL_ALGO_NVLS) {
+      if (comm->planner.persistent && ncclParamGraphRegister()) {
+        ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle, cleanupQueue, &info->nCleanupQueueElts);
+        if (collnetReged) ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle, cleanupQueue, &info->nCleanupQueueElts);
+      }
+
+      if (collnetReged == 0 && ncclParamLocalRegister()) {
+        ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle);
+        if (collnetReged) ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle);
+      }
+    }
+
+    if (nvlsReged) {
+      *regNeedConnect = false;
+      /* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to
+       * saturate bandwidth. */
+      if (comm->nNodes == 1) {
+        if (info->func == ncclFuncReduceScatter)
+          info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5));
+        else
+          info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4));
+      } else {
+        info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 6));
+      }
+      info->regBufType |= NCCL_NVLS_REG_BUFFER;
+    }
+
+    if (collnetReged) {
+      info->regBufType |= NCCL_NET_REG_BUFFER;
+      info->sendMhandle = sendHandle;
+      info->recvMhandle = recvHandle;
+    }
+  }
+#endif
+exit:  // kept outside the #if: the first goto exit above is compiled for every CUDART_VERSION
+  return result;
+}
+
+// Attempts user-buffer registration for a collective task, chosen by info->algorithm / info->protocol:
+// the NVLS path, or for NCCL_PROTO_SIMPLE the COLLNET_DIRECT, RING and TREE/COLLNET_CHAIN paths (IPC and
+// network registration). info->regBufType collects what got registered; *regNeedConnect is cleared only
+// when NVLS registration succeeded. Errors come only from the NCCLCHECK'd lookups and allocations.
+ncclResult_t ncclRegisterCollBuffers(
+    struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS],
+    struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, bool* regNeedConnect) {
+  ncclResult_t result = ncclSuccess;
+
+  info->regBufType = NCCL_REGULAR_BUFFER;
+  *regNeedConnect = true;
+  if (!(ncclParamLocalRegister() || (comm->planner.persistent && ncclParamGraphRegister()))) goto exit;
+#if CUDART_VERSION >= 11030
+  if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) {
+    /* this part of nvls reg code is temporarily not used and obsolete. */
+    if (!comm->nvlsRegSupport || info->opDev.op == ncclDevPreMulSum) goto exit;
+    int nvlsReged = 0;
+    int collnetReged = 0;
+    const void *sendbuff = info->sendbuff;
+    void *recvbuff = info->recvbuff;
+    void *recvHandle = NULL, *sendHandle = NULL;
+    if (info->func == ncclFuncAllGather) sendbuff = NULL;
+    if (info->func == ncclFuncReduceScatter) recvbuff = NULL;
+    size_t elementSize = ncclTypeSize(info->datatype);
+    size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count);
+    size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count);
+
+    /* first try local registration. */
+    if (ncclParamLocalRegister()) {
+      ncclNvlsLocalRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, &nvlsReged, outRegBufSend, outRegBufRecv);
+    }
+
+    if (nvlsReged == 0 && comm->planner.persistent && ncclParamGraphRegister()) {
+      ncclNvlsGraphRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, &nvlsReged, outRegBufSend, outRegBufRecv, cleanupQueue, &info->nCleanupQueueElts);
+    }
+
+    if (comm->nNodes > 1 && info->algorithm == NCCL_ALGO_NVLS) {
+      if (ncclParamLocalRegister()) {
+        ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle);
+        if (collnetReged) ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle);
+      }
+
+      if (collnetReged == 0 && comm->planner.persistent && ncclParamGraphRegister()) {
+        ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle, cleanupQueue, &info->nCleanupQueueElts);
+        if (collnetReged) ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle, cleanupQueue, &info->nCleanupQueueElts);
+      }
+    }
+
+    if (nvlsReged) {
+      *regNeedConnect = false;
+      /* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to
+       * saturate bandwidth. */
+      if (comm->nNodes == 1) {
+        if (info->func == ncclFuncReduceScatter)
+          info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5));
+        else
+          info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4));
+      } else {
+        info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 6));
+      }
+      info->regBufType |= NCCL_NVLS_REG_BUFFER;
+    }
+
+    if (collnetReged) {
+      info->regBufType |= NCCL_NET_REG_BUFFER;
+      info->sendMhandle = sendHandle;
+      info->recvMhandle = recvHandle;
+    }
+  } else if (info->protocol == NCCL_PROTO_SIMPLE) {
+    // IPC buffer registration
+    if (info->func == ncclFuncReduceScatter && info->algorithm != NCCL_ALGO_COLLNET_DIRECT) goto exit;
+    if (info->algorithm == NCCL_ALGO_RING && ((info->func == ncclFuncAllReduce && info->sendbuff == info->recvbuff) || info->func == ncclFuncReduce)) goto exit;
+    if ((info->algorithm == NCCL_ALGO_TREE || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && info->sendbuff == info->recvbuff) goto exit;
+    if (info->func == ncclFuncAllGather && info->algorithm == NCCL_ALGO_PAT) goto exit;
+
+    int peerRanks[NCCL_MAX_LOCAL_RANKS];
+    int nPeers = 0;
+    size_t elementSize = ncclTypeSize(info->datatype);
+    size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count);
+    size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count);
+    int regBufFlag = 0;
+    memset(peerRanks, 0xff, sizeof(int) * NCCL_MAX_LOCAL_RANKS);
+
+    if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) {
+      struct ncclChannel* channel = comm->channels;
+      int ipcRegFlag = 0, netSendRegFlag = 0, netRecvRegFlag = 0;
+      void *sendHandle = NULL, *recvHandle = NULL;  // copied into info below even if a helper never writes them
+      if (info->func != ncclFuncReduceScatter && comm->intraNodeP2pSupport) {
+        for (int r = 0; r < NCCL_MAX_DIRECT_ARITY; ++r) {
+          for (int down = 0; down < 2; ++down) {
+            int peer = down ? channel->collnetDirect.down[r] : channel->collnetDirect.up[r];
+            if (peer != -1) {
+              struct ncclConnector* peerConn = &channel->peers[peer]->recv[0];
+              bool needReg = false;
+
+              NCCLCHECK(registerCheckP2PConnection(comm, peerConn, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], peer, &needReg));
+              if (needReg) {
+                bool found = false;
+                for (int p = 0; p < nPeers; ++p) {
+                  if (peerRanks[p] == peer) {
+                    found = true;
+                    break;
+                  }
+                }
+                if (!found) peerRanks[nPeers++] = peer;
+              }
+            }
+          }
+        }
+
+        if (nPeers > 0) {
+          if (comm->planner.persistent && ncclParamGraphRegister()) {
+            ncclIpcGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &ipcRegFlag, &info->sendbuffOffset, &info->sendbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
+            if (ipcRegFlag) ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &ipcRegFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
+          }
+          if (!ipcRegFlag && ncclParamLocalRegister()) {
+            ncclIpcLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &ipcRegFlag, &info->sendbuffOffset, &info->sendbuffRmtAddrs);
+            if (ipcRegFlag) ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &ipcRegFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs);
+          }
+        }
+        if (ipcRegFlag) {
+          info->regBufType |= NCCL_IPC_REG_BUFFER;
+        }
+      }
+
+      // register collnet buffer
+      if (info->opDev.op != ncclDevPreMulSum && info->opDev.op != ncclDevSumPostDiv && !(info->func == ncclFuncAllReduce && !comm->isOneRPN)) {
+        if (comm->planner.persistent && ncclParamGraphRegister()) {
+          ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &netSendRegFlag, &sendHandle, cleanupQueue, &info->nCleanupQueueElts);
+          info->sendMhandle = sendHandle;
+          if (netSendRegFlag) {
+            ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &netRecvRegFlag, &recvHandle, cleanupQueue, &info->nCleanupQueueElts);
+            info->recvMhandle = recvHandle;
+          }
+        }
+
+        if ((netSendRegFlag == 0 || netRecvRegFlag == 0) && ncclParamLocalRegister()) {
+          if (!netSendRegFlag) {
+            ncclCollnetLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &netSendRegFlag, &sendHandle);
+            info->sendMhandle = sendHandle;
+          }
+          if (netSendRegFlag && !netRecvRegFlag) {
+            ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &netRecvRegFlag, &recvHandle);
+            info->recvMhandle = recvHandle;
+          }
+        }
+      }
+
+      if (netSendRegFlag && netRecvRegFlag) {
+        if (comm->isOneRPN) info->nMaxChannels = 1;
+        info->regBufType |= NCCL_NET_REG_BUFFER;
+      }
+    } else if (info->algorithm == NCCL_ALGO_RING) {
+      struct ncclReg* recvRegRecord = NULL;
+      struct ncclReg* sendRegRecord = NULL;
+      int sendNetPeers = comm->nChannels;
+      int recvNetPeers = comm->nChannels;
+      struct ncclConnector** sendNetConns = NULL;
+      struct ncclConnector** recvNetConns = NULL;
+      void** sendNetHandles = NULL;
+      void** recvNetHandles = NULL;
+      void** srecvNetHandles = NULL;
+      bool hasRecvNetPeer = false;
+      bool hasSendNetPeer = false;
+
+      NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord));
+      if (recvRegRecord == NULL && !(comm->planner.persistent && ncclParamGraphRegister())) goto exit;
+      NCCLCHECK(ncclRegFind(comm, info->sendbuff, sendbuffSize, &sendRegRecord));
+      if (sendRegRecord == NULL && !(comm->planner.persistent && ncclParamGraphRegister())) goto exit;
+      NCCLCHECK(ncclCalloc(&sendNetConns, comm->nChannels));
+      NCCLCHECK(ncclCalloc(&sendNetHandles, comm->nChannels));
+      NCCLCHECK(ncclCalloc(&recvNetConns, comm->nChannels));
+      NCCLCHECK(ncclCalloc(&recvNetHandles, comm->nChannels));
+      NCCLCHECK(ncclCalloc(&srecvNetHandles, comm->nChannels));
+
+      for (int c = 0; c < comm->nChannels; ++c) {
+        struct ncclChannel* channel = comm->channels + c;
+        for (int r = 0; r < 2; ++r) {
+          int peer;
+          struct ncclConnector* peerConn;
+          if (r == 0) {
+            peer = channel->ring.prev;
+            peerConn = &channel->peers[peer]->recv[0];
+            if (peerConn->conn.flags & NCCL_DIRECT_NIC) {
+              recvNetConns[c] = peerConn;
+              hasRecvNetPeer = true;
+            }
+          } else {
+            peer = channel->ring.next;
+            peerConn = &channel->peers[peer]->send[0];
+            if (peerConn->conn.flags & NCCL_DIRECT_NIC) {
+              sendNetConns[c] = peerConn;
+              hasSendNetPeer = true;
+            }
+          }
+          if (peerConn->conn.flags & (NCCL_P2P_READ | NCCL_P2P_WRITE)) {
+            bool found = false;
+            for (int p = 0; p < nPeers; ++p) {
+              if (peerRanks[p] == peer) {
+                found = true;
+                break;
+              }
+            }
+            if (!found) peerRanks[nPeers++] = peer;
+          }
+        }
+      }
+      if (nPeers > 0 && comm->intraNodeP2pSupport) {
+        if (comm->planner.persistent && ncclParamGraphRegister()) {
+          ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
+        }
+        if (!regBufFlag && ncclParamLocalRegister()) {
+          ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs);
+        }
+      }
+      if (regBufFlag) {
+        info->regBufType = NCCL_IPC_REG_BUFFER;
+      }
+
+      // start net registration
+      regBufFlag = 0;
+      if (!comm->useNetPXN && comm->useGdr && comm->netDeviceType != NCCL_NET_DEVICE_UNPACK) {
+        if (comm->planner.persistent && ncclParamGraphRegister()) {
+          if (hasSendNetPeer) {
+            ncclNetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, sendNetConns, sendNetPeers, &regBufFlag, sendNetHandles, cleanupQueue, &info->nCleanupQueueElts);
+            if (regBufFlag)
+              ncclNetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, sendNetConns, sendNetPeers, &regBufFlag, srecvNetHandles, cleanupQueue, &info->nCleanupQueueElts);
+          }
+          if ((regBufFlag || !hasSendNetPeer) && hasRecvNetPeer)
+            ncclNetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, recvNetConns, recvNetPeers, &regBufFlag, recvNetHandles, cleanupQueue, &info->nCleanupQueueElts);
+        }
+        if (!regBufFlag && ncclParamLocalRegister()) {
+          if (hasSendNetPeer) {
+            ncclNetLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, sendNetConns, sendNetPeers, &regBufFlag, sendNetHandles);
+            if (regBufFlag)
+              ncclNetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, sendNetConns, sendNetPeers, &regBufFlag, srecvNetHandles);
+          }
+          if ((regBufFlag || !hasSendNetPeer) && hasRecvNetPeer)
+            ncclNetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, recvNetConns, recvNetPeers, &regBufFlag, recvNetHandles);
+        }
+      }
+
+      if (regBufFlag) {
+        info->regBufType |= NCCL_NET_REG_BUFFER;
+        info->sendNetHandles = sendNetHandles;
+        info->recvNetHandles = recvNetHandles;
+        info->srecvNetHandles = srecvNetHandles;
+        if (comm->isOneRPN && (info->func == ncclFuncAllGather || info->func == ncclFuncBroadcast)) {
+          info->nMaxChannels = 1;
+        }
+      } else {
+        free(sendNetHandles);
+        free(recvNetHandles);
+        free(srecvNetHandles);
+      }
+
+      free(sendNetConns);
+      free(recvNetConns);
+    } else if (info->algorithm == NCCL_ALGO_TREE || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) {
+      struct ncclReg* recvRegRecord;
+      int netSendRegFlag = 0, netRecvRegFlag = 0;
+      void *sendHandle = NULL, *recvHandle = NULL;  // copied into info below even if a helper never writes them
+      NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord));
+      if (recvRegRecord == NULL && !(comm->planner.persistent && ncclParamGraphRegister())) goto exit;
+      if (comm->intraNodeP2pSupport) {
+        for (int c = 0; c < comm->nChannels; ++c) {
+          struct ncclChannel* channel = comm->channels + c;
+          struct ncclTree* tree = NULL;
+          int peers[NCCL_MAX_TREE_ARITY + 1];
+
+          if (info->algorithm == NCCL_ALGO_TREE)
+            tree = &channel->tree;
+          else
+            tree = &channel->collnetChain;
+          for (int p = 0; p < NCCL_MAX_TREE_ARITY; ++p) peers[p] = tree->down[p];
+          peers[NCCL_MAX_TREE_ARITY] = tree->up;
+          for (int p = 0; p < NCCL_MAX_TREE_ARITY + 1; ++p) {
+            int peer = peers[p];
+            bool peerNeedReg = false;
+            struct ncclConnector* recvConn = NULL;
+            // P2P transport
+            if (peer == -1 || peer == comm->nRanks) continue;
+            recvConn = &channel->peers[peer]->recv[0];
+            NCCLCHECK(registerCheckP2PConnection(comm, recvConn, &comm->graphs[info->algorithm], peer, &peerNeedReg));
+
+            if (peerNeedReg) {
+              bool found = false;
+              for (int pindex = 0; pindex < nPeers; ++pindex) {
+                if (peerRanks[pindex] == peer) {
+                  found = true;
+                  break;
+                }
+              }
+              if (!found) peerRanks[nPeers++] = peer;
+            }
+          }
+        }
+        if (nPeers > 0) {
+          if (comm->planner.persistent && ncclParamGraphRegister()) {
+            ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
+          }
+          if (!regBufFlag && ncclParamLocalRegister()) {
+            ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs);
+          }
+        }
+        if (regBufFlag) {
+          info->regBufType = NCCL_IPC_REG_BUFFER;
+        }
+      }
+
+      // register collnet chain 1RPN buffer
+      if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN && info->opDev.op != ncclDevPreMulSum && info->opDev.op != ncclDevSumPostDiv && comm->isOneRPN) {
+        if (comm->planner.persistent && ncclParamGraphRegister()) {
+          ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &netSendRegFlag, &sendHandle, cleanupQueue, &info->nCleanupQueueElts);
+          info->sendMhandle = sendHandle;
+          if (netSendRegFlag) {
+            ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &netRecvRegFlag, &recvHandle, cleanupQueue, &info->nCleanupQueueElts);
+            info->recvMhandle = recvHandle;
+          }
+        }
+
+        if ((netSendRegFlag == 0 || netRecvRegFlag == 0) && ncclParamLocalRegister()) {
+          if (!netSendRegFlag) {
+            ncclCollnetLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &netSendRegFlag, &sendHandle);
+            info->sendMhandle = sendHandle;
+          }
+          if (netSendRegFlag && !netRecvRegFlag) {
+            ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &netRecvRegFlag, &recvHandle);
+            info->recvMhandle = recvHandle;
+          }
+        }
+      }
+
+      if (netSendRegFlag && netRecvRegFlag) {
+        if (comm->isOneRPN) info->nMaxChannels = 1;
+        info->regBufType |= NCCL_NET_REG_BUFFER;
+      }
+    }
+
+    if (info->regBufType == NCCL_IPC_REG_BUFFER && comm->nNodes == 1 && 16 < info->nMaxChannels && info->nMaxChannels <= 24) {
+      info->nMaxChannels = 16;
+    }
+  }
+#endif
+exit:  // kept outside the #if: the first goto exit above is compiled for every CUDART_VERSION
+  return result;
+}
diff --git a/src/register/register.cc b/src/register/register.cc
new file mode 100644
index 0000000000..9e8f6eaafc
--- /dev/null
+++ b/src/register/register.cc
@@ -0,0 +1,179 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "argcheck.h" // Need some checks here since we access comm
+#include "nccl.h"
+#include "comm.h"
+#include "net.h"
+#include "register.h"
+#include "transport.h"
+
+ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg) {
+  // Searches the comm's registration cache for an entry whose page range covers [data, data+size).
+  // *reg is set to that entry, or to NULL when none is found; the result is always ncclSuccess.
+  struct ncclRegCache* cache = &comm->regCache;
+  const uintptr_t pageSize = cache->pageSize;
+  const uintptr_t base = (uintptr_t)data & -pageSize;  // start of the page holding data (assumes pageSize is a power of two)
+  const size_t nPages = ((uintptr_t)data + size - base + pageSize-1)/pageSize;
+
+  *reg = NULL;
+  for (int idx = 0; idx != cache->population; idx++) {
+    struct ncclReg* entry = cache->slots[idx];
+    if (base < entry->addr) break;  // this entry starts above base: stop scanning
+    if ((base - entry->addr)/pageSize + nPages <= entry->pages) { *reg = entry; break; }
+  }
+  return ncclSuccess;
+}
+NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1);  // NOTE(review): presumably what defines ncclParamLocalRegister(), with 1 as the default — confirm against the NCCL_PARAM macro
+
+ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid) {
+  // Reports through *isValid whether reg still holds at least one local reference
+  // (localRefs != 0). When either pointer is NULL nothing is written.
+  // The result is always ncclSuccess.
+  if (reg == NULL || isValid == NULL) return ncclSuccess;
+
+  *isValid = (reg->localRefs != 0);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclRegister(struct ncclComm* comm, void* data, size_t size, bool isGraph, void** handle) {
+  // Adds [data, data+size) to the comm's registration cache, or takes one more reference on a cached
+  // entry that already covers it. isGraph selects graphRefs over localRefs; *handle gets the entry.
+  NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm"));
+  struct ncclRegCache* cache = &comm->regCache;
+  uintptr_t pageSize = cache->pageSize;
+  uintptr_t addr = (uintptr_t)data & -pageSize;
+  size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
+  if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(data, comm, "buff", "ncclCommRegister"));
+  INFO(NCCL_REG, "register comm %p buffer %p size %zi", comm, data, size);
+
+  for (int slot=0; /*true*/; slot++) {
+    if ((slot == cache->population) || (addr < cache->slots[slot]->addr)) {  // no covering entry: insert here, keeping slots ordered by addr
+      if (cache->population == cache->capacity) { // must grow cache
+        auto newCapacity = cache->capacity < 32 ? 32 : 2*cache->capacity;
+        NCCLCHECK(ncclRealloc(&cache->slots, cache->population, newCapacity));
+        cache->capacity = newCapacity;  // committed only once the array has really grown
+      }
+      struct ncclReg* regSlot = NULL;
+      NCCLCHECK(ncclCalloc(&regSlot, 1));  // allocate before shifting so a failure leaves the cached entries untouched
+      memmove(cache->slots+slot+1, cache->slots+slot, (cache->population-slot)*sizeof(struct ncclReg*));
+      cache->slots[slot] = regSlot;
+      regSlot->addr = addr;
+      regSlot->pages = pages;
+      if (isGraph) regSlot->graphRefs = 1;
+      else regSlot->localRefs = 1;
+      cache->population += 1;
+      *handle = regSlot;
+      return ncclSuccess;
+    }
+    if ((addr-cache->slots[slot]->addr)/pageSize+pages <= cache->slots[slot]->pages) {  // existing entry covers the range
+      if (isGraph) cache->slots[slot]->graphRefs++;
+      else cache->slots[slot]->localRefs++;
+      *handle = cache->slots[slot];
+      return ncclSuccess;
+    }
+  }
+}
+
+static ncclResult_t regCleanup(struct ncclComm* comm, struct ncclReg* reg) {
+  // Releases whatever reg->state says was registered (NET, NVLS, COLLNET, IPC). A failing
+  // deregistration is only reported with WARN; the one error returned is a failing ncclCudaFree.
+  if (reg->state & NET_REG_COMPLETE) {
+    for (struct ncclRegNetHandles* node = reg->netHandleHead; node != NULL; ) {
+      if (ncclNetDeregBuffer(comm, node->proxyConn, node->handle) != ncclSuccess) {
+        WARN("rank %d deregister NET buffer handle %p proxy rank %d failed\n", comm->rank, node->handle, node->proxyConn->rank);
+      }
+      struct ncclRegNetHandles* following = node->next;  // grab the link before the node is freed
+      free(node);
+      node = following;
+    }
+  }
+  if (reg->state & NVLS_REG_COMPLETE) {
+    if (ncclNvlsDeregBuffer(comm, &reg->mcHandle, reg->regAddr, reg->dev, reg->regSize) != ncclSuccess) {
+      WARN("rank %d deregister NVLS buffer %p dev %d size %ld failed", comm->rank, (void*)reg->regAddr, reg->dev, reg->regSize);
+    }
+    reg->regAddr = (CUdeviceptr)NULL;
+  }
+  if (reg->state & COLLNET_REG_COMPLETE) {
+    if (ncclCollnetDeregBuffer(comm, reg->collnetProxyconn, reg->collnetHandle) != ncclSuccess) {
+      WARN("rank %d deregister COLLNET buffer handle %p proxy rank %d failed", comm->rank, reg->collnetHandle, reg->collnetProxyconn->rank);
+    }
+  }
+  if (reg->state & IPC_REG_COMPLETE) {
+    for (int peerIdx = 0; peerIdx < NCCL_MAX_LOCAL_RANKS; ++peerIdx) {
+      if (reg->ipcInfos[peerIdx] == NULL) continue;
+      if (ncclIpcDeregBuffer(comm, reg->ipcInfos[peerIdx]) != ncclSuccess) {
+        WARN("rank %d deregister IPC buffer %p peerRank %d failed", comm->rank, reg->ipcInfos[peerIdx]->baseAddr, reg->ipcInfos[peerIdx]->peerRank);
+      }
+      free(reg->ipcInfos[peerIdx]);
+    }
+    free(reg->regIpcAddrs.hostPeerRmtAddrs);  // free(NULL) is a no-op
+    if (reg->regIpcAddrs.devPeerRmtAddrs) NCCLCHECK(ncclCudaFree(reg->regIpcAddrs.devPeerRmtAddrs));
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclRegCleanup(struct ncclComm* comm) {
+  // Cleans up and frees every cached registration, then the slot array; NCCLCHECK returns early on the first regCleanup error.
+  struct ncclReg** slots = comm->regCache.slots;
+  for (int idx = 0; idx < comm->regCache.population; idx++) {
+    INFO(NCCL_INIT, "Cleanup buffer %p pages %lx", (void*)slots[idx]->addr, slots[idx]->pages);
+    NCCLCHECK(regCleanup(comm, slots[idx]));
+    free(slots[idx]);
+  }
+  free(slots);
+  return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle);
+ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) {
+  // With local registration switched off nothing is recorded and the caller gets a NULL handle.
+  if (!ncclParamLocalRegister()) { *handle = NULL; return ncclSuccess; }
+  // isGraph=false: the reference is counted in the entry's localRefs.
+  NCCLCHECK(ncclRegister(comm, buff, size, false, handle));
+  return ncclSuccess;
+}
+
+ncclResult_t ncclCommGraphRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) {  // Graph flavor of buffer registration; unlike ncclCommRegister it does not consult ncclParamLocalRegister().
+  NCCLCHECK(ncclRegister(comm, buff, size, true, handle));  // isGraph=true: the reference is counted in graphRefs
+  return ncclSuccess;
+}
+
+static ncclResult_t commDeregister(struct ncclComm *comm, bool isGraph, struct ncclReg* reg) {
+  // Drops one graph (isGraph) or local reference on reg; when both counters reach zero the entry is
+  // cleaned up, freed and removed from the cache. NULL reg is a no-op; an unknown reg is ncclInvalidUsage.
+  NCCLCHECK(CommCheck(comm, "ncclCommDeregister", "comm"));
+  if (reg == NULL) return ncclSuccess;
+  struct ncclRegCache* cache = &comm->regCache;
+  int slot = 0;
+  while (slot < cache->population && cache->slots[slot] != reg) slot++;
+  if (slot == cache->population) {
+    WARN("Deregister: Could not find handle");
+    return ncclInvalidUsage;
+  }
+  if (isGraph) --reg->graphRefs; else --reg->localRefs;
+  if (reg->localRefs || reg->graphRefs) return ncclSuccess;
+  int saveDev;
+  CUDACHECK(cudaGetDevice(&saveDev));
+  CUDACHECK(cudaSetDevice(comm->cudaDev));  // switched only for the cleanup, so the early returns above never leave the caller on another device
+  ncclResult_t ret = regCleanup(comm, reg);
+  free(reg);  // removed even if cleanup failed: regCleanup already released its handles and a second pass would free them again
+  memmove(cache->slots + slot, cache->slots + slot + 1, (cache->population - slot - 1) * sizeof(struct ncclReg*));
+  cache->population -= 1;
+  CUDACHECK(cudaSetDevice(saveDev));
+  return ret;
+}
+
+NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle);
+ncclResult_t ncclCommDeregister(const ncclComm_t comm, void *handle) {  // Releases one local reference on a registration handle.
+  NCCLCHECK(commDeregister(comm, false, (struct ncclReg*)handle));  // isGraph=false selects the localRefs counter
+  return ncclSuccess;
+}
+
+ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *handle) {  // Releases one graph reference on a registration entry.
+  NCCLCHECK(commDeregister(comm, true, handle));  // isGraph=true selects the graphRefs counter
+  return ncclSuccess;
+}
diff --git a/src/register/sendrecv_reg.cc b/src/register/sendrecv_reg.cc
new file mode 100644
index 0000000000..f82fbd7142
--- /dev/null
+++ b/src/register/sendrecv_reg.cc
@@ -0,0 +1,35 @@
+#include "register.h"
+#include "transport.h"
+
+ncclResult_t ncclRegisterP2pNetBuffer(struct ncclComm* comm, void* userbuff, size_t size, struct ncclConnector* conn, int* regFlag, void** handle, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue) {
+  // Registers userbuff for the single connection conn: graph registration when the plan is
+  // persistent and graph registration is enabled, then local registration if still unregistered.
+  // Helper return codes are not inspected; *regFlag is reset first, so it is non-zero only if a helper set it.
+  *regFlag = 0;
+  if (comm->netDeviceType == NCCL_NET_DEVICE_UNPACK) return ncclSuccess;
+
+  const bool graphMode = comm->planner.persistent && ncclParamGraphRegister();
+  if (graphMode) ncclNetGraphRegisterBuffer(comm, userbuff, size, &conn, 1, regFlag, handle, cleanupQueue, NULL);
+  if (*regFlag == 0 && ncclParamLocalRegister()) {
+    ncclNetLocalRegisterBuffer(comm, userbuff, size, &conn, 1, regFlag, handle);
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclRegisterP2pIpcBuffer(struct ncclComm* comm, void* userbuff, size_t size, int peerRank, int* regFlag, void** regAddr, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue) {
+  // Registers userbuff for IPC access with peerRank (NCCL_IPC_SENDRECV): graph registration first when
+  // applicable, local registration as fallback. *regAddr is written only when *regFlag ends up non-zero.
+  uintptr_t* rmtAddrs = NULL;
+  uintptr_t rmtOffset = 0;
+
+  *regFlag = 0;
+  const bool graphMode = comm->planner.persistent && ncclParamGraphRegister();
+  if (graphMode) ncclIpcGraphRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &rmtOffset, &rmtAddrs, reinterpret_cast<void*>(cleanupQueue), NULL);
+  if (*regFlag == 0 && ncclParamLocalRegister()) {
+    ncclIpcLocalRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &rmtOffset, &rmtAddrs);
+  }
+
+  // The result is the returned address pointer's own value advanced by the byte offset.
+  if (*regFlag != 0) *regAddr = (void*)((uintptr_t)rmtAddrs + rmtOffset);
+  return ncclSuccess;
+}
diff --git a/src/transport.cc b/src/transport.cc
index eeee7a24bf..5629ce7a28 100644
--- a/src/transport.cc
+++ b/src/transport.cc
@@ -94,13 +94,13 @@ ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2p
   }
   *intraNodeP2pSupport = supportFlag;
   *directMode = directFlag;
+  if (comm->rank == 0) INFO(NCCL_INIT, "Check P2P Type intraNodeP2pSupport %d directMode %d", supportFlag, directFlag);
   return ncclSuccess;
 }
 
-ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) {
+ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex) {
   // Stream used during transport setup; need for P2P pre-connect + CUDA Graph
   ncclResult_t ret = ncclSuccess;
-  int highestType = TRANSPORT_UNDEFINED;  // track highest transport type
   struct ncclConnect** data; // Store intermediate send/recvData structs for connect
   struct ncclConnect** recvData = NULL; // Points to entries inside data for given recv connection within a channel
   struct ncclConnect** sendData = NULL; // Points to entries inside data for given send connection within a channel
@@ -131,7 +131,10 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
     // The next M entries contain sendData, connection information for send connections
     // It's not guaranteed that each entry of data has the same number of total or send/recv specific connections
     int p = i-(done+1);
-    if (recvMask || sendMask) NCCLCHECKGOTO(ncclCalloc(data+p, 2*MAXCHANNELS), ret, fail);
+    if (recvMask || sendMask) {
+      if (data[p] == NULL) NCCLCHECKGOTO(ncclCalloc(data + p, 2 * MAXCHANNELS), ret, fail);
+      else memset(data[p], 0, 2 * MAXCHANNELS * sizeof(struct ncclConnect));
+    }
     recvData[p] = data[p];
     int sendChannels = 0, recvChannels = 0;
     int type;
@@ -139,7 +142,6 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
     for (int c=0; c<MAXCHANNELS; c++) {
       if (recvMask & (1UL<<c)) {
         NCCLCHECKGOTO(selectTransport<0>(comm, graph, recvData[p]+recvChannels++, c, recvPeer, connIndex, &type), ret, fail);
-        if (type > highestType) highestType = type;
       }
     }
     TIME_STOP(0);
@@ -148,7 +150,6 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
     for (int c=0; c<MAXCHANNELS; c++) {
       if (sendMask & (1UL<<c)) {
         NCCLCHECKGOTO(selectTransport<1>(comm, graph, sendData[p]+sendChannels++, c, sendPeer, connIndex, &type), ret, fail);
-        if (type > highestType) highestType = type;
       }
     }
     TIME_STOP(1);
@@ -222,22 +223,18 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
             }
             TIME_STOP(4);
           }
-          if (sendMask || recvMask) {
-            free(data[p]);
-            data[p] = NULL;
-          }
         }
-	if (ncclParamReportConnectProgress() && comm->rank == 0 && done > 0) {
+        if (ncclParamReportConnectProgress() && comm->rank == 0 && done > 0) {
           struct timeval now;
           gettimeofday(&now, NULL);
-          if (((now.tv_sec - timeLast.tv_sec)*1.0 + (now.tv_usec-timeLast.tv_usec)*1e-6) > 1) {
-            float elapsed = (now.tv_sec - timeStart.tv_sec)*1.0 + (now.tv_usec-timeStart.tv_usec)*1e-6;
-	    float remaining = elapsed*(comm->nRanks-done)/done;
+          if (((now.tv_sec - timeLast.tv_sec) * 1.0 + (now.tv_usec - timeLast.tv_usec) * 1e-6) > 1) {
+            float elapsed = (now.tv_sec - timeStart.tv_sec) * 1.0 + (now.tv_usec - timeStart.tv_usec) * 1e-6;
+            float remaining = elapsed * (comm->nRanks - done) / done;
             printf("%sP2p connect: %g%% Elapsed %d:%02d Remaining %d:%02d                                       ",
-                timeReported ? "\r" : "", done*100.0/comm->nRanks, ((int)elapsed)/60, ((int)elapsed)%60, ((int)remaining)/60, ((int)remaining)%60);
+              timeReported ? "\r" : "", done * 100.0 / comm->nRanks, ((int)elapsed) / 60, ((int)elapsed) % 60, ((int)remaining) / 60, ((int)remaining) % 60);
             fflush(stdout);
             timeReported = true;
-	    timeLast = now; // struct copy;
+            timeLast = now; // struct copy;
           }
         }
       }
@@ -280,7 +277,6 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
     comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0UL;
   }
 
-  if (highestTransportType != NULL) *highestTransportType = highestType;
   TIME_PRINT("P2P Setup/Connect");
 exit:
   for(int i=0; i<maxPeers; ++i){
diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc
index 7d2f298ae6..67180123f4 100644
--- a/src/transport/coll_net.cc
+++ b/src/transport/coll_net.cc
@@ -112,6 +112,7 @@ struct sendResources {
   uint64_t step;
   struct reqSlot (*reqFifo)[NCCL_STEPS];
   int collNetRank;
+  size_t maxCollBytes;
 };
 
 struct recvResources {
@@ -133,6 +134,7 @@ struct recvResources {
   uint64_t step;
   struct reqSlot reqFifo[COLLNET_MAX_GROUPS][NCCL_STEPS];
   int collNetRank;
+  size_t maxCollBytes;
 };
 
 static ncclResult_t canConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
@@ -157,7 +159,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   int proxyRank;
   int64_t netId;
   NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank));
-  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr));
+  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 1, &req.useGdr));
   send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
 
   send->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
@@ -177,10 +179,10 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   int proxyRank;
   int64_t netId;
   NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank));
-  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 0, &req.useGdr));
+  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 0, &req.useGdr));
   recv->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
   // Determine whether we need to flush the GDR buffer on recv or not
-  if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
+  if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm, req.netDev, myInfo->rank, &req.needFlush));
 
   recv->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn));
@@ -319,6 +321,13 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
   connection->collNet = req->collNet;
   /* DMA-BUF support */
   resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
+  /* Collective size limit reported by the plugin: must be in [1, NCCL_MAX_NET_SIZE_BYTES]. maxCollBytes is a size_t, hence == 0 and %zu. */
+  resources->maxCollBytes = props.maxCollBytes;
+  if ((resources->maxCollBytes == 0) || (resources->maxCollBytes > NCCL_MAX_NET_SIZE_BYTES)) {
+    WARN("sendProxySetup: collnet plugin returned invalid value for maxCollBytes %zu "
+         "[allowed range: 1 - %ld]", resources->maxCollBytes, NCCL_MAX_NET_SIZE_BYTES);
+    return ncclInternalError;
+  }
   return ncclSuccess;
 }
 
@@ -430,6 +439,12 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
   connection->collNet = req->collNet;
   /* DMA-BUF support */
   resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
+  resources->maxCollBytes = props.maxCollBytes; /* collective size limit reported by the plugin; size_t, hence == 0 and %zu below */
+  if ((resources->maxCollBytes == 0) || (resources->maxCollBytes > NCCL_MAX_NET_SIZE_BYTES)) {
+    WARN("recvProxySetup: collnet plugin returned invalid value for maxCollBytes %zu "
+         "[allowed range: 1 - %ld]", resources->maxCollBytes, NCCL_MAX_NET_SIZE_BYTES);
+    return ncclInternalError;
+  }
 
   collNetHandle_t* netHandle = (collNetHandle_t*) respBuff;
   if (respSize != sizeof(collNetHandle_t)) return ncclInternalError;
@@ -645,14 +660,14 @@ static size_t calcAlgoOffset(struct ncclProxyArgs* args, int isAllNotOne, int su
   return offset;
 }
 
-static int calcRegionOffset(
+static ssize_t calcRegionOffset(
     struct ncclProxyArgs* args, int isRecvNotSend, int sub, uint64_t step,
     int side // 0=begin, 1=end
   ) {
   struct ncclCollNetSharedRes* collNet = args->subs[0].connection->collNet;
-  int slotSize = collNet->buffSize/NCCL_STEPS;
-  int chunkSize = args->chunkSize;
-  int base = isRecvNotSend*NCCL_STEPS + (step%NCCL_STEPS);
+  ssize_t slotSize = collNet->buffSize/NCCL_STEPS;
+  ssize_t chunkSize = args->chunkSize;
+  ssize_t base = isRecvNotSend*NCCL_STEPS + (step%NCCL_STEPS);
   base *= collNet->nChannels*slotSize;
   if (args->coll == ncclFuncAllReduce) {
     return base + (sub+side)*chunkSize;
@@ -674,6 +689,165 @@ static constexpr int calcStepsPerGroup(int nGroups) {
   return NCCL_STEPS;
 }
 
+static ncclResult_t collNetRegIallreduce(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, int groupStart, ssize_t *nBytesInOut, void **request) {
+  ssize_t loopSize, winOffset, nBytes;
+  ssize_t eltSize = ncclTypeSize((ncclDataType_t)args->dtype);
+  // Registered (UB) iallreduce: the collnet network accesses the user's send and recv buffers directly,
+  // using the send/recv memory handles stored in sub.
+  // 1RPN case: post as many bytes as resources->maxCollBytes allows per iallreduce, with winOffset 0.
+  // Multi-RPN case: the transfer is pipelined, so each call only sends groupSize * chunkSize bytes (*nBytesInOut).
+  //   sub->loopOffset is the data offset into the buffer for this head rank in each loop;
+  //   winOffset is the actual offset into the send and recv buffers for this iallreduce;
+  //   loopSize is the total number of bytes sent by all channels and head ranks in each loop.
+  if (sub->isOneRPN) {
+    winOffset = 0;
+    nBytes = std::min((size_t)sub->nbytes, resources->maxCollBytes);
+    loopSize = nBytes;
+  } else {
+    winOffset = sub->loopOffset + groupStart * args->chunkSize;
+    nBytes = std::min(sub->nbytes - winOffset, *nBytesInOut);
+    loopSize = sub->loopSize;
+  }
+  // Nothing is posted when nBytes <= 0; the caller still gets nBytes back through *nBytesInOut.
+  if (nBytes > 0) {
+    NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, sub->sendbuff + winOffset, sub->recvbuff + winOffset, nBytes / eltSize, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sub->sendMhandle, sub->recvMhandle, request));
+    if (*request) {
+      // Request issued: advance both buffer pointers and shrink the remaining byte count by loopSize (the TRACE below prints the already-advanced pointers).
+      sub->nbytes -= loopSize;
+      sub->sendbuff += loopSize;
+      sub->recvbuff += loopSize;
+      TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] registered Iallreduce posted sendbuff %p recvbuff %p size %ld loopSize %ld winOffset %ld isOneRPN %d req %p", (long)sub->transmitted, sub->nsteps, groupStart, sub->sendbuff, sub->recvbuff, nBytes, loopSize, winOffset, sub->isOneRPN, *request);
+    }
+  }
+  *nBytesInOut = nBytes; // report the byte count computed for this call
+  return ncclSuccess;
+}
+
+static ncclResult_t collNetIallreduce(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytes, ssize_t sendBeg, ssize_t recvBeg, void **request) {
+  void *sendMhandle = resources->sendMhandles[NCCL_PROTO_SIMPLE];
+  void *recvMhandle = resources->recvMhandles[NCCL_PROTO_SIMPLE];
+  char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]);
+  ssize_t eltSize = ncclTypeSize((ncclDataType_t)args->dtype); // the plugin call takes an element count, not bytes
+  // Non-registered (non-UB) iallreduce: region is the intermediate buffer, and sendBeg/recvBeg are the
+  // offsets of the send and recv data inside it. The send and recv memory handles come from resources.
+  NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, region + sendBeg, region + recvBeg, nBytes / eltSize, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, request));
+  if (*request) // trace only when the call produced a request
+    TRACE(NCCL_NET, "sendProxy [%ld/%d] Iallreduce posted size %ld sendBeg %ld recvBeg %ld req %p", (long)sub->transmitted, sub->nsteps, nBytes, sendBeg, recvBeg, *request);
+  return ncclSuccess;
+}
+
+static ncclResult_t collNetRegIallgather(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytesIn, ssize_t allBeg, ssize_t recvBeg, void *recvMhandle, void **request) {
+  ncclNetSGE_v9_t recvParts;
+  ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank;
+  char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]);
+  ssize_t nBytes;
+  ssize_t winOffset;
+  void *sendbuff;
+  // Registered (UB) iallgather. The 1RPN logic is the same as for iallreduce: the network reads and
+  // writes the user buffers directly, up to resources->maxCollBytes per call.
+  // When not 1RPN, the network can still read sendbuff directly but not write recvbuff: the received
+  // data would be non-contiguous, so it lands in the intermediate buffer "region" and is copied into
+  // recvbuff afterwards. allBeg (global window offset of the recv buffer) and recvMhandle (memory
+  // handle for region) are therefore only used in the multi-RPN case.
+  if (sub->isOneRPN) {
+    nBytes = std::min((size_t)sub->nbytes, resources->maxCollBytes);
+    winOffset = sub->offset;
+    recvParts.mhandle = sub->recvMhandle;
+    recvParts.address = sub->recvbuff;
+  } else {
+    nBytes = nBytesIn;
+    winOffset = allBeg;
+    recvParts.mhandle = recvMhandle;
+    recvParts.address = region + recvBeg;
+  }
+  recvParts.size = nBytes;
+  if (winOffset / sizePerRank == args->specifics.collnetDirect.node) { // window starts in this node's sizePerRank-sized slice
+    sendbuff = sub->sendbuff + winOffset % sizePerRank;
+  } else {
+    sendbuff = sub->sendbuff;
+  }
+  NCCLCHECK(proxyState->ncclCollNet->iallgather(resources->collNetComm, sendbuff, 1, &recvParts, sizePerRank, winOffset, nBytes, sub->sendMhandle, request));
+  if (*request) {
+    if (sub->isOneRPN) { // only the 1RPN path consumes the user buffer bookkeeping held in sub
+      sub->recvbuff += nBytes;
+      sub->nbytes -= nBytes;
+      sub->offset += nBytes;
+    }
+    TRACE(NCCL_NET, "sendProxy [%ld/%d] registered Iallgather posted sizePerRank %ld winOffset %ld recvSize %ld isOneRPN %d request %p", (long)sub->transmitted, sub->nsteps, sizePerRank, winOffset, nBytes, sub->isOneRPN, *request);
+  }
+  return ncclSuccess;
+}
+
+static ncclResult_t collNetIallgather(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytes, ssize_t allBeg, ssize_t sendBeg, ssize_t recvBeg, void *sendMhandle, void *recvMhandle, void **request) {
+  ncclNetSGE_v9_t recvParts;
+  ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank;
+  char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]);
+  recvParts.mhandle = recvMhandle;
+  recvParts.address = region + recvBeg;
+  recvParts.size = nBytes;
+  // Non-registered (non-UB) iallgather: the intermediate region buffer is used for both send and recv
+  // data. sendMhandle and recvMhandle are the send and recv memory handles for region, allBeg is the
+  // global window offset of the recv buffer, and sendBeg/recvBeg are the offsets of the intermediate
+  // data inside region.
+  NCCLCHECK(proxyState->ncclCollNet->iallgather(resources->collNetComm, region + sendBeg, 1, &recvParts, sizePerRank, allBeg, nBytes, sendMhandle, request));
+  if (*request) // trace only when the call produced a request
+    TRACE(NCCL_NET, "sendProxy [%ld/%d] Iallgather posted sizePerRank %ld winOffset %ld recvSize %ld request %p", (long)sub->transmitted, sub->nsteps, sizePerRank, allBeg, nBytes, *request);
+  return ncclSuccess;
+}
+
+static ncclResult_t collNetRegIreducescatter(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytesIn, ssize_t allBeg, ssize_t sendBeg, void *sendMhandle, void **request) {
+  ncclNetSGE_v9_t sendParts;
+  ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank;
+  char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]);
+  ssize_t nBytes;
+  ssize_t winOffset; // signed, like sizePerRank and the iallgather counterpart, so the division/comparison below are not mixed-sign
+  void *recvbuff;
+  // Registered (UB) ireducescatter, mirroring iallgather with the directions swapped: when not 1RPN,
+  // the network can write recvbuff directly but cannot read sendbuff, so the data to send is staged
+  // in the intermediate buffer "region" (sendBeg/sendMhandle) and received straight into recvbuff.
+  if (sub->isOneRPN) {
+    nBytes = std::min((size_t)sub->nbytes, resources->maxCollBytes);
+    winOffset = sub->offset;
+    sendParts.mhandle = sub->sendMhandle;
+    sendParts.address = sub->sendbuff;
+  } else {
+    nBytes = nBytesIn;
+    winOffset = allBeg;
+    sendParts.mhandle = sendMhandle;
+    sendParts.address = region + sendBeg;
+  }
+  sendParts.size = nBytes;
+  if (winOffset / sizePerRank == args->specifics.collnetDirect.node) { // window starts in this node's sizePerRank-sized slice
+    recvbuff = sub->recvbuff + winOffset % sizePerRank;
+  } else {
+    recvbuff = sub->recvbuff;
+  }
+  NCCLCHECK(proxyState->ncclCollNet->ireducescatter(resources->collNetComm, 1, &sendParts, recvbuff, sizePerRank, winOffset, nBytes, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sub->recvMhandle, request));
+  if (*request) {
+    if (sub->isOneRPN) { // only the 1RPN path consumes the user buffer bookkeeping held in sub
+      sub->sendbuff += nBytes;
+      sub->nbytes -= nBytes;
+      sub->offset += nBytes;
+    }
+    TRACE(NCCL_NET, "sendProxy [%ld/%d] registered Ireducescatter posted sizePerRank %ld winOffset %ld sendSize %ld isOneRPN %d request %p", (long)sub->transmitted, sub->nsteps, sizePerRank, winOffset, nBytes, sub->isOneRPN, *request);
+  }
+  return ncclSuccess;
+}
+
+static ncclResult_t collNetIreducescatter(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytes, ssize_t allBeg, ssize_t sendBeg, ssize_t recvBeg, void *sendMhandle, void *recvMhandle, void **request) {
+  ncclNetSGE_v9_t sendParts;
+  ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank;
+  char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]);
+  sendParts.mhandle = sendMhandle;
+  sendParts.address = region + sendBeg;
+  sendParts.size = nBytes;
+  // Non-registered (non-UB) ireducescatter: same as the non-UB iallgather but in the reverse direction (region + sendBeg is the source, region + recvBeg the destination).
+  NCCLCHECK(proxyState->ncclCollNet->ireducescatter(resources->collNetComm, 1, &sendParts, region + recvBeg, sizePerRank, allBeg, nBytes, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, recvMhandle, request));
+  if (*request) // trace only when the call produced a request
+    TRACE(NCCL_NET, "sendProxy [%ld/%d] Ireducescatter posted sizePerRank %ld winOffset %ld sendSize %ld request %p", (long)sub->transmitted, sub->nsteps, sizePerRank, allBeg, nBytes, *request);
+  return ncclSuccess;
+}
+
 static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
   if (args->state == ncclProxyOpReady) {
     for (int s=0; s<args->nsubs; s++) {
@@ -683,6 +857,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
       sub->base = ROUNDUP(resources->step, args->chunkSteps);
       sub->posted = sub->received = sub->transmitted = sub->done = 0;
       resources->step = sub->base + sub->nsteps;
+      // Adjust nsteps for registered buffers, as the device signals a single step
+      if (sub->reg && sub->isOneRPN) sub->nsteps = DIVUP((size_t)sub->nbytes, resources->maxCollBytes);
     }
     args->state = ncclProxyOpProgress;
   }
@@ -695,28 +871,30 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
       struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources);
       void* sendMhandle = resources->sendMhandles[p];
       void* recvMhandle = resources->recvMhandles[p];
-      char* region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[p]);
       auto reqFifo = resources->reqFifo;
       int group = s/COLLNET_GROUP_NSUBS;
       int groupStart = s - (s%COLLNET_GROUP_NSUBS);
 
       if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) {
         int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
-        if (sub->reg == 0) {
+        if (sub->reg == 0 || (!sub->isOneRPN && args->coll == ncclFuncReduceScatter)) {
           resources->recvMem->connFifo[buffSlot].offset = calcRegionOffset(args, 0, s, sub->posted, 0);
           __sync_synchronize();
         }
         volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
-        TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] posted offset %d @ %p signal %ld->%ld", long(sub->posted), group, buffSlot, resources->recvMem->connFifo[buffSlot].offset, &resources->recvMem->connFifo[buffSlot].offset, long(*sendHead), long(sub->base + sub->posted + args->sliceSteps - NCCL_STEPS));
+        TRACE(NCCL_NET, "sendProxy [%ld/%d/%d/%d] posted offset %d @ %p signal %ld->%ld", long(sub->posted), group, buffSlot, sub->nsteps, resources->recvMem->connFifo[buffSlot].offset, &resources->recvMem->connFifo[buffSlot].offset, long(*sendHead), long(sub->base + sub->posted + args->sliceSteps - NCCL_STEPS));
         sub->posted += args->sliceSteps;
-        *sendHead = sub->base + sub->posted - NCCL_STEPS;
+        // Only post one credit for registered buffer
+        if (sub->reg == 0 || !sub->isOneRPN || sub->posted == args->sliceSteps) *sendHead = sub->base + sub->posted - NCCL_STEPS;
         if (resources->gdcSync) wc_store_fence(); // Flush out WC write
       }
       if (sub->received < sub->posted && sub->received < sub->done + calcStepsPerGroup(nGroups)) {
         int buffSlot = (sub->base+sub->received)%NCCL_STEPS;
         volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
         volatile uint64_t* recvTail = &resources->recvMem->tail;
-        if ((connFifo[buffSlot].size != -1 || sub->reg) && ((*recvTail > (sub->base+sub->received)))) {
+        //device progresses tail by only 1 for registered buffers
+        uint64_t tail = sub->base + (sub->reg && sub->isOneRPN ? 0 : sub->received);
+        if ((connFifo[buffSlot].size != -1 || sub->reg) && (*recvTail > tail)) {
           if (args->coll != ncclFuncAllReduce && sub->reg == 0) {
             int sendBeg = calcRegionOffset(args, 0, s, sub->received, 0);
             int sendEnd = calcRegionOffset(args, 0, s, sub->received, 1);
@@ -738,110 +916,42 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
           int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
           if (!reqFifo[group][buffSlot].turnIsSendNotRecv) continue;
 
-          ssize_t sizePerRank = 0;
-          size_t allBeg = calcAlgoOffset(args, 1, groupStart, sub->transmitted);
-          size_t allEnd = calcAlgoOffset(args, 1, s+1, sub->transmitted);
-          int sendBeg = calcRegionOffset(args, 0, groupStart, sub->transmitted, 0);
-          int sendEnd = calcRegionOffset(args, 0, s, sub->transmitted, 1);
-          int recvBeg = calcRegionOffset(args, 1, groupStart, sub->transmitted, 0);
-          int recvEnd = calcRegionOffset(args, 1, s, sub->transmitted, 1);
+          ssize_t allBeg = calcAlgoOffset(args, 1, groupStart, sub->transmitted);
+          ssize_t allEnd = calcAlgoOffset(args, 1, s+1, sub->transmitted);
+          ssize_t sendBeg = calcRegionOffset(args, 0, groupStart, sub->transmitted, 0);
+          ssize_t sendEnd = calcRegionOffset(args, 0, s, sub->transmitted, 1);
+          ssize_t recvBeg = calcRegionOffset(args, 1, groupStart, sub->transmitted, 0);
+          ssize_t recvEnd = calcRegionOffset(args, 1, s, sub->transmitted, 1);
           reqFifo[group][buffSlot].size = recvEnd - recvBeg;
-          size_t eltSize = ncclTypeSize((ncclDataType_t)args->dtype);
 
-          if (sendBeg==sendEnd && recvBeg==recvEnd && sub->reg == 0) {
+          if (sendBeg==sendEnd && recvBeg==recvEnd) {
             sub->requests[buffSlot] = nullptr; // trivally finished request
           } else {
+            ssize_t nBytes = 0;
             if (args->coll == ncclFuncAllReduce) {
+              nBytes = sendEnd - sendBeg;
               if (sub->reg) {
-                size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE);
-                int count = (int)(nBytes / eltSize);
-                NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, sub->sendbuff, sub->recvbuff, count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sub->sendMhandle, sub->recvMhandle, sub->requests + buffSlot));
-                if (sub->requests[buffSlot]) {
-                  sub->nbytes -= nBytes;
-                  sub->sendbuff += nBytes;
-                  sub->recvbuff += nBytes;
-                }
+                NCCLCHECK(collNetRegIallreduce(proxyState, resources, args, sub, groupStart, &nBytes, &sub->requests[buffSlot]));
               } else {
-                int count = (sendEnd - sendBeg) / eltSize;
-                NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, region + sendBeg, region + recvBeg, count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests + buffSlot));
+                NCCLCHECK(collNetIallreduce(proxyState, resources, args, sub, nBytes, sendBeg, recvBeg, &sub->requests[buffSlot]));
               }
-            } else {
-              sizePerRank = args->specifics.collnetDirect.sizePerRank;
-              if (args->coll == ncclFuncAllGather) {
-                ncclNetSGE_v8_t recvParts;
-                if (sub->reg) {
-                  size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE);
-                  void *sendbuff;
-                  recvParts.mhandle = sub->recvMhandle;
-                  recvParts.address = sub->recvbuff;
-                  recvParts.size = nBytes;
-                  if (sub->offset / sizePerRank == args->specifics.collnetDirect.node) {
-                    sendbuff = sub->sendbuff + sub->offset % sizePerRank;
-                  } else {
-                    sendbuff = sub->sendbuff;
-                  }
-                  NCCLCHECK(proxyState->ncclCollNet->iallgather(
-                    resources->collNetComm, sendbuff, 1, &recvParts,
-                    sizePerRank, sub->offset, nBytes,
-                    sub->sendMhandle, sub->requests + buffSlot));
-                  if (sub->requests[buffSlot]) {
-                    sub->recvbuff += nBytes;
-                    sub->nbytes -= nBytes;
-                    sub->offset += nBytes;
-                  }
-                } else {
-                  recvParts.mhandle = recvMhandle;
-                  recvParts.address = region + recvBeg;
-                  recvParts.size = allEnd - allBeg;
-                  NCCLCHECK(proxyState->ncclCollNet->iallgather(
-                    resources->collNetComm, region + sendBeg, 1, &recvParts,
-                    sizePerRank, allBeg, allEnd - allBeg,
-                    sendMhandle, sub->requests + buffSlot));
-                }
-              } else {
-                ncclNetSGE_v8_t sendParts;
-                if (sub->reg) {
-                  size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE);
-                  void *recvbuff;
-                  sendParts.mhandle = sub->sendMhandle;
-                  sendParts.address = sub->sendbuff;
-                  sendParts.size = nBytes;
-                  if (sub->offset / sizePerRank == args->specifics.collnetDirect.node) {
-                    recvbuff = sub->recvbuff + sub->offset % sizePerRank;
-                  } else {
-                    recvbuff = sub->recvbuff;
-                  }
-                  NCCLCHECK(proxyState->ncclCollNet->ireducescatter(
-                    resources->collNetComm, 1, &sendParts, recvbuff,
-                    sizePerRank, sub->offset, nBytes,
-                    (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp,
-                    sub->recvMhandle, sub->requests + buffSlot));
-                  if (sub->requests[buffSlot]) {
-                    sub->sendbuff += nBytes;
-                    sub->nbytes -= nBytes;
-                    sub->offset += nBytes;
-                  }
-                } else {
-                  sendParts.mhandle = sendMhandle;
-                  sendParts.address = region + sendBeg;
-                  sendParts.size = allEnd - allBeg;
-                  NCCLCHECK(proxyState->ncclCollNet->ireducescatter(
-                    resources->collNetComm, 1, &sendParts, region + recvBeg,
-                    sizePerRank, allBeg, allEnd - allBeg,
-                    (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp,
-                    recvMhandle, sub->requests + buffSlot));
-                }
-              }
-            }
-            if (sub->requests[buffSlot] == nullptr) continue;
-
-            if (args->coll == ncclFuncAllReduce) {
-              TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Iallreduce posted, size %d req %p", (long)sub->transmitted, group, buffSlot, int(sendEnd-sendBeg), sub->requests[buffSlot]);
             } else if (args->coll == ncclFuncAllGather) {
-              TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Iallgather posted sendSize=%ld recvOffset=%ld recvSize=%ld request=%p", (long)sub->transmitted, group, buffSlot, long(sizePerRank), long(allBeg), long(allEnd-allBeg), sub->requests[buffSlot]);
+              nBytes = allEnd - allBeg;
+              if (sub->reg) {
+                NCCLCHECK(collNetRegIallgather(proxyState, resources, args, sub, nBytes, allBeg, recvBeg, recvMhandle, &sub->requests[buffSlot]));
+              } else {
+                NCCLCHECK(collNetIallgather(proxyState, resources, args, sub, nBytes, allBeg, sendBeg, recvBeg, sendMhandle, recvMhandle, &sub->requests[buffSlot]));
+              }
             } else {
-              TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Ireducescatter posted sendOffset=%ld sendSize=%ld recvSize=%ld request=%p", (long)sub->transmitted, group, buffSlot, long(allBeg), long(allEnd-allBeg), long(sizePerRank), sub->requests[buffSlot]);
+              // reducescatter
+              nBytes = allEnd - allBeg;
+              if (sub->reg) {
+                NCCLCHECK(collNetRegIreducescatter(proxyState, resources, args, sub, nBytes, allBeg, sendBeg, sendMhandle, &sub->requests[buffSlot]));
+              } else {
+                NCCLCHECK(collNetIreducescatter(proxyState, resources, args, sub, nBytes, allBeg, sendBeg, recvBeg, sendMhandle, recvMhandle, &sub->requests[buffSlot]));
+              }
             }
+            if (nBytes > 0 && sub->requests[buffSlot] == nullptr) continue;
           }
         }
         sub->transmitted += args->sliceSteps;
@@ -875,6 +985,52 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
   return ncclSuccess;
 }
 
+static ncclResult_t collNetRecvFlush(struct ncclProxyState* proxyState, struct recvResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, int groupStart, ssize_t nBytesIn, ssize_t recvBeg, void **request) {
+  char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); // only used by the else branch at the bottom
+  if (sub->reg && (sub->isOneRPN || args->coll != ncclFuncAllGather)) { // flush sub->recvbuff directly; registered non-1RPN allgather takes the else branch instead
+    ssize_t nBytes, loopSize;
+    ssize_t offset = sub->offset + groupStart * args->chunkSize;
+    if (sub->isOneRPN) {
+      nBytes = std::min((size_t)sub->nbytes, resources->maxCollBytes); // 1RPN: at most resources->maxCollBytes per flush
+      loopSize = nBytes;
+    } else {
+      nBytes = std::min(sub->nbytes - sub->loopOffset, nBytesIn); // non-1RPN: capped by the caller-provided nBytesIn
+      loopSize = sub->loopSize;
+    }
+    if (nBytes > 0) { // no flush is issued (and *request is not written) when nothing remains
+      if (args->coll == ncclFuncReduceScatter) { // recompute offset/nBytes from where the window [groupStartOffset, groupEndOffset) sits relative to node
+        ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank;
+        ssize_t groupStartOffset = sub->offset + groupStart * args->chunkSize; // same expression as offset above
+        ssize_t groupEndOffset = groupStartOffset + nBytes;
+        int node = args->specifics.collnetDirect.node;
+        int startNode = groupStartOffset / sizePerRank; // index of the sizePerRank-sized slice holding the window start
+        int lastNode = groupEndOffset / sizePerRank; // index of the sizePerRank-sized slice holding the window end
+        if (startNode == node) { // window starts in slice `node`: flush from that point, at most to the end of the slice
+          offset = groupStartOffset % sizePerRank;
+          nBytes = std::min(sizePerRank - offset, nBytes);
+        } else if (startNode < node && node < lastNode) { // slice `node` lies strictly between start and end: flush a full sizePerRank
+          offset = 0;
+          nBytes = sizePerRank;
+        } else if (node == lastNode) { // window ends in slice `node`: flush from the slice start up to the window end
+          offset = 0;
+          nBytes = groupEndOffset % sizePerRank;
+        } else {
+          // dummy flush
+          offset = 0; // nBytes is left unchanged in this branch
+        }
+      }
+      NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, sub->recvbuff + offset + sub->loopOffset, nBytes, sub->recvMhandle, request));
+      if (*request) { // flush issued: consume loopSize bytes of the remaining work
+        sub->nbytes -= loopSize;
+        sub->offset += loopSize;
+      }
+    }
+  } else { // flush nBytesIn bytes of the intermediate region buffer at recvBeg
+    NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, region + recvBeg, nBytesIn, resources->mhandles[NCCL_PROTO_SIMPLE], request));
+  }
+  return ncclSuccess;
+}
+
 static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
   if (args->state == ncclProxyOpReady) {
     for (int s=0; s<args->nsubs; s++) {
@@ -884,22 +1040,21 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
       sub->base = ROUNDUP(resources->step, args->chunkSteps);
       sub->posted = sub->received = sub->flushed = sub->transmitted = sub->done = 0;
       resources->step = sub->base + sub->nsteps;
+      // Adjust nsteps for registered buffers, as the device signals a single step
+      if (sub->reg && sub->isOneRPN) sub->nsteps = DIVUP((size_t)sub->nbytes, resources->maxCollBytes);
       memset(sub->requests, 0, sizeof(sub->requests));
     }
     args->state = ncclProxyOpProgress;
   }
   args->idle = 1;
   if (args->state == ncclProxyOpProgress) {
-    int p = NCCL_PROTO_SIMPLE;
     int nGroups = DIVUP(args->nsubs, COLLNET_GROUP_NSUBS);
     for (int s=0; s<args->nsubs; s++) {
       int group = s/COLLNET_GROUP_NSUBS;
       int groupStart = s - (s%COLLNET_GROUP_NSUBS);
       struct ncclProxySubArgs* sub = args->subs+s;
       struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
-      void* mhandle = resources->mhandles[p];
       auto reqFifo = resources->reqFifo;
-      char* region = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
 
       // Enforce sync between operations of the same group.
       if (LAST_OF_GROUP(args, s) && (sub->posted < sub->done + calcStepsPerGroup(nGroups)) && (sub->posted < sub->nsteps)) {
@@ -913,10 +1068,10 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
       if (LAST_OF_GROUP(args, s) && (sub->received < sub->posted)) {
         int buffSlot = (sub->base+sub->received)%NCCL_STEPS;
         if (!reqFifo[group][buffSlot].turnIsSendNotRecv) { // Buffer is cleared : coll is complete
-          int recvBeg = calcRegionOffset(args, 1, groupStart, sub->received, 0);
-          int recvEnd = calcRegionOffset(args, 1, s, sub->received, 1);
-          int totalSize = recvEnd - recvBeg;
-          TRACE(NCCL_NET, "recvProxy [%ld/%d/%d] received, size %d chunkSize=%d", (long)sub->received, group, buffSlot, totalSize, args->chunkSize);
+          ssize_t recvBeg = calcRegionOffset(args, 1, groupStart, sub->received, 0);
+          ssize_t recvEnd = calcRegionOffset(args, 1, s, sub->received, 1);
+          ssize_t totalSize = recvEnd - recvBeg;
+          TRACE(NCCL_NET, "recvProxy [%ld/%d/%d] received, size %ld chunkSize=%ld", (long)sub->received, group, buffSlot, totalSize, args->chunkSize);
           sub->received += args->sliceSteps;
           if ((reqFifo[group][buffSlot].size > 0 || sub->reg) && resources->useGdr && resources->needFlush) {
             // GDRCOPY support
@@ -929,37 +1084,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
               return ncclInternalError;
 #endif
             } else {
-              if (sub->reg) {
-                size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE);
-                size_t offset = 0;
-                if (args->coll == ncclFuncReduceScatter) {
-                  size_t sizePerRank = args->specifics.collnetDirect.sizePerRank;
-                  int node = args->specifics.collnetDirect.node;
-                  int startNode = sub->offset / sizePerRank;
-                  int lastNode = (sub->offset + nBytes) / sizePerRank;
-                  if (startNode == node) {
-                    offset = sub->offset % sizePerRank;
-                    nBytes = std::min(sizePerRank - offset, nBytes);
-                  } else if (startNode < node && node < lastNode) {
-                    nBytes = sizePerRank;
-                  } else if (node == lastNode) {
-                    nBytes = (sub->offset + nBytes) % sizePerRank;
-                  } else {
-                    // no need to flush
-                    nBytes = 0;
-                  }
-                }
-                NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, sub->recvbuff + offset, nBytes, sub->recvMhandle, sub->requests+buffSlot));
-                if (sub->requests[buffSlot]) {
-                  sub->nbytes -= nBytes;
-                  sub->offset += nBytes;
-                  if (args->coll == ncclFuncAllGather || args->coll == ncclFuncAllReduce) {
-                    sub->recvbuff += nBytes;
-                  }
-                }
-              } else {
-                NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, region+recvBeg, totalSize, mhandle, sub->requests+buffSlot));
-              }
+              NCCLCHECK(collNetRecvFlush(proxyState, resources, args, sub, groupStart, totalSize, recvBeg, &sub->requests[buffSlot]));
             }
           }
           args->idle = 0;
@@ -980,14 +1105,19 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
         }
       }
       if (sub->transmitted < sub->flushed) {
-        if (sub->reg == 0) {
+        if (sub->reg == 0 || (!sub->isOneRPN && args->coll == ncclFuncAllGather)) {
           int buffSlot = (sub->base + sub->transmitted)%NCCL_STEPS;
           volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
           connFifo[buffSlot].offset = calcRegionOffset(args, 1, s, sub->transmitted, 0);
           __sync_synchronize();
         }
         volatile uint64_t* recvTail = resources->gdcSync ? resources->gdcSync : &resources->recvMem->tail;
-        *recvTail = sub->base + sub->flushed;
+        if (sub->reg && sub->isOneRPN) {
+          // We may have bumped net steps, but reg operations only have a single step w.r.t. the GPU.
+          if (sub->flushed == sub->nsteps) *recvTail = sub->base + args->sliceSteps;
+        } else {
+          *recvTail = sub->base + sub->flushed;
+        }
         if (resources->gdcSync) wc_store_fence(); // Flush out WC write
         sub->transmitted += args->sliceSteps;
         args->idle = 0;
@@ -999,7 +1129,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
       bool groupSync = s==0 ? args->subs[args->nsubs-1].done == sub->done
                             : (sub-1)->done > sub->done;
       volatile uint64_t* sendHead = &resources->sendMem->head;
-      if (groupSync && sub->done < sub->transmitted && (sub->base+sub->done) < *sendHead) {
+      int done = sub->reg && sub->isOneRPN ? 0 : sub->done;
+      if (groupSync && sub->done < sub->transmitted && sub->base + done < *sendHead) {
         sub->done += args->sliceSteps;
         args->idle = 0;
         if (sub->done == sub->nsteps && s == args->nsubs-1) {
@@ -1017,24 +1148,22 @@ struct collnetRegInfo {
   size_t size;
 };
 
-ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle) {
+static ncclResult_t collnetRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, struct ncclReg* regRecord, int* outRegBufFlag, void** outHandle) {
   ncclResult_t ret = ncclSuccess;
-  struct ncclReg *regRecord = NULL;
+  if (regRecord) {
+    if (regRecord->state & COLLNET_REG_COMPLETE) {
+      // reuse previous registration
+      *outRegBufFlag = 2;
+      *outHandle = regRecord->collnetHandle;
+      INFO(NCCL_REG, "rank %d - COLLNET reuse register userbuff %p (handle %p), buffSize %ld, type %s", comm->rank, userbuff, regRecord->collnetHandle, buffSize, type == collNetRecv ? "Recv" : "Send");
+      goto exit;
+    } else {
+      /* start register collnet buffer */
+      struct collnetRegInfo info = { regRecord->addr, regRecord->pages * comm->regCache.pageSize };
+      void* handle = NULL;
+      struct ncclConnInfo* conn = (type == collNetRecv) ? &comm->channels[0].peers[comm->nRanks]->recv[type].conn : &comm->channels[0].peers[comm->nRanks]->send[type].conn;
 
-  *outRegBufFlag = 0;
-  *outHandle = NULL;
-  if (comm && userbuff && buffSize > 0) {
-    NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, &regRecord), ret, fail);
-    if (regRecord) {
-      if (regRecord->state & COLLNET_REG_COMPLETE) {
-        // reuse previous registration
-        *outRegBufFlag = 2;
-        *outHandle = regRecord->collnetHandle;
-        goto exit;
-      } else {
-        /* start register collnet buffer */
-        struct collnetRegInfo info = {regRecord->addr, regRecord->pages * comm->regCache.pageSize};
-        void* handle = NULL;
+      if (conn->flags & NCCL_DIRECT_NIC) {
         struct ncclProxyConnector* proxyconn = (type == collNetRecv) ? &comm->channels[0].peers[comm->nRanks]->recv[type].proxyConn : &comm->channels[0].peers[comm->nRanks]->send[type].proxyConn;
         NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyconn, ncclProxyMsgRegister, &info, sizeof(struct collnetRegInfo), &handle, sizeof(void*)), ret, fail);
         if (handle) {
@@ -1042,10 +1171,78 @@ ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* u
           regRecord->collnetProxyconn = proxyconn;
           *outHandle = regRecord->collnetHandle = handle;
           *outRegBufFlag = 1;
+          INFO(NCCL_REG, "rank %d - COLLNET register userbuff %p (handle %p), buffSize %ld, type %s", comm->rank, userbuff, handle, buffSize, type == collNetRecv ? "Recv" : "Send");
         }
+      } else {
+        WARN("rank %d - COLLNET failed to register userbuff %p (handle %p), buffSize %ld, type %s, GDR is not enabled", comm->rank, userbuff, handle, buffSize, type == collNetRecv ? "Recv" : "Send");
       }
     }
   }
+exit:
+  return ret;
+fail:
+  *outRegBufFlag = 0;
+  *outHandle = NULL;
+  goto exit;
+}
+
+ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle) { // Finds userbuff's registration record and, if that record is reported valid, registers it with CollNet.
+  ncclResult_t ret = ncclSuccess;
+  struct ncclReg *regRecord = NULL;
+  bool isValid = false;
+
+  *outRegBufFlag = 0; // default outputs: not registered, no handle
+  *outHandle = NULL;
+  if (comm && userbuff && buffSize > 0) { // NULL comm/buffer or zero size: return success with the defaults above
+    NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, &regRecord), ret, fail);
+    NCCLCHECKGOTO(ncclRegLocalIsValid(regRecord, &isValid), ret, fail);
+    if (isValid) // records not reported valid are left unregistered (flag stays 0)
+      NCCLCHECKGOTO(collnetRegisterBuffer(comm, userbuff, buffSize, type, regRecord, outRegBufFlag, outHandle), ret, fail);
+  }
+exit:
+  return ret;
+fail:
+  *outRegBufFlag = 0; // NOTE(review): *outHandle is not reset here; collnetRegisterBuffer's own fail path NULLs it, and earlier failures leave the initial NULL
+  goto exit;
+}
+
+struct ncclCollnetCleanupCallback { // Cleanup-queue entry that undoes one graph-time CollNet registration.
+  struct ncclCommCallback base; // must stay the first member: cleanupCollnet() casts the ncclCommCallback* back to this struct
+  struct ncclComm *comm; // communicator passed to ncclCommGraphDeregister()
+  struct ncclReg *reg; // registration record passed to ncclCommGraphDeregister()
+};
+
+static ncclResult_t cleanupCollnet(struct ncclComm* comm, struct ncclCommCallback* cb) { // Cleanup callback: deregisters the recorded graph registration, then frees the record.
+  struct ncclCollnetCleanupCallback* obj = (struct ncclCollnetCleanupCallback*)cb; // cb is the `base` member of a record enqueued by ncclCollnetGraphRegisterBuffer
+  NCCLCHECK(ncclCommGraphDeregister(obj->comm, obj->reg)); // uses the stored comm, not the `comm` argument; NOTE(review): if NCCLCHECK returns early on error, obj is never freed
+  free(obj);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, int* nCleanupQueueElts) { // Graph-registers the allocation containing userbuff, registers it with CollNet, and queues a cleanup record on success.
+  ncclResult_t ret = ncclSuccess;
+  struct ncclCollnetCleanupCallback* record = NULL;
+  struct ncclReg *regRecord = NULL;
+  void *baseSend = NULL;
+  size_t baseSendSize = 0;
+
+  *outRegBufFlag = 0; // NOTE(review): *outHandle is not initialised in the visible lines, unlike ncclCollnetLocalRegisterBuffer
+  if (comm && userbuff && buffSize > 0) {
+    CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)userbuff), ret, fail); // base address and size of the allocation that contains userbuff
+    NCCLCHECKGOTO(ncclCommGraphRegister(comm, baseSend, baseSendSize, (void**)&regRecord), ret, fail); // registers the whole allocation range, not just buffSize bytes at userbuff
+    NCCLCHECKGOTO(collnetRegisterBuffer(comm, userbuff, buffSize, type, regRecord, outRegBufFlag, outHandle), ret, fail);
+
+    if (*outRegBufFlag) { // non-zero flag: CollNet registration was created or reused
+      record = (struct ncclCollnetCleanupCallback*)malloc(sizeof(struct ncclCollnetCleanupCallback)); // NOTE(review): malloc result is not checked before the writes below
+      record->base.fn = cleanupCollnet;
+      record->comm = comm;
+      record->reg = regRecord;
+      ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)record); // cleanupCollnet() later deregisters regRecord and frees this record
+      *nCleanupQueueElts += 1;
+    } else {
+      NCCLCHECKGOTO(ncclCommGraphDeregister(comm, regRecord), ret, fail); // no CollNet registration happened: drop the graph registration right away
+    }
+  }
 
 exit:
   return ret;
@@ -1055,55 +1252,9 @@ fail:
   goto exit;
 }
 
-struct ncclCollnetCleanupCallback {
-  struct ncclCommCallback base;
-  struct ncclProxyConnector* proxyConn;
-  void* buffer;
-  size_t size;
-  void* mhandle;
-};
-
-static ncclResult_t cleanupCollnet(struct ncclComm* comm, struct ncclCommCallback* cb) {
-  struct ncclCollnetCleanupCallback* obj = (struct ncclCollnetCleanupCallback*)cb;
-  NCCLCHECK(ncclCollnetDeregBuffer(comm, obj->proxyConn, obj->mhandle));
-  INFO(NCCL_REG, "rank %d - deregistered collnet buffer handle %p, size %ld, buff %p", comm->rank, obj->mhandle, obj->size, obj->buffer);
-  free(obj);
-  return ncclSuccess;
-}
-
-ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, int* nCleanupQueueElts) {
-  ncclResult_t ret = ncclSuccess;
-  void* handle = NULL;
-  struct ncclRegCache* cache = &comm->regCache;
-  uintptr_t pageSize = cache->pageSize;
-  uintptr_t addr = (uintptr_t)userbuff & -pageSize;
-  size_t size = DIVUP((uintptr_t)userbuff - addr + buffSize, pageSize) * pageSize;
-  collnetRegInfo info = {addr, size};
-  struct ncclCollnetCleanupCallback* record = NULL;
-  struct ncclProxyConnector* proxyConn = (type == collNetRecv) ? &comm->channels[0].peers[comm->nRanks]->recv[type].proxyConn : &comm->channels[0].peers[comm->nRanks]->send[type].proxyConn;
-
-  *outRegBufFlag = 0;
-  NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &info, sizeof(struct collnetRegInfo), &handle, sizeof(void*)), ret, fail);
-  record = (struct ncclCollnetCleanupCallback*)malloc(sizeof(struct ncclCollnetCleanupCallback));
-  record->base.fn = cleanupCollnet;
-  record->proxyConn = proxyConn;
-  record->buffer = (void*)userbuff;
-  record->size = buffSize;
-  *outHandle = record->mhandle = handle;
-  *outRegBufFlag = 1;
-  ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)record);
-  *nCleanupQueueElts += 1;
-
-exit:
-  return ret;
-fail:
-  *outRegBufFlag = 0;
-  *outHandle = NULL;
-  goto exit;
-}
-
 ncclResult_t ncclCollnetDeregBuffer(struct ncclComm* comm, struct ncclProxyConnector* proxyconn, void* handle) {
   NCCLCHECK(ncclProxyCallBlocking(comm, proxyconn, ncclProxyMsgDeregister, &handle, sizeof(void*), NULL, 0));
+  INFO(NCCL_REG, "rank %d - COLLNET deregistered buffer handle %p", comm->rank, handle); // emitted after the blocking ncclProxyMsgDeregister call above
   return ncclSuccess;
 }
 
@@ -1111,26 +1262,67 @@ static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, s
   void* handle;
   struct collnetRegInfo* info = (struct collnetRegInfo*)reqBuff;
   struct sendResources* resources = (struct sendResources*)(connection->transportResources);
+  ncclResult_t ret = ncclSuccess; // scratch status only: the reply always returns ncclSuccess and signals failure through a NULL handle
+  bool needReg = true; // stays true until the DMA-BUF path has registered the buffer
 
   assert(reqSize == sizeof(struct collnetRegInfo));
   assert(respSize == sizeof(void*));
-  if (proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle) != ncclSuccess) handle = NULL;
+
+#if CUDART_VERSION >= 11070
+  /* DMA-BUF support: try to register through a dmabuf fd first; any failure falls back to regMr() below */
+  if (resources->useGdr && resources->useDmaBuf) {
+    int dmabuf_fd;
+    CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem);
+    ret = proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle); // no goto here: the fd must be closed on failure too
+    (void)close(dmabuf_fd);
+    needReg = (ret != ncclSuccess); // DMA-BUF registration failed: fall back to regMr()
+  }
+#endif
+peermem:
+  if (needReg) {
+    NCCLCHECKGOTO(proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail);
+  }
+
+exit:
   memcpy(respBuff, (void*)&handle, sizeof(void*));
   *done = 1;
   return ncclSuccess;
+fail:
+  handle = NULL; // registration failed on every path: reply with a NULL handle
+  goto exit;
 }
 
 static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
   void* handle;
   struct collnetRegInfo* info = (struct collnetRegInfo*)reqBuff;
   struct recvResources* resources = (struct recvResources*)(connection->transportResources);
+  ncclResult_t ret = ncclSuccess; // scratch status only: the reply always returns ncclSuccess and signals failure through a NULL handle
+  bool needReg = true; // stays true until the DMA-BUF path has registered the buffer
 
   assert(reqSize == sizeof(struct collnetRegInfo));
   assert(respSize == sizeof(void*));
-  if (proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle) != ncclSuccess) handle = NULL;
+#if CUDART_VERSION >= 11070
+  /* DMA-BUF support: try to register through a dmabuf fd first; any failure falls back to regMr() below */
+  if (resources->useGdr && resources->useDmaBuf) {
+    int dmabuf_fd;
+    CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem);
+    ret = proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle); // no goto here: the fd must be closed on failure too
+    (void)close(dmabuf_fd);
+    needReg = (ret != ncclSuccess); // DMA-BUF registration failed: fall back to regMr()
+  }
+#endif
+peermem:
+  if (needReg) {
+    NCCLCHECKGOTO(proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail);
+  }
+
+exit:
   memcpy(respBuff, (void*)&handle, sizeof(void*));
   *done = 1;
   return ncclSuccess;
+fail:
+  handle = NULL; // registration failed on every path: reply with a NULL handle
+  goto exit;
 }
 
 static ncclResult_t sendProxyDeregBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done) {
@@ -1155,13 +1347,6 @@ static ncclResult_t recvProxyDeregBuffer(struct ncclProxyConnection* connection,
   return ncclSuccess;
 }
 
-struct ncclTransport collNetTransport = {
-  "COL",
-  canConnect,
-  { sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, sendProxyRegBuffer, sendProxyDeregBuffer },
-  { recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, recvProxyRegBuffer, recvProxyDeregBuffer }
-};
-
 ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm) {
   ncclResult_t ret = ncclSuccess;
   char line[1024];
@@ -1197,7 +1382,6 @@ fail:
 
 ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm) {
   ncclResult_t ret = ncclSuccess;
-  int highestTransportType0 = TRANSPORT_UNDEFINED, highestTransportType1 = TRANSPORT_UNDEFINED;
 
   if (comm->collNetSupport == 0) goto exit;
 
@@ -1206,13 +1390,13 @@ ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm) {
     struct ncclChannel* channelRecv = comm->channels + c;
     NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.down, 0), ret, fail);
   }
-  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 0, &highestTransportType0), ret, fail);
+  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 0), ret, fail);
 
   for (int c = 0; c < comm->nChannels; c++) {
     struct ncclChannel* channelSend = comm->channels + c;
     NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.down, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.up, 1), ret, fail);
   }
-  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 1, &highestTransportType1), ret, fail);
+  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 1), ret, fail);
 
   INFO(NCCL_INIT, "rank %d Connected CollNet", comm->rank);
 
@@ -1410,3 +1594,10 @@ fail:
   comm->collNetSupport = 0;
   goto exit;
 }
+
+struct ncclTransport collNetTransport = { // CollNet transport descriptor, built from this file's send/recv entry points
+  "COL",
+  canConnect,
+  { sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, sendProxyRegBuffer, sendProxyDeregBuffer }, // send-side entry points; one slot is left NULL
+  { recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, recvProxyRegBuffer, recvProxyDeregBuffer } // receive-side entry points, same slot order
+};
\ No newline at end of file
diff --git a/src/transport/generic.cc b/src/transport/generic.cc
index 7fd7e59fbc..47b023667d 100644
--- a/src/transport/generic.cc
+++ b/src/transport/generic.cc
@@ -1,17 +1,37 @@
 #include "comm.h"
 #include "transport.h"
+#include "bootstrap.h"
 
 ncclResult_t ncclTransportRingConnect(struct ncclComm* comm) {
+  struct ringConnInfo {
+    bool useNetPXN;
+    bool useGdr;
+  };
+  struct ringConnInfo* ringInfo = NULL;
   ncclResult_t ret = ncclSuccess;
   if (comm && comm->nRanks > 1) {
+    comm->useGdr = true;
+    comm->useNetPXN = false;
     for (int c = 0; c < comm->nChannels; c++) {
       struct ncclChannel* channel = comm->channels + c;
       NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, fail);
     }
     NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_RING], 0), ret, fail);
-    INFO(NCCL_INIT, "Connected all rings");
+    if (ncclParamLocalRegister() || ncclParamGraphRegister()) {
+      NCCLCHECK(ncclCalloc(&ringInfo, comm->nRanks));
+      ringInfo[comm->rank].useGdr = comm->useGdr;
+      ringInfo[comm->rank].useNetPXN = comm->useNetPXN;
+      NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, ringInfo, sizeof(struct ringConnInfo)), ret, fail);
+      for (int i = 0; i < comm->nRanks; ++i) {
+        if (!ringInfo[i].useGdr) comm->useGdr = false;
+        if (ringInfo[i].useNetPXN) comm->useNetPXN = true;
+        if (comm->useGdr == false && comm->useNetPXN == true) break;
+      }
+    }
+    INFO(NCCL_INIT, "Connected all rings, use ring PXN %d GDR %d", comm->useNetPXN, comm->useGdr);
   }
 exit:
+  free(ringInfo);
   return ret;
 fail:
   goto exit;
diff --git a/src/transport/net.cc b/src/transport/net.cc
index 00eca607d9..8760b4258d 100644
--- a/src/transport/net.cc
+++ b/src/transport/net.cc
@@ -15,6 +15,7 @@
 #include "profiler.h"
 #include "transport.h"
 #include "shm.h"
+#include <assert.h>
 
 static_assert(sizeof(ncclNetHandle_t) <= CONNECT_SIZE, "NET Connect info is too large");
 
@@ -107,6 +108,7 @@ struct sendNetResources {
   int netDeviceVersion;
   ncclNetDeviceType netDeviceType;
   ncclNetDeviceHandle_t* netDeviceHandle;
+  size_t maxP2pBytes;
 };
 
 struct recvNetResources {
@@ -139,6 +141,12 @@ struct recvNetResources {
   int netDeviceVersion;
   ncclNetDeviceType netDeviceType;
   ncclNetDeviceHandle_t* netDeviceHandle;
+  size_t maxP2pBytes;
+};
+
+struct netRegInfo { // NOTE(review): same field names as collnetRegInfo; presumably the buffer-registration request payload — no user is visible in this hunk, confirm
+  uintptr_t buffer; // presumably the buffer start address stored as an integer — TODO confirm
+  size_t size; // presumably the length in bytes — TODO confirm
 };
 
 /* Determine if two peers can communicate with NET */
@@ -166,6 +174,9 @@ struct setupReq {
   int connIndex;
 };
 
+NCCL_PARAM(NetOptionalRecvCompletion, "NET_OPTIONAL_RECV_COMPLETION", 1);
+
+static_assert(sizeof(ncclNetHandle_t) + sizeof(int) <= CONNECT_SIZE, "Not large enough ncclConnect to hold ncclNetHandle_t and useGdr flag");
 // Forward declaration
 static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args);
 
@@ -181,8 +192,10 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   int proxyRank;
   int64_t netId;
   NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &netId, &req.netDev, &proxyRank));
-  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr));
+  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 1, &req.useGdr));
   send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
+  if (!req.useGdr && connIndex == 0) comm->useGdr = 0;
+  if (proxyRank != myInfo->rank && connIndex == 0) comm->useNetPXN = true;
 
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, proxyRank, &send->proxyConn));
   req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
@@ -198,6 +211,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
         proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
   }
   *((int*)connectInfo) = comm->topParentRanks[proxyRank];
+  memcpy((uint8_t*)connectInfo + sizeof(ncclNetHandle_t), &req.useGdr, sizeof(int));
   return ncclSuccess;
 }
 
@@ -218,10 +232,12 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   int proxyRank;
   int64_t netId;
   NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &netId, &req.netDev, &proxyRank));
-  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 0, &req.useGdr));
+  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 0, &req.useGdr));
+  recv->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
+  if (!req.useGdr && connIndex == 0) comm->useGdr = 0;
 
   // Determine whether we need to flush the GDR buffer on recv or not
-  if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
+  if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm, req.netDev, myInfo->rank, &req.needFlush));
 
   // We don't support PXN on receive yet
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, myInfo->rank, &recv->proxyConn));
@@ -230,6 +246,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   req.tpRank = comm->topParentRanks[myInfo->rank];
   req.tpRemoteRank = comm->topParentRanks[peerInfo->rank];
   NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
+  memcpy((uint8_t*)connectInfo + sizeof(ncclNetHandle_t), &req.useGdr, sizeof(int));
   INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->nvmlDev, myInfo->rank, myInfo->nvmlDev, comm->ncclNet->name, req.netDev,
       req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
   return ncclSuccess;
@@ -283,8 +300,11 @@ struct netRecvConnectArgs {
 
 static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
   struct connectMap* map = (connectMap*) send->transportResources;
-
   void* opId;
+  int recvUseGdr;
+
+  memcpy(&recvUseGdr, (uint8_t*)connectInfo + sizeof(ncclNetHandle_t), sizeof(int));
+  if (!recvUseGdr) send->conn.flags &= ~NCCL_DIRECT_NIC;
 
   // map isn't allocated thus this op hasn't been submitted yet
   if (!map) {
@@ -391,6 +411,11 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
 static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
   struct connectMap* map = (connectMap*) recv->transportResources;
   void* opId;
+  int sendUseGdr;
+
+  memcpy(&sendUseGdr, (uint8_t*)connectInfo + sizeof(ncclNetHandle_t), sizeof(int));
+  if (!sendUseGdr) recv->conn.flags &= ~NCCL_DIRECT_NIC;
+
   if (!map) {
     NCCLCHECK(ncclCalloc(&map, 1));
     recv->transportResources = map;
@@ -522,7 +547,7 @@ static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int
   return ncclSuccess;
 }
 
-static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int channel, int slot, int* offset, int* size) {
+static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int channel, int slot, int* offset, size_t* size) {
   // Use different pools for different channels and also separate send/recv.
   int globalSlot = (channel*NCCL_SHARED_STEPS)+slot;
   *offset = proxyState->p2pChunkSize * globalSlot;
@@ -590,6 +615,13 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
 
   resources->netDeviceVersion = props.netDeviceVersion;
   resources->netDeviceType = props.netDeviceType;
+  /* point-to-point size limit: the plugin must report a non-zero value no larger than NCCL_MAX_NET_SIZE_BYTES */
+  resources->maxP2pBytes = props.maxP2pBytes;
+  if ((resources->maxP2pBytes == 0) || (resources->maxP2pBytes > NCCL_MAX_NET_SIZE_BYTES)) {
+    WARN("sendProxySetup: net plugin returned invalid value for maxP2pBytes %zu "
+         "[allowed range: %ld - %ld]", resources->maxP2pBytes, 1L, NCCL_MAX_NET_SIZE_BYTES);
+    return ncclInternalError;
+  }
 
   // We don't return any data
   if (respSize != 0) return ncclInternalError;
@@ -621,6 +653,13 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
   resources->maxRecvs = props.maxRecvs;
   resources->netDeviceVersion = props.netDeviceVersion;
   resources->netDeviceType = props.netDeviceType;
+  /* point-to-point size limit: the plugin must report a non-zero value no larger than NCCL_MAX_NET_SIZE_BYTES */
+  resources->maxP2pBytes = props.maxP2pBytes;
+  if ((resources->maxP2pBytes == 0) || (resources->maxP2pBytes > NCCL_MAX_NET_SIZE_BYTES)) {
+    WARN("recvProxySetup: net plugin returned invalid value for maxP2pBytes %zu "
+         "[allowed range: %ld - %ld]", resources->maxP2pBytes, 1L, NCCL_MAX_NET_SIZE_BYTES);
+    return ncclInternalError;
+  }
 
   if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
   NCCLCHECK(proxyState->ncclNet->listen(req->netDev, respBuff, &resources->netListenComm));
@@ -916,6 +955,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
 
   resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem);
   resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem);
+  for (int i = 0; i < NCCL_STEPS; i++) resources->recvMem->connFifo[i].size = -1;
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
     resources->buffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]);
     if (resources->buffers[p]) {
@@ -1032,7 +1072,6 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
 }
 
 static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps");
-#define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two.
 
 static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
   if (args->state == ncclProxyOpReady) {
@@ -1045,11 +1084,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
       resources->step = sub->base + sub->nsteps;
       sub->posted = sub->transmitted = sub->done = 0;
       ncclProfilerStartSendProxyOpEvent(s, args);
-      if (sub->reg && sub->nbytes > 0) {
-        NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, sub->recvbuff, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
-      } else {
-        sub->mhandle = resources->mhandles[args->protocol];
-      }
+      if (!sub->reg)
+        sub->sendMhandle = resources->mhandles[args->protocol];
     }
     args->state = ncclProxyOpProgress;
   }
@@ -1059,6 +1095,9 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
     int maxDepth = std::min(NCCL_STEPS, NCCL_SHARED_STEPS/args->nsubs);
     for (int s=0; s<args->nsubs; s++) {
       struct ncclProxySubArgs* sub = args->subs+s;
+      int postedStepId = sub->posted;
+      int transmittedStepId = sub->transmitted;
+      int doneStepId = sub->done;
       if (sub->done == sub->nsteps) continue;
       struct sendNetResources* resources = (struct sendNetResources*) (sub->connection->transportResources);
       volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
@@ -1066,7 +1105,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
       char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
       // Post buffers to the GPU
       if (sub->posted < sub->nsteps && sub->posted < sub->done + maxDepth) {
-        ncclProfilerStartSendProxyStepEvents(s, args, sub->posted, sub->posted+args->sliceSteps);
+        ncclProfilerStartSendProxyStepEvent(s, args, postedStepId);
         int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
         if (resources->shared) {
           if (!sub->reg) {
@@ -1078,12 +1117,13 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
           }
           volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
           sub->posted += args->sliceSteps;
-          // Only post one credit for registered buffer
-          if (sub->reg == 0 || sub->posted == args->sliceSteps) *sendHead = sub->base + sub->posted - NCCL_STEPS;
+          *sendHead = sub->base + sub->posted - NCCL_STEPS;
           if (resources->gdcSync) wc_store_fence(); // Flush out WC write
-        } else sub->posted += args->sliceSteps;
+        } else {
+          sub->posted += args->sliceSteps;
+        }
         ncclProfilerRecordProxyOpEventState(s, args, sub->posted, sub->transSize, ncclProfilerProxyOpSendPosted);
-        ncclProfilerRecordProxyStepEventStates(s, args, sub->posted-args->sliceSteps, sub->posted, ncclProfilerProxyStepSendGPUWait);
+        ncclProfilerRecordProxyStepEventState(s, args, postedStepId, ncclProfilerProxyStepSendGPUWait);
         args->idle = 0;
         continue;
       }
@@ -1091,10 +1131,10 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
       if (sub->transmitted < sub->posted && sub->transmitted < sub->done + NCCL_STEPS) {
         int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
         volatile uint64_t* recvTail = &resources->recvMem->tail;
-        uint64_t tail = sub->base + (sub->reg ? 0 : sub->transmitted);
-        if ((sub->reg || connFifo[buffSlot].size != -1) && ((*recvTail > tail) || p == NCCL_PROTO_LL)) {
+        uint64_t tail = sub->base + sub->transmitted;
+        if (connFifo[buffSlot].size != -1 && (*recvTail > tail || p == NCCL_PROTO_LL)) {
           // We have something to receive, let's check if it's completely ready.
-          int size = sub->reg ? std::min(MAX_NET_SIZE, sub->nbytes) : connFifo[buffSlot].size;
+          int size = connFifo[buffSlot].size;
           bool shared = (p == NCCL_PROTO_SIMPLE) && resources->shared;
           char* buff = shared ? localBuff+connFifo[buffSlot].offset : localBuff+buffSlot*stepSize;
           int ready = 1;
@@ -1120,22 +1160,28 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
               volatile uint32_t *f2 = &lines[i].flag2;
               if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
             }
-          } else if (p == NCCL_PROTO_SIMPLE && resources->shared) {
-            buff = sub->reg ? (char*)sub->recvbuff : localBuff+resources->recvMem->connFifo[buffSlot].offset;
+          } else if (p == NCCL_PROTO_SIMPLE) {
+            if (resources->shared) {
+              buff = sub->reg ? (char*)sub->sendbuff + sub->transmitted * NCCL_MAX_NET_SIZE : localBuff + resources->recvMem->connFifo[buffSlot].offset;
+            } else if (sub->reg) {
+              size_t sendSize;
+              sub->ringAlgo->getNextSendAddr(sub->transmitted, (uint8_t**)&buff, &sendSize, &sub->sendMhandle);
+              assert(sendSize == size);
+            }
           }
           if (ready) {
-            ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted + args->sliceSteps, sub->transSize, ncclProfilerProxyOpSendRemFifoWait);
+            ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted+args->sliceSteps, sub->transSize, ncclProfilerProxyOpSendRemFifoWait);
             // Data is ready, try to send.
             // Coverity complains about the size here as pointing to an out-of-scope temporary.  Which is nonsense,
             // since size is a plain integer.
             // coverity[use_invalid:FALSE]
-            NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->mhandle, sub->requests+buffSlot));
+            NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle, sub->requests+buffSlot));
             if (sub->requests[buffSlot] != NULL) {
-              TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p, size %d, proto %d, myRank %d, channelId %d", sub->transmitted, buffSlot, sub->requests[buffSlot], size, p, proxyState->tpRank, sub->channelId);
+              TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Isend posted, req %p, buff %p, size %d, proto %d, myRank %d, channelId %d, mhandle %p", sub->transmitted, buffSlot, sub->nsteps, sub->requests[buffSlot], buff, size, p, proxyState->tpRank, sub->channelId, sub->sendMhandle);
+              sub->transSize += size;
               sub->transmitted += args->sliceSteps;
               ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpSendTransmitted);
-              ncclProfilerRecordProxyStepEventStates(s, args, sub->transmitted-args->sliceSteps, sub->transmitted, ncclProfilerProxyStepSendWait);
-              sub->transSize += size;
+              ncclProfilerRecordProxyStepEventState(s, args, transmittedStepId, ncclProfilerProxyStepSendWait);
               args->idle = 0;
               continue;
             }
@@ -1149,41 +1195,24 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
         int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
         NCCLCHECK(proxyState->ncclNet->test(sub->requests[buffSlot], &done, &size));
         if (done) {
-          if (sub->reg) {
-            if (size < sub->nbytes) {
-              sub->recvbuff += size;
-              sub->nbytes -= size;
-              // Do one more step (at least)
-              sub->nsteps++;
-            } else {
-              // Signal the GPU the send is complete and it can return.
-              connFifo[sub->base%NCCL_STEPS].size = -1;
-            }
-          }
           // Make sure size is reset to -1 before we update the head.
-          if (sub->reg == 0) connFifo[buffSlot].size = -1;
+          connFifo[buffSlot].size = -1;
           __sync_synchronize();
-          TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
+          TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] request %p done", sub->done, buffSlot, sub->nsteps, sub->requests[buffSlot]);
           sub->done += args->sliceSteps;
-          ncclProfilerStopProxyStepEvents(s, args, sub->done-args->sliceSteps, sub->done);
+          ncclProfilerStopProxyStepEvent(s, args, doneStepId);
           ncclProfilerRecordProxyOpEventState(s, args, sub->done, sub->transSize, ncclProfilerProxyOpSendDone);
 
           if (resources->shared == 0) {
             volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
-            if (sub->reg) {
-              // We may have added more net steps, but reg operations only have a single step w.r.t. the GPU.
-              if (sub->done == sub->nsteps) *sendHead = sub->base + args->sliceSteps;
-            } else {
-              *sendHead = sub->base + sub->done;
-            }
+            *sendHead = sub->base + sub->done;
             if (resources->gdcSync) wc_store_fence(); // Flush out WC write
           }
           args->idle = 0;
           if (sub->done == sub->nsteps) {
-            if (sub->reg && sub->nbytes > 0) {
-              NCCLCHECK(proxyState->ncclNet->deregMr(resources->netSendComm, sub->mhandle));
-            }
             args->done++;
+            if (sub->ringAlgo && sub->ringAlgo->decRefCount() == 0) delete sub->ringAlgo;
+            sub->ringAlgo = NULL;
           }
         }
       }
@@ -1232,14 +1261,11 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
       // Set step base for next op
       resources->step = sub->base + sub->nsteps;
       sub->posted = sub->received = sub->transmitted = sub->done = 0;
+      sub->regBufferReady = 0;
       for (int i=0; i<groupSize; i++) sub[-i].groupSize = groupSize;
       ncclProfilerStartRecvProxyOpEvent(s, args);
-      if (sub->reg && sub->nbytes > 0) {
-        // Register buffer
-        NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, sub->recvbuff, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
-      } else {
-        sub->mhandle = resources->mhandles[args->protocol];
-      }
+      if (!sub->reg)
+        sub->recvMhandle = resources->mhandles[args->protocol];
     }
     args->state = ncclProxyOpProgress;
   }
@@ -1251,32 +1277,44 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
       struct ncclProxySubArgs* subGroup = args->subs+s;
       int subCount = 0;
       void* ptrs[NCCL_PROXY_MAX_SUBS];
-      int sizes[NCCL_PROXY_MAX_SUBS];
+      size_t sizes[NCCL_PROXY_MAX_SUBS];
       int tags[NCCL_PROXY_MAX_SUBS];
       void* mhandles[NCCL_PROXY_MAX_SUBS];
       for (int i=0; i<subGroup->groupSize; i++) {
         struct ncclProxySubArgs* sub = subGroup + i;
+        int postedStepId = sub->posted;
         if (sub->posted < sub->nsteps) {
           if (sub->posted >= sub->done + maxDepth) { subCount = 0; break; }
-          ncclProfilerStartRecvProxyStepEvents(s+i, args, sub->posted, sub->posted+args->sliceSteps);
+          ncclProfilerStartRecvProxyStepEvent(s+i, args, postedStepId);
           struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
-          if (sub->reg) maxDepth = 1;
           int stepSize = resources->buffSizes[p] / NCCL_STEPS;
           char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
           int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
           volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
-          if (p == NCCL_PROTO_SIMPLE && resources->shared) {
-            if (sub->reg) {
-              // Wait until CUDA kernel has started before we access the user buffer directly.
-              if (connFifo[sub->base%NCCL_STEPS].size == -1) continue;
-              ptrs[subCount] = sub->recvbuff;
-              sizes[subCount] = std::min(MAX_NET_SIZE, sub->nbytes);
+          if (p == NCCL_PROTO_SIMPLE) {
+            if (resources->shared) {
+              if (sub->reg) {
+                // Wait until CUDA kernel has started before we access the user buffer directly.
+                if (!sub->regBufferReady && connFifo[sub->base % NCCL_STEPS].size == -1) continue;
+                sub->regBufferReady = 1;
+                ptrs[subCount] = sub->recvbuff + sub->posted * NCCL_MAX_NET_SIZE;
+                sizes[subCount] = std::min(NCCL_MAX_NET_SIZE, (ssize_t)(sub->nbytes - sub->posted * NCCL_MAX_NET_SIZE));
+              } else {
+                int sharedBuffSlot = sub->posted % maxDepth;
+                int offset;
+                NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot * args->nsubs + s + i, &offset, sizes + subCount));
+                connFifo[buffSlot].offset = offset;
+                ptrs[subCount] = localBuff + offset;
+              }
             } else {
-              int sharedBuffSlot = sub->posted%maxDepth;
-              int offset;
-              NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset, sizes+subCount));
-              connFifo[buffSlot].offset = offset;
-              ptrs[subCount] = localBuff+offset;
+              if (sub->reg) {
+                if (!sub->regBufferReady && connFifo[sub->base % NCCL_STEPS].size == -1) continue;
+                sub->regBufferReady = 1;
+                sub->ringAlgo->getNextRecvAddr(sub->posted, (uint8_t**)&ptrs[subCount], &sizes[subCount], &sub->recvMhandle);
+              } else {
+                ptrs[subCount] = localBuff + buffSlot * stepSize;
+                sizes[subCount] = stepSize * args->sliceSteps;
+              }
             }
           } else {
             ptrs[subCount] = localBuff+buffSlot*stepSize;
@@ -1284,7 +1322,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
           }
           if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes;
           tags[subCount] = resources->tpRemoteRank;
-          mhandles[subCount] = sub->mhandle;
+          mhandles[subCount] = sub->recvMhandle;
           subCount++;
         }
       }
@@ -1292,15 +1330,19 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
         uint64_t step = subGroup->posted;
         struct recvNetResources* resources = (struct recvNetResources*) (subGroup->connection->transportResources);
         void** requestPtr = subGroup->requests+(step%NCCL_STEPS);
+        bool ignoreCompletion = ncclParamNetOptionalRecvCompletion() && ((args->protocol == NCCL_PROTO_LL128) || (args->protocol == NCCL_PROTO_LL)) && (subCount == 1);
+        if (ignoreCompletion) *requestPtr = (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION;
         NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr));
         if (*requestPtr) {
           subGroup->recvRequestsCache[step%NCCL_STEPS] = *requestPtr;
           subGroup->recvRequestsSubCount = subCount;
           for (int i=0; i<subGroup->groupSize; i++) {
             struct ncclProxySubArgs* sub = subGroup+i;
+            int postedStepId = sub->posted;
+            TRACE(NCCL_NET, "recvProxy [%ld/%ld/%d] Irecv posted, buff %p, size %ld, myRank %d, channelId %d, mhandle %p", sub->posted, (sub->base + sub->posted) % NCCL_STEPS, sub->nsteps, ptrs[i], sizes[i], proxyState->tpRank, sub->channelId, mhandles[i]);
             sub->posted += args->sliceSteps;
             ncclProfilerRecordProxyOpEventState(s+i, args, sub->posted, sub->transSize, ncclProfilerProxyOpRecvPosted);
-            ncclProfilerRecordProxyStepEventStates(s+i, args, sub->posted-args->sliceSteps, sub->posted, ncclProfilerProxyStepRecvWait);
+            ncclProfilerRecordProxyStepEventState(s+i, args, postedStepId, ncclProfilerProxyStepRecvWait);
           }
           args->idle = 0;
         }
@@ -1321,31 +1363,18 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
         if (done) {
           int needFlush = 0;
           int totalSize = 0;
-          int subIndex = 0;
           for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) totalSize += sizes[i];
           for (int i=0; i<subGroup->groupSize; i++) {
             struct ncclProxySubArgs* sub = subGroup + i;
-            if (sub->received < sub->nsteps) {
-              int size = sizes[subIndex++];
-              if (sub->reg) {
-                if (size < sub->nbytes) {
-                  sub->recvbuff += size;
-                  sub->nbytes -= size;
-                  // Do one more step (at least)
-                  sub->nsteps++;
-                } else {
-                  // Reset connFifo size indicating the GPU was ready to receive.
-                  // There is a __sync_synchronize() later to ensure it is reset before it is set again by the GPU.
-                  struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
-                  volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
-                  connFifo[sub->base%NCCL_STEPS].size = -1;
-                }
-              }
-            }
-            sub->received += args->sliceSteps;
+            int receivedStepId = sub->received;
+            int buffSlot = (sub->base + sub->received) % NCCL_STEPS;
+            struct recvNetResources* resources = (struct recvNetResources*)(sub->connection->transportResources);
+            volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
+            connFifo[buffSlot].size = -1;
             sub->transSize += sizes[i];
+            sub->received += args->sliceSteps;
             ncclProfilerRecordProxyOpEventState(s+i, args, sub->received, sub->transSize, ncclProfilerProxyOpRecvReceived);
-            ncclProfilerRecordProxyStepEventStates(s+i, args, sub->received-args->sliceSteps, sub->received, ncclProfilerProxyStepRecvFlushWait);
+            ncclProfilerRecordProxyStepEventState(s+i, args, receivedStepId, ncclProfilerProxyStepRecvFlushWait);
             if (step < sub->nsteps) {
               struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
               if (resources->useGdr) needFlush |= resources->needFlush;
@@ -1372,10 +1401,16 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
                   int stepSize = resources->buffSizes[p] / NCCL_STEPS;
                   char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
                   int buffSlot = (sub->base+sub->received-args->sliceSteps)%NCCL_STEPS;
-                  ptrs[subCount] = resources->shared ?
-                    (sub->reg ? (char*)sub->recvbuff : localBuff+resources->recvMem->connFifo[buffSlot].offset) :
-                    localBuff+buffSlot*stepSize;
-                  mhandles[subCount] = sub->mhandle;
+                  if (resources->shared) {
+                    ptrs[subCount] = sub->reg ? (char*)sub->recvbuff + step * NCCL_MAX_NET_SIZE : localBuff + resources->recvMem->connFifo[buffSlot].offset;
+                  } else {
+                    if (sub->reg) {
+                      sub->ringAlgo->getNextRecvAddr(step, (uint8_t**)&ptrs[subCount], NULL, &sub->recvMhandle);
+                    } else {
+                      ptrs[subCount] = localBuff + buffSlot * stepSize;
+                    }
+                  }
+                  mhandles[subCount] = sub->recvMhandle;
                   subCount++;
                 }
               }
@@ -1399,19 +1434,16 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
         if (done) {
           for (int i=0; i<subGroup->groupSize; i++) {
             struct ncclProxySubArgs* sub = subGroup + i;
+            int transmittedStepId = sub->transmitted;
 
             sub->transmitted += args->sliceSteps;
             ncclProfilerRecordProxyOpEventState(s+i, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpRecvTransmitted);
-            ncclProfilerRecordProxyStepEventStates(s+i, args, sub->transmitted-args->sliceSteps, sub->transmitted, ncclProfilerProxyStepRecvGPUWait);
+            ncclProfilerRecordProxyStepEventState(s+i, args, transmittedStepId, ncclProfilerProxyStepRecvGPUWait);
             if (step < sub->nsteps) {
               __sync_synchronize();
               struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
               volatile uint64_t* recvTail = resources->gdcSync ? resources->gdcSync : &resources->recvMem->tail;
-              if (sub->reg) {
-                // We may have added more net steps, but reg operations only have a single step w.r.t. the GPU.
-                if (sub->transmitted == sub->nsteps) *recvTail = sub->base + args->sliceSteps;
-              } else
-                *recvTail = sub->base + sub->transmitted;
+              *recvTail = sub->base + sub->transmitted;
               if (resources->gdcSync) wc_store_fence(); // Flush out WC write
             }
           }
@@ -1425,11 +1457,12 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
       struct ncclProxySubArgs* subGroup = args->subs+s;
       for (int i=0; i<subGroup->groupSize; i++) {
         struct ncclProxySubArgs* sub = subGroup + i;
+        int doneStepId = sub->done;
         if (sub->done == sub->nsteps) continue;
         if (sub->transmitted > sub->done) {
           struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
           volatile uint64_t* sendHead = &resources->sendMem->head;
-          uint64_t done = sub->reg ? sub->base + sub->nsteps : *sendHead;
+          uint64_t done = *sendHead;
           while (done > sub->base + sub->done &&
               // LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted.
               sub->transmitted > sub->done) {
@@ -1440,15 +1473,13 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
               subGroup->recvRequestsCache[sub->done%NCCL_STEPS] = NULL;
             }
             sub->done += args->sliceSteps;
-            ncclProfilerStopProxyStepEvents(s+i, args, sub->done-args->sliceSteps, sub->done);
+            ncclProfilerStopProxyStepEvent(s+i, args, doneStepId);
             ncclProfilerRecordProxyOpEventState(s+i, args, sub->done, sub->transSize, ncclProfilerProxyOpRecvDone);
             args->idle = 0;
             if (sub->done == sub->nsteps) {
-              struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
-              if (sub->reg && sub->nbytes > 0) {
-                NCCLCHECK(proxyState->ncclNet->deregMr(resources->netRecvComm, sub->mhandle));
-              }
               args->done++;
+              if (sub->ringAlgo && sub->ringAlgo->decRefCount() == 0) delete sub->ringAlgo;
+              sub->ringAlgo = NULL;
               break;
             }
           }
@@ -1465,9 +1496,228 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
   return ncclSuccess;
 }
 
+ncclResult_t ncclNetDeregBuffer(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* handle) {  // Deregister a network buffer handle through the proxy connection proxyConn.
+  NCCLCHECK(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgDeregister, &handle, sizeof(void*), NULL, 0));  // Request payload is the handle value itself (sizeof(void*) bytes); no response buffer is passed.
+  INFO(NCCL_REG, "rank %d - deregistered net buffer handle %p", comm->rank, handle);
+  return ncclSuccess;
+}
+
+// Registers the region described by regRecord with the network proxy of every non-NULL peer connector.
+// Per peer: a handle already cached on regRecord->netHandleHead is reused; otherwise, if the connection has
+// NCCL_DIRECT_NIC set, the proxy is asked to register the region and the new handle is cached. If any peer
+// cannot be registered, *outRegBufFlag is reset to 0; ret is an error only if a proxy call or allocation failed.
+static ncclResult_t netRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, struct ncclConnector** peerConns, int nPeers, struct ncclReg* regRecord, int* outRegBufFlag, void** outHandle) {
+  ncclResult_t ret = ncclSuccess;
+  int gdrFlag = 1;
+
+  if (regRecord) {
+    for (int p = 0; p < nPeers; ++p) {
+      struct ncclConnector* peerConn = peerConns[p];
+      struct ncclProxyConnector* peerProxyConn = NULL;
+      struct ncclRegNetHandles* netHandle = NULL;
+      if (peerConn == NULL) continue;
+      peerProxyConn = &peerConn->proxyConn;
+      // Look for a handle already registered for this proxy connection; netHandle ends up NULL if none matches.
+      for (netHandle = regRecord->netHandleHead; netHandle; netHandle = netHandle->next) {
+        if (netHandle->proxyConn == peerProxyConn) break;
+      }
+      if (netHandle) {
+        *outRegBufFlag = 1;
+        outHandle[p] = netHandle->handle;
+        INFO(NCCL_REG, "rank %d - NET reuse buffer %p size %ld (baseAddr %p size %ld) handle %p", comm->rank, userbuff, buffSize, (void*)regRecord->addr, regRecord->pages * comm->regCache.pageSize, netHandle->handle);
+      } else {
+        struct netRegInfo info = { regRecord->addr, regRecord->pages * comm->regCache.pageSize };
+        void* handle = NULL;
+
+        if (peerConn->conn.flags & NCCL_DIRECT_NIC) {
+          NCCLCHECKGOTO(ncclProxyCallBlocking(comm, peerProxyConn, ncclProxyMsgRegister, &info, sizeof(struct netRegInfo), &handle, sizeof(void*)), ret, fail);
+          if (handle) {
+            struct ncclRegNetHandles* newHandle;
+            regRecord->state |= NET_REG_COMPLETE;
+            // Route allocation failure through fail so *outRegBufFlag is cleared instead of returning with it possibly set.
+            NCCLCHECKGOTO(ncclCalloc(&newHandle, 1), ret, fail);
+            newHandle->handle = handle;
+            newHandle->proxyConn = peerProxyConn;
+            newHandle->next = regRecord->netHandleHead;
+            regRecord->netHandleHead = newHandle;
+            outHandle[p] = handle;
+            *outRegBufFlag = 1;
+            INFO(NCCL_REG, "rank %d - NET register userbuff %p (handle %p), buffSize %ld", comm->rank, userbuff, handle, buffSize);
+          } else {
+            goto fail;
+          }
+        } else {
+          gdrFlag = 0;
+          goto fail;
+        }
+      }
+    }
+  }
+
+exit:
+  return ret;
+fail:
+  *outRegBufFlag = 0;
+  WARN("rank %d failed to NET register userbuff %p buffSize %ld GDR flag %d", comm->rank, userbuff, buffSize, gdrFlag);
+  goto exit;
+}
+
+// Looks up the registration record covering userbuff (ncclRegFind), checks it with ncclRegLocalIsValid and,
+// if valid, registers it with the network for each peer. *outRegBufFlag is 0 on entry and on any error.
+ncclResult_t ncclNetLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, struct ncclConnector** peerConns, int nPeers, int* outRegBufFlag, void** outHandle) {
+  ncclResult_t ret = ncclSuccess;
+  struct ncclReg* foundReg = NULL;
+  bool regIsValid = false;
+
+  *outRegBufFlag = 0;
+  // Nothing to do without a communicator, a non-empty buffer and at least one peer.
+  if (!comm || !userbuff || buffSize == 0 || nPeers <= 0) return ret;
+  NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, &foundReg), ret, fail);
+  NCCLCHECKGOTO(ncclRegLocalIsValid(foundReg, &regIsValid), ret, fail);
+  if (regIsValid) NCCLCHECKGOTO(netRegisterBuffer(comm, userbuff, buffSize, peerConns, nPeers, foundReg, outRegBufFlag, outHandle), ret, fail);
+  return ret;
+
+fail:
+  *outRegBufFlag = 0;
+  return ret;
+}
+
+struct ncclNetCleanupCallback {  // Cleanup record enqueued by ncclNetGraphRegisterBuffer and consumed by cleanupNet.
+  struct ncclCommCallback base;  // Kept as first member: the record is cast to and from struct ncclCommCallback*.
+  struct ncclComm *comm;         // Communicator handed to ncclCommGraphDeregister.
+  struct ncclReg *reg;           // Registration record handed to ncclCommGraphDeregister.
+};
+
+static ncclResult_t cleanupNet(struct ncclComm* comm, struct ncclCommCallback* cb) {  // Cleanup callback: deregisters obj->reg from obj->comm, then frees the record.
+  struct ncclNetCleanupCallback* obj = (struct ncclNetCleanupCallback*)cb;
+  ncclResult_t ret = ncclCommGraphDeregister(obj->comm, obj->reg);
+  free(obj);  // Free the record regardless of the result so a deregistration error does not leak it.
+  return ret;  // Hand the deregistration result back to the caller.
+}
+
+// Graph-registers the allocation containing userbuff, registers it with the network for each peer and, on success, queues a cleanupNet record.
+ncclResult_t ncclNetGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, struct ncclConnector** peerConns, int nPeers, int* outRegBufFlag, void** outHandle, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, int* nCleanupQueueElts) {
+  ncclResult_t ret = ncclSuccess;
+  struct ncclNetCleanupCallback* cleanupCb = NULL;
+  struct ncclReg* graphReg = NULL;
+  void* allocBase;
+  size_t allocSize;
+
+  *outRegBufFlag = 0;
+  if (!comm || !userbuff || buffSize == 0 || nPeers <= 0) return ret;
+  CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&allocBase, &allocSize, (CUdeviceptr)userbuff), ret, fail);
+  NCCLCHECKGOTO(ncclCommGraphRegister(comm, allocBase, allocSize, (void**)&graphReg), ret, fail);
+  NCCLCHECKGOTO(netRegisterBuffer(comm, userbuff, buffSize, peerConns, nPeers, graphReg, outRegBufFlag, outHandle), ret, fail);
+  if (*outRegBufFlag == 0) {
+    // Network registration did not take effect: drop the graph registration taken above.
+    NCCLCHECKGOTO(ncclCommGraphDeregister(comm, graphReg), ret, fail);
+    return ret;
+  }
+  NCCLCHECKGOTO(ncclCalloc(&cleanupCb, 1), ret, fail);
+  cleanupCb->base.fn = cleanupNet;
+  cleanupCb->comm = comm;
+  cleanupCb->reg = graphReg;
+  ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)cleanupCb);
+  if (nCleanupQueueElts != NULL) *nCleanupQueueElts += 1;
+  return ret;
+fail:
+  *outRegBufFlag = 0;
+  return ret;
+}
+
+// Proxy handler: registers info->buffer with the send comm (DMA-BUF first when enabled, else regMr). Always returns ncclSuccess; a NULL handle in respBuff means registration failed.
+static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+  void* handle = NULL;
+  struct netRegInfo* info = (struct netRegInfo*)reqBuff;
+  struct sendNetResources* resources = (struct sendNetResources*)(connection->transportResources);
+  ncclResult_t ret = ncclSuccess;
+  bool needReg = true;
+
+  assert(reqSize == sizeof(struct netRegInfo));
+  assert(respSize == sizeof(void*));
+
+#if CUDART_VERSION >= 11070
+  /* DMA-BUF support */
+  if (resources->useDmaBuf) {
+    int dmabuf_fd;
+    CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem);
+    NCCLCHECKGOTO(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, closefd);
+    needReg = false;
+closefd:
+    (void)close(dmabuf_fd);  // Close the fd on success and on regMrDmaBuf failure alike, so it is not leaked before falling back to regMr.
+  }
+peermem:
+#endif
+  if (needReg) NCCLCHECKGOTO(proxyState->ncclNet->regMr(resources->netSendComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail);
+
+exit:
+  memcpy(respBuff, (void*)&handle, sizeof(void*));
+  *done = 1;
+  return ncclSuccess;
+fail:
+  handle = NULL;
+  goto exit;
+}
+
+// Proxy handler: registers info->buffer with the recv comm (DMA-BUF first when enabled, else regMr). Always returns ncclSuccess; a NULL handle in respBuff means registration failed.
+static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+  void* handle = NULL;
+  struct netRegInfo* info = (struct netRegInfo*)reqBuff;
+  struct recvNetResources* resources = (struct recvNetResources*)(connection->transportResources);
+  ncclResult_t ret = ncclSuccess;
+  bool needReg = true;
+
+  assert(reqSize == sizeof(struct netRegInfo));
+  assert(respSize == sizeof(void*));
+
+#if CUDART_VERSION >= 11070
+  /* DMA-BUF support */
+  if (resources->useDmaBuf) {
+    int dmabuf_fd;
+    CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem);
+    NCCLCHECKGOTO(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, closefd);
+    needReg = false;
+closefd:
+    (void)close(dmabuf_fd);  // Close the fd on success and on regMrDmaBuf failure alike, so it is not leaked before falling back to regMr.
+  }
+peermem:
+#endif
+  if (needReg) NCCLCHECKGOTO(proxyState->ncclNet->regMr(resources->netRecvComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail);
+
+exit:
+  memcpy(respBuff, (void*)&handle, sizeof(void*));
+  *done = 1;
+  return ncclSuccess;
+fail:
+  handle = NULL;
+  goto exit;
+}
+
+// Proxy handler for a deregistration request: reqBuff carries the memory handle to release on the send comm.
+static ncclResult_t sendProxyDeregBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done) {
+  struct sendNetResources* sendRes = (struct sendNetResources*)(connection->transportResources);
+  void* mrHandle;
+  assert(reqSize == sizeof(void*));
+  memcpy(&mrHandle, reqBuff, sizeof(void*));  // The request payload is the handle pointer value itself.
+  NCCLCHECK(proxyState->ncclNet->deregMr(sendRes->netSendComm, mrHandle));
+  *done = 1;  // Set once the deregMr call has passed NCCLCHECK.
+  return ncclSuccess;
+}
+
+// Proxy handler for a deregistration request: reqBuff carries the memory handle to release on the recv comm.
+static ncclResult_t recvProxyDeregBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done) {
+  struct recvNetResources* recvRes = (struct recvNetResources*)(connection->transportResources);
+  void* mrHandle;
+  assert(reqSize == sizeof(void*));
+  memcpy(&mrHandle, reqBuff, sizeof(void*));  // The request payload is the handle pointer value itself.
+  NCCLCHECK(proxyState->ncclNet->deregMr(recvRes->netRecvComm, mrHandle));
+  *done = 1;  // Set once the deregMr call has passed NCCLCHECK.
+  return ncclSuccess;
+}
+
 struct ncclTransport netTransport = {
   "NET",
   canConnect,
-  { sendSetup, sendConnect, sendFree, proxySharedInit, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, NULL },
-  { recvSetup, recvConnect, recvFree, proxySharedInit, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, NULL }
+  { sendSetup, sendConnect, sendFree, proxySharedInit, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, sendProxyRegBuffer, sendProxyDeregBuffer },
+  { recvSetup, recvConnect, recvFree, proxySharedInit, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, recvProxyRegBuffer, recvProxyDeregBuffer }
 };
diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc
index d828c9801b..bc54133d39 100644
--- a/src/transport/net_ib.cc
+++ b/src/transport/net_ib.cc
@@ -42,14 +42,12 @@ struct ncclIbMrCache {
 };
 
 static int ncclNMergedIbDevs = -1;
-#define NCCL_IB_MAX_DEVS_PER_NIC 2
+#define NCCL_IB_MAX_DEVS_PER_NIC 4
 #define MAX_MERGED_DEV_NAME (MAXNAMESIZE*NCCL_IB_MAX_DEVS_PER_NIC)+NCCL_IB_MAX_DEVS_PER_NIC
 struct alignas(64) ncclIbMergedDev {
-  int ndevs;
-  int devs[NCCL_IB_MAX_DEVS_PER_NIC]; // Points to an index in ncclIbDevs
+  ncclNetVDeviceProps_t vProps;
   int speed;
   char devName[MAX_MERGED_DEV_NAME]; // Up to NCCL_IB_MAX_DEVS_PER_NIC * name size, and a character for each '+'
-  int dmaBufSupported;               //  0 = uninit, 1 = yes, -1 = no
 };
 
 struct ncclIbStats {
@@ -69,16 +67,20 @@ struct alignas(64) ncclIbDev {
   ibv_pd* pd;
   char devName[MAXNAMESIZE];
   char* pciPath;
+  char* virtualPciPath;
   int realPort;
   int maxQp;
+  float latency;
   struct ncclIbMrCache mrCache;
   int ar; // ADAPTIVE_ROUTING
   struct ibv_port_attr portAttr;
   struct ncclIbStats stats;
+  int dmaBufSupported;
 };
 
-#define MAX_IB_DEVS 32
-struct ncclIbMergedDev ncclIbMergedDevs[MAX_IB_DEVS];
+#define MAX_IB_DEVS  32               // Capacity of ncclIbDevs (physical devices)
+#define MAX_IB_VDEVS (MAX_IB_DEVS*8)  // Capacity of ncclIbMergedDevs; parenthesized so it expands safely inside larger expressions
+struct ncclIbMergedDev ncclIbMergedDevs[MAX_IB_VDEVS];
 struct ncclIbDev ncclIbDevs[MAX_IB_DEVS];
 pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER;
 static int ncclIbRelaxedOrderingEnabled = 0;
@@ -95,7 +97,7 @@ NCCL_PARAM(IbTc, "IB_TC", 0);
 NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192);
 NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2);
 NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2);
-NCCL_PARAM(IbFifoTc, "IB_FIFO_TC", 0);
+NCCL_PARAM(IbFifoTc, "IB_FIFO_TC", -1);
 NCCL_PARAM(IbAsyncEvents,"IB_RETURN_ASYNC_EVENTS",1);
 NCCL_PARAM(IbEceEnable,"IB_ECE_ENABLE",1);
 
@@ -223,17 +225,17 @@ static void* envIbAddrRange(sa_family_t af, int* mask) {
   *(maskStrPtr++) = '\0';
 
   if (inet_pton(af, addrStrPtr, ret) == 0) {
-    WARN("NET/IB: Ip address '%s' is invalid for family %s, ignoring address", addrStrPtr, (af == AF_INET) ? "AF_INET" : "AF_INET6");
+    INFO(NCCL_INIT|NCCL_NET, "NET/IB: Ip address '%s' is invalid for family %s, ignoring address", addrStrPtr, (af == AF_INET) ? "AF_INET" : "AF_INET6");
     return NULL;
   }
 
   *mask = (int)strtol(maskStrPtr, NULL, 10);
   if (af == AF_INET && *mask > 32) {
-    WARN("NET/IB: Ip address mask '%d' is invalid for family %s, ignoring mask", *mask, (af == AF_INET) ? "AF_INET" : "AF_INET6");
+    INFO(NCCL_INIT|NCCL_NET, "NET/IB: Ip address mask '%d' is invalid for family %s, ignoring mask", *mask, (af == AF_INET) ? "AF_INET" : "AF_INET6");
     *mask = 0;
     ret = NULL;
   } else if (af == AF_INET6 && *mask > 128) {
-    WARN("NET/IB: Ip address mask '%d' is invalid for family %s, ignoring mask", *mask, (af == AF_INET) ? "AF_INET" : "AF_INET6");
+    INFO(NCCL_INIT|NCCL_NET, "NET/IB: Ip address mask '%d' is invalid for family %s, ignoring mask", *mask, (af == AF_INET) ? "AF_INET" : "AF_INET6");
     *mask = 0;
     ret = NULL;
   }
@@ -314,7 +316,7 @@ static bool validGid(union ibv_gid* gid) {
 static ncclResult_t ncclIbRoceGetVersionNum(const char* deviceName, int portNum, int gidIndex, int* version) {
   char gidRoceVerStr[16] = { 0 };
   char roceTypePath[PATH_MAX] = { 0 };
-  sprintf(roceTypePath, "/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%d", deviceName, portNum, gidIndex);
+  snprintf(roceTypePath, sizeof(roceTypePath), "/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%d", deviceName, portNum, gidIndex);
 
   int fd = open(roceTypePath, O_RDONLY);
   if (fd == -1) {
@@ -423,6 +425,16 @@ NCCL_PARAM(IbDisable, "IB_DISABLE", 0);
 NCCL_PARAM(IbMergeVfs, "IB_MERGE_VFS", 1);
 NCCL_PARAM(IbMergeNics, "IB_MERGE_NICS", 1);
 
+// Returns non-zero (true) when path1 and path2 are equal once the trailing characters of path1
+// that distinguish functions of one physical device are ignored, and 0 otherwise.
+// The comparison length is derived from path1 only.
+static int ncclIbMatchVfPath(char* path1, char* path2) {
+  // With NCCL_IB_MERGE_VFS set, the last 4 characters are ignored so that virtual functions (VFs)
+  // also match; otherwise only the last character is ignored, which merges multi-port NICs.
+  size_t ignored = ncclParamIbMergeVfs() ? 4 : 1;
+  return strncmp(path1, path2, strlen(path1)-ignored) == 0;
+}
+
 static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort) {
   char devicePath[PATH_MAX];
   snprintf(devicePath, PATH_MAX, "/sys/class/infiniband/%s/device", devName);
@@ -430,14 +442,10 @@ static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort)
   if (p == NULL) {
     WARN("Could not find real path of %s (%s)", devName, devicePath);
   } else {
-    // Merge multi-port NICs into the same PCI device
-    p[strlen(p)-1] = '0';
-    // Also merge virtual functions (VF) into the same device
-    if (ncclParamIbMergeVfs()) p[strlen(p)-3] = p[strlen(p)-4] = '0';
-    // And keep the real port aside (the ibv port is always 1 on recent cards)
+    // Keep the real port aside (the ibv port is always 1 on recent cards)
     *realPort = 0;
     for (int d=0; d<ncclNIbDevs; d++) {
-      if (strcmp(p, ncclIbDevs[d].pciPath) == 0) (*realPort)++;
+      if (ncclIbMatchVfPath(p, ncclIbDevs[d].pciPath)) (*realPort)++;
     }
   }
   *path = p;
@@ -478,23 +486,66 @@ static int ncclIbRelaxedOrderingCapable(void) {
   return r == ncclInternalError ? 0 : 1;
 }
 
-// Compare ncclIbDev[dev] to all stored mergedIbDevs
-int ncclIbFindMatchingDev(int dev) {
-  for (int i = 0; i < ncclNMergedIbDevs; i++) {
-    if (ncclIbMergedDevs[i].ndevs < NCCL_IB_MAX_DEVS_PER_NIC) {
-      int compareDev = ncclIbMergedDevs[i].devs[0];
-      if (strcmp(ncclIbDevs[dev].pciPath, ncclIbDevs[compareDev].pciPath) == 0 &&
-          (ncclIbDevs[dev].guid == ncclIbDevs[compareDev].guid) &&
-          (ncclIbDevs[dev].link == ncclIbDevs[compareDev].link)) {
-          TRACE(NCCL_NET, "NET/IB: Matched name1=%s pciPath1=%s guid1=0x%lx link1=%u name2=%s pciPath2=%s guid2=0x%lx link2=%u",
-            ncclIbDevs[dev].devName, ncclIbDevs[dev].pciPath, ncclIbDevs[dev].guid, ncclIbDevs[dev].link,
-            ncclIbDevs[compareDev].devName, ncclIbDevs[compareDev].pciPath, ncclIbDevs[compareDev].guid, ncclIbDevs[compareDev].link);
-          return i;
-      }
+// Builds a virtual device from the physical devices in props and stores its index in *d; does not take ncclIbLock itself (ncclIbMakeVDevice does).
+ncclResult_t ncclIbMakeVDeviceInternal(int* d, ncclNetVDeviceProps_t* props) {
+  if (ncclParamIbMergeNics() == 0 && props->ndevs > 1) {
+    WARN("NET/IB : Trying to merge multiple devices together when NCCL_IB_MERGE_NICS=0. Please enable it or disable device merging in NCCL.");
+    return ncclInvalidUsage;
+  }
+
+  if (props->ndevs == 0) {
+      WARN("NET/IB : Can't make virtual NIC with 0 devices");
+      return ncclInvalidUsage;
+  }
+
+  if (ncclNMergedIbDevs == MAX_IB_VDEVS) {
+    WARN("NET/IB : Cannot allocate any more virtual devices (%d)", MAX_IB_VDEVS);
+    return ncclInvalidUsage;
+  }
+  if (props->ndevs > NCCL_IB_MAX_DEVS_PER_NIC) {
+    WARN("NET/IB : Cannot merge %d devices into one virtual device, max %d", props->ndevs, NCCL_IB_MAX_DEVS_PER_NIC);
+    return ncclInvalidUsage;
+  }
+
+  // Validate indexes and link layers first, so an out-of-range devs[i] is rejected before ncclIbDevs is indexed with it.
+  // devs[0] is not range-checked: ncclIbInit passes the device it is adding, whose index still equals ncclNIbDevs.
+  ncclIbDev* dev0 = ncclIbDevs + props->devs[0];
+  for (int i = 1; i < props->ndevs; i++) {
+    if (props->devs[i] >= ncclNIbDevs) {
+      WARN("NET/IB : Cannot use physical device %d, max %d", props->devs[i], ncclNIbDevs);
+      return ncclInvalidUsage;
+    }
+    ncclIbDev* dev = ncclIbDevs + props->devs[i];
+    if (dev->link != dev0->link) {
+      WARN("NET/IB : Trying to merge multiple devices together with different link_layer properties %s -> %d, %s -> %d. Try only selecting NICs with one type of link using NCCL_IB_HCA",
+        dev0->devName, dev0->link, dev->devName, dev->link);
+      return ncclInvalidUsage;
+    }
+  }
+  ncclIbMergedDev* mDev = ncclIbMergedDevs + ncclNMergedIbDevs; // Next free slot; counted only once ncclNMergedIbDevs is incremented below
+  mDev->vProps.ndevs = 0;
+  mDev->speed = 0;
+  for (int i = 0; i < props->ndevs; i++) {
+    ncclIbDev* dev = ncclIbDevs + props->devs[i];
+    mDev->vProps.devs[mDev->vProps.ndevs++] = props->devs[i];
+    mDev->speed += dev->speed;
+    if (mDev->vProps.ndevs > 1) { // Each successive time, append '+' and the new name
+      snprintf(mDev->devName + strlen(mDev->devName), sizeof(mDev->devName) - strlen(mDev->devName), "+%s", dev->devName);
+    } else { // First time, copy the plain name
+      strncpy(mDev->devName, dev->devName, MAXNAMESIZE);
     }
   }
 
-  return ncclNMergedIbDevs;
+  *d = ncclNMergedIbDevs++;
+  INFO(NCCL_NET, "NET/IB : Made virtual device [%d] name=%s speed=%d ndevs=%d", *d, mDev->devName, mDev->speed, mDev->vProps.ndevs);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclIbMakeVDevice(int* d, ncclNetVDeviceProps_t* props) {
+  pthread_mutex_lock(&ncclIbLock);
+  ncclResult_t res = ncclIbMakeVDeviceInternal(d, props);
+  pthread_mutex_unlock(&ncclIbLock);
+  return res;
 }
 
 ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
@@ -531,10 +582,6 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
 
       if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) { ret = ncclInternalError; goto fail; }
 
-      // Should NCCL merge multi-port devices into one?
-      int mergeNics;
-      mergeNics = ncclParamIbMergeNics();
-build_ib_list:
       for (int d=0; d<nIbDevs && ncclNIbDevs<MAX_IB_DEVS; d++) {
         struct ibv_context * context;
         if (ncclSuccess != wrap_ibv_open_device(&context, devices[d]) || context == NULL) {
@@ -593,82 +640,38 @@ build_ib_list:
           ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs);
           PTHREADCHECKGOTO(pthread_detach(ncclIbAsyncThread), "pthread_detach", ret, fail); // will not be pthread_join()'d
 
-          int mergedDev = ncclNMergedIbDevs;
-          if (mergeNics) {
-            mergedDev = ncclIbFindMatchingDev(ncclNIbDevs);
-          }
+          // Add this plain physical device to the list of virtual devices
+          int vDev;
+          ncclNetVDeviceProps_t vProps = {0};
+          vProps.ndevs = 1;
+          vProps.devs[0] = ncclNIbDevs;
+          NCCLCHECK(ncclIbMakeVDeviceInternal(&vDev, &vProps));
 
-          // No matching dev found, create new mergedDev entry (it's okay if there's only one dev inside)
-          if (mergedDev == ncclNMergedIbDevs) {
-            // Set ndevs to 1, assign first ibDevN to the current IB device
-            ncclIbMergedDevs[mergedDev].ndevs = 1;
-            ncclIbMergedDevs[mergedDev].devs[0] = ncclNIbDevs;
-            ncclNMergedIbDevs++;
-            strncpy(ncclIbMergedDevs[mergedDev].devName, ncclIbDevs[ncclNIbDevs].devName, MAXNAMESIZE);
-          // Matching dev found, edit name
-          } else {
-            // Set next device in this array to the current IB device
-            int ndevs = ncclIbMergedDevs[mergedDev].ndevs;
-            ncclIbMergedDevs[mergedDev].devs[ndevs] = ncclNIbDevs;
-            ncclIbMergedDevs[mergedDev].ndevs++;
-            snprintf(ncclIbMergedDevs[mergedDev].devName + strlen(ncclIbMergedDevs[mergedDev].devName), MAXNAMESIZE+1, "+%s", ncclIbDevs[ncclNIbDevs].devName);
-          }
-
-          // Aggregate speed
-          ncclIbMergedDevs[mergedDev].speed += ncclIbDevs[ncclNIbDevs].speed;
           ncclNIbDevs++;
           nPorts++;
         }
         if (nPorts == 0 && ncclSuccess != wrap_ibv_close_device(context)) { ret = ncclInternalError; goto fail; }
       }
 
-      // Detect if there are both multi-port and single-port NICs in the system. If so, disable port merging and build the list again
-      if (mergeNics) {
-        for (int d = 0; d < ncclNMergedIbDevs; d++) {
-          if (ncclIbMergedDevs[d].ndevs != ncclIbMergedDevs[0].ndevs) {
-            INFO(NCCL_NET, "Detected a mix of single and multiple-port NICs. Force-disabling NCCL_IB_MERGE_NICS");
-            mergeNics = 0;
-            ncclNIbDevs = 0;
-            ncclNMergedIbDevs = 0;
-            memset(ncclIbMergedDevs, 0, sizeof(ncclIbMergedDevs));
-            goto build_ib_list;
-          }
-        }
-      }
-
       if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { ret = ncclInternalError; goto fail; };
     }
     if (ncclNIbDevs == 0) {
       INFO(NCCL_INIT|NCCL_NET, "NET/IB : No device found.");
-    } else {
-      char line[2048];
-      line[0] = '\0';
-      // Determine whether RELAXED_ORDERING is enabled and possible
-      ncclIbRelaxedOrderingEnabled = ncclIbRelaxedOrderingCapable();
-      for (int d = 0; d < ncclNMergedIbDevs; d++) {
-        struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs + d;
-        if (mergedDev->ndevs > 1) {
-          // Print out merged dev info
-          snprintf(line+strlen(line), 2047-strlen(line), " [%d]={", d);
-          for (int i = 0; i < mergedDev->ndevs; i++) {
-            int ibDev = mergedDev->devs[i];
-            snprintf(line+strlen(line), 2047-strlen(line), "[%d] %s:%d/%s%s", ibDev, ncclIbDevs[ibDev].devName,
-              ncclIbDevs[ibDev].portNum, ncclIbDevs[ibDev].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE",
-              // Insert comma to delineate
-              i == (mergedDev->ndevs - 1) ? "" : ", ");
-          }
-          snprintf(line+strlen(line), 2047-strlen(line), "}");
-        } else {
-          int ibDev = mergedDev->devs[0];
-          snprintf(line+strlen(line), 2047-strlen(line), " [%d]%s:%d/%s", ibDev, ncclIbDevs[ibDev].devName,
-            ncclIbDevs[ibDev].portNum, ncclIbDevs[ibDev].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
-        }
-      }
-      line[2047] = '\0';
-      char addrline[SOCKET_NAME_MAXLEN+1];
-      INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? "[RO]" : "",
-           ncclIbIfName, ncclSocketToString(&ncclIbIfAddr, addrline));
     }
+
+    // Print out all net devices to the user (in the same format as before)
+    char line[2048];
+    line[0] = '\0';
+    // Determine whether RELAXED_ORDERING is enabled and possible
+    ncclIbRelaxedOrderingEnabled = ncclIbRelaxedOrderingCapable();
+    for (int d = 0; d < ncclNIbDevs; d++) {
+        snprintf(line+strlen(line), sizeof(line)-strlen(line), " [%d]%s:%d/%s", d, ncclIbDevs[d].devName,
+          ncclIbDevs[d].portNum, ncclIbDevs[d].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
+    }
+    char addrline[SOCKET_NAME_MAXLEN+1];
+    INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? "[RO]" : "",
+          ncclIbIfName, ncclSocketToString(&ncclIbIfAddr, addrline));
+
     pthread_mutex_unlock(&ncclIbLock);
   }
 exit:
@@ -706,27 +709,25 @@ ncclResult_t ncclIbGdrSupport() {
 static __thread int ibDmaSupportInitDev; // which device to init, must be thread local
 static void ibDmaBufSupportInitOnce(){
   ncclResult_t res;
-  // select the appropriate
-  struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs + ibDmaSupportInitDev;
-  // Test each real devices
   int dev_fail = 0;
-  for (int i = 0; i < mergedDev->ndevs; i++) {
-    int ibDev = mergedDev->devs[i];
-    struct ibv_pd* pd;
-    struct ibv_context* ctx = ncclIbDevs[ibDev].context;
-    NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure);
-    // Test kernel DMA-BUF support with a dummy call (fd=-1)
-    (void)wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL /*offset*/, 0ULL /*len*/, 0ULL /*iova*/, -1 /*fd*/, 0 /*flags*/);
-    // ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise)
-    dev_fail |= (errno == EOPNOTSUPP) || (errno == EPROTONOSUPPORT);
-    NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure);
-    // stop the search and goto failure
-    if (dev_fail) goto failure;
-  }
-  mergedDev->dmaBufSupported = 1;
+
+  // DMA-BUF support is recorded per physical device: resolve the virtual device to its first sub-device in ncclIbDevs
+  ncclIbMergedDev* mergedDev = ncclIbMergedDevs + ibDmaSupportInitDev;
+  ncclIbDev* ibDev = ncclIbDevs + mergedDev->vProps.devs[0];
+  struct ibv_pd* pd;
+  struct ibv_context* ctx = ibDev->context;
+  NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure);
+  // Test kernel DMA-BUF support with a dummy call (fd=-1)
+  (void)wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL /*offset*/, 0ULL /*len*/, 0ULL /*iova*/, -1 /*fd*/, 0 /*flags*/);
+  // ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise)
+  dev_fail |= (errno == EOPNOTSUPP) || (errno == EPROTONOSUPPORT);
+  NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure);
+  // stop the search and goto failure
+  if (dev_fail) goto failure;
+  ibDev->dmaBufSupported = 1;
   return;
 failure:
-  mergedDev->dmaBufSupported = -1;
+  ibDev->dmaBufSupported = -1;
   return;
 }
 // Detect whether DMA-BUF support is present in the kernel
@@ -741,21 +742,20 @@ ncclResult_t ncclIbDmaBufSupport(int dev) {
   // init the device only once
   ibDmaSupportInitDev = dev;
   pthread_once(&onces[dev].once, ibDmaBufSupportInitOnce);
-
-  int dmaBufSupported = ncclIbMergedDevs[dev].dmaBufSupported;
+  ncclIbMergedDev* mergedDev = ncclIbMergedDevs + ibDmaSupportInitDev;
+  ncclIbDev* ibDev = ncclIbDevs + mergedDev->vProps.devs[0];
+  int dmaBufSupported = ibDev->dmaBufSupported;
   if (dmaBufSupported == 1) return ncclSuccess;
   return ncclSystemError;
 }
 
 #define NCCL_NET_IB_MAX_RECVS 8
 
-ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
-  struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs+dev;
-  props->name = mergedDev->devName;
-  props->speed = mergedDev->speed;
-
-  // Take the rest of the properties from an arbitrary sub-device (should be the same)
-  struct ncclIbDev* ibDev = ncclIbDevs + mergedDev->devs[0];
+ncclResult_t ncclIbGetPhysProperties(int dev, ncclNetProperties_t* props) {
+  struct ncclIbDev* ibDev = ncclIbDevs + dev;
+  pthread_mutex_lock(&ibDev->lock);
+  props->name = ibDev->devName;
+  props->speed = ibDev->speed;
   props->pciPath = ibDev->pciPath;
   props->guid = ibDev->guid;
   props->ptrSupport = NCCL_PTR_HOST;
@@ -766,12 +766,29 @@ ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
   if (ncclIbDmaBufSupport(dev) == ncclSuccess) {
     props->ptrSupport |= NCCL_PTR_DMABUF; // GDR support via DMA-BUF
   }
+  props->forceFlush = 0;
   props->latency = 0; // Not set
   props->port = ibDev->portNum + ibDev->realPort;
   props->maxComms = ibDev->maxQp;
   props->maxRecvs = NCCL_NET_IB_MAX_RECVS;
   props->netDeviceType    = NCCL_NET_DEVICE_HOST;
   props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES;
+  pthread_mutex_unlock(&ibDev->lock);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
+  if (dev >= ncclNMergedIbDevs) {
+    WARN("NET/IB : Requested properties for vNic %d, only %d vNics have been created", dev, ncclNMergedIbDevs);
+    return ncclInvalidUsage;
+  }
+  struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs + dev;
+  // Take the rest of the properties from an arbitrary sub-device (should be the same)
+  NCCLCHECK(ncclIbGetPhysProperties(mergedDev->vProps.devs[0], props));
+  props->name = mergedDev->devName;
+  props->speed = mergedDev->speed;
+  memcpy(&props->vProps, &mergedDev->vProps, sizeof(ncclNetVDeviceProps_t));
   return ncclSuccess;
 }
 
@@ -826,6 +843,8 @@ enum ncclIbCommState {
   ncclIbCommStateConnecting = 6,
   ncclIbCommStateConnected = 7,
   ncclIbCommStatePendingReady = 8,
+  ncclIbCommStateSendDevList = 9,
+  ncclIbCommStateRecvDevList = 10,
 };
 
 struct ncclIbCommStage {
@@ -890,12 +909,12 @@ struct ncclIbListenComm {
 
 struct ncclIbSendFifo {
   uint64_t addr;
-  int      size;
+  uint64_t size;
   uint32_t rkeys[NCCL_IB_MAX_DEVS_PER_NIC];
   uint32_t nreqs;
   uint32_t tag;
   uint64_t idx;
-  char padding[24];
+  char padding[16];
 };
 
 struct ncclIbQp {
@@ -927,7 +946,7 @@ struct ncclIbMrHandle {
 };
 
 struct alignas(32) ncclIbNetCommBase {
-  int ndevs;
+  ncclNetVDeviceProps_t vProps;
   bool isSend;
   struct ncclIbRequest reqs[MAX_REQUESTS];
   struct ncclIbQp qps[NCCL_IB_MAX_QPS];
@@ -938,6 +957,7 @@ struct alignas(32) ncclIbNetCommBase {
   int ready;
   // Track necessary remDevInfo here
   int nRemDevs;
+  int nDataQps;
   struct ncclIbDevInfo remDevs[NCCL_IB_MAX_DEVS_PER_NIC];
   // statistics about the comm
   struct ncclIbStats stats;
@@ -981,7 +1001,6 @@ struct ncclIbRemFifo {
 struct alignas(16) ncclIbRecvCommDev {
   struct ncclIbNetCommDevBase base;
   struct ncclIbGpuFlush gpuFlush;
-  uint32_t fifoRkey;
   struct ibv_mr* fifoMr;
   struct ibv_sge fifoSge;
   struct ibv_mr* sizesFifoMr;
@@ -989,7 +1008,7 @@ struct alignas(16) ncclIbRecvCommDev {
 
 struct ncclIbRecvComm {
   struct ncclIbNetCommBase base;
-  struct ncclIbRecvCommDev    devs[NCCL_IB_MAX_DEVS_PER_NIC];
+  struct ncclIbRecvCommDev devs[NCCL_IB_MAX_DEVS_PER_NIC];
   struct ncclIbRemFifo remFifo;
   int sizesFifo[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
   int gpuFlushHostMem;
@@ -1060,10 +1079,12 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base,
   qpAttr.port_num = ib_port;
   qpAttr.qp_access_flags = access_flags;
   NCCLCHECK(wrap_ibv_modify_qp(qp->qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS));
+  TRACE(NCCL_NET, "NET/IB : ncclIbCreateQp port=%d dev=%d devName=%s ndevs=%d nmdevs=%d qpn=%u pkey=%u pd=%p",
+    ib_port, base->ibDevN, ncclIbDevs[base->ibDevN].devName, ncclNIbDevs, ncclNMergedIbDevs, qp->qp->qp_num, qpAttr.pkey_index, base->pd);
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint32_t dest_qp_num, struct ncclIbDevInfo* info, bool override_tc) {
+ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint32_t dest_qp_num, struct ncclIbDevInfo* info, bool fifoTc) {
   struct ibv_qp_attr qpAttr;
   memset(&qpAttr, 0, sizeof(struct ibv_qp_attr));
   qpAttr.qp_state = IBV_QPS_RTR;
@@ -1079,11 +1100,7 @@ ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint
     qpAttr.ah_attr.grh.flow_label = 0;
     qpAttr.ah_attr.grh.sgid_index = sGidInfo->localGidIndex;
     qpAttr.ah_attr.grh.hop_limit = 255;
-    if(ncclParamIbFifoTc() && override_tc) {
-      qpAttr.ah_attr.grh.traffic_class = ncclParamIbFifoTc();
-    } else {
-      qpAttr.ah_attr.grh.traffic_class = ncclParamIbTc();
-    }
+    qpAttr.ah_attr.grh.traffic_class = fifoTc && ncclParamIbFifoTc() != -1 ? ncclParamIbFifoTc() : ncclParamIbTc();
   } else {
     //pick lid if subnet prefixs are same, FLID if they are not
     if (ncclIbExtractLocalSubnetPrefix(sGidInfo->localGid.global.subnet_prefix) ==
@@ -1108,6 +1125,7 @@ ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint
   qpAttr.ah_attr.sl = ncclParamIbSl();
   qpAttr.ah_attr.src_path_bits = 0;
   qpAttr.ah_attr.port_num = info->ib_port;
+  TRACE(NCCL_NET, "NET/IB : ncclIbRtrQp qpn=%u mtu=%d dst=%u ll=%u port=%u", qp->qp_num, info->mtu, dest_qp_num, info->link_layer, info->ib_port);
   NCCLCHECK(wrap_ibv_modify_qp(qp, &qpAttr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER));
   return ncclSuccess;
 }
@@ -1154,10 +1172,12 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet
   int ready;
   *sendComm = NULL;
 
-  if (stage->state == ncclIbCommStateConnect)    goto ib_connect_check;
-  if (stage->state == ncclIbCommStateSend)       goto ib_send;
-  if (stage->state == ncclIbCommStateConnecting) goto ib_connect;
-  if (stage->state == ncclIbCommStateConnected)  goto ib_send_ready;
+  if (stage->state == ncclIbCommStateConnect)      goto ib_connect_check;
+  if (stage->state == ncclIbCommStateSendDevList)  goto ib_send_dev_list;
+  if (stage->state == ncclIbCommStateRecvDevList)  goto ib_recv_dev_list;
+  if (stage->state == ncclIbCommStateSend)         goto ib_send;
+  if (stage->state == ncclIbCommStateConnecting)   goto ib_connect;
+  if (stage->state == ncclIbCommStateConnected)    goto ib_send_ready;
   if (stage->state != ncclIbCommStateStart) {
     WARN("Error: trying to connect already connected sendComm");
     return ncclInternalError;
@@ -1178,21 +1198,51 @@ ib_connect_check:
 
   // IB Setup
   struct ncclIbMergedDev* mergedDev;
+  if (dev >= ncclNMergedIbDevs) {
+    WARN("NET/IB : Trying to use non-existant virtual device %d", dev);
+    return ncclInternalError;
+  }
+
   mergedDev = ncclIbMergedDevs + dev;
-  comm->base.ndevs = mergedDev->ndevs;
-  comm->base.nqps = ncclParamIbQpsPerConn() * comm->base.ndevs; // We must have at least 1 qp per-device
+  comm->base.vProps = mergedDev->vProps;
   comm->base.isSend = true;
+  stage->state = ncclIbCommStateSendDevList;
+  stage->offset = 0;
+  struct ncclIbConnectionMetadata meta;
+  NCCLCHECKGOTO(ncclIbMalloc((void**)&stage->buffer, sizeof(meta)), ret, fail);
+  memcpy(stage->buffer, &mergedDev->vProps, sizeof(ncclNetVDeviceProps_t));
+
+// In the case of mismatched nDevs, we will make sure that both sides of a logical connection have the same number of RC qps
+ib_send_dev_list:
+  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, stage->buffer, sizeof(ncclNetVDeviceProps_t), &stage->offset));
+  if (stage->offset != sizeof(ncclNetVDeviceProps_t)) return ncclSuccess;
+
+  stage->state = ncclIbCommStateRecvDevList;
+  stage->offset = 0;
+
+ib_recv_dev_list:
+  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->base.sock, stage->buffer, sizeof(ncclNetVDeviceProps_t), &stage->offset));
+  if (stage->offset != sizeof(ncclNetVDeviceProps_t)) return ncclSuccess;
+  stage->offset = 0;
+  ncclNetVDeviceProps_t remoteVProps;
+  memcpy(&remoteVProps, stage->buffer, sizeof(ncclNetVDeviceProps_t));
+  mergedDev = ncclIbMergedDevs + dev;
+  comm->base.vProps = mergedDev->vProps;
+  int localNqps, remoteNqps;
+  localNqps  = ncclParamIbQpsPerConn() * comm->base.vProps.ndevs; // We must have at least 1 qp per-device
+  remoteNqps = ncclParamIbQpsPerConn() * remoteVProps.ndevs;
+  comm->base.nqps = remoteNqps > localNqps ? remoteNqps : localNqps; // Select max nqps (local or remote)
 
   // Init PD, Ctx for each IB device
   comm->ar = 1; // Set to 1 for logic
-  for (int i = 0; i < mergedDev->ndevs; i++) {
-    int ibDevN = mergedDev->devs[i];
+  for (int i = 0; i < comm->base.vProps.ndevs; i++) {
+    int ibDevN = comm->base.vProps.devs[i];
     NCCLCHECKGOTO(ncclIbInitCommDevBase(ibDevN, &comm->devs[i].base, &comm->base.stats), ret, fail);
-    comm->ar = comm->ar && ncclIbDevs[dev].ar; // ADAPTIVE_ROUTING - if all merged devs have it enabled
+    comm->ar = comm->ar && ncclIbDevs[ibDevN].ar; // ADAPTIVE_ROUTING - if all merged devs have it enabled
   }
 
-  struct ncclIbConnectionMetadata meta;
-  meta.ndevs = comm->base.ndevs;
+  memset(&meta, 0, sizeof(meta));
+  meta.ndevs = comm->base.vProps.ndevs;
 
   // Alternate QPs between devices
   int devIndex;
@@ -1211,10 +1261,10 @@ ib_connect_check:
     } else {
       meta.qpInfo[q].ece_supported = 0;
     }
-    devIndex = (devIndex + 1) % comm->base.ndevs;
+    devIndex = (devIndex + 1) % comm->base.vProps.ndevs;
   }
 
-  for (int i = 0; i < comm->base.ndevs; i++) {
+  for (int i = 0; i < comm->base.vProps.ndevs; i++) {
     ncclIbSendCommDev* commDev = comm->devs + i;
     ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN;
 
@@ -1241,7 +1291,7 @@ ib_connect_check:
         // Print just the QPs for this dev
         if (comm->base.qps[q].devIndex == i)
           INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d LID %d subnet-prefix %lu  FLID %d fifoRkey=0x%x fifoLkey=0x%x",
-            comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev",
+            comm->base.vProps.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev",
             dev, commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, devInfo->lid,
 	    devInfo->gid.global.subnet_prefix, ncclIbExtractFlid(&devInfo->gid), devInfo->fifoRkey, commDev->fifoMr->lkey);
       }
@@ -1250,7 +1300,7 @@ ib_connect_check:
         // Print just the QPs for this dev
         if (comm->base.qps[q].devIndex == i)
           INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d query_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x} GID %ld (%lX/%lX) fifoRkey=0x%x fifoLkey=0x%x",
-            comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev,
+            comm->base.vProps.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev,
             commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, meta.qpInfo[q].ece_supported, meta.qpInfo[q].ece.vendor_id, meta.qpInfo[q].ece.options, meta.qpInfo[q].ece.comp_mask, (int64_t)commDev->base.gidInfo.localGidIndex,
             devInfo->gid.global.subnet_prefix, devInfo->gid.global.interface_id, devInfo->fifoRkey, commDev->fifoMr->lkey);
       }
@@ -1261,7 +1311,6 @@ ib_connect_check:
 
   stage->state = ncclIbCommStateSend;
   stage->offset = 0;
-  NCCLCHECKGOTO(ncclIbMalloc((void**)&stage->buffer, sizeof(meta)), ret, fail);
 
   memcpy(stage->buffer, &meta, sizeof(meta));
 
@@ -1282,17 +1331,12 @@ ib_connect:
   memcpy(&remMeta, stage->buffer, sizeof(ncclIbConnectionMetadata));
 
   comm->base.nRemDevs = remMeta.ndevs;
-  if (comm->base.nRemDevs != comm->base.ndevs) {
-    mergedDev = ncclIbMergedDevs + dev;
-    WARN("NET/IB : Local mergedDev=%s has a different number of devices=%d as remoteDev=%s nRemDevs=%d",
-      mergedDev->devName, comm->base.ndevs, remMeta.devName, comm->base.nRemDevs);
-  }
 
   int link_layer;
   link_layer = remMeta.devs[0].link_layer;
   for (int i = 1; i < remMeta.ndevs; i++) {
     if (remMeta.devs[i].link_layer != link_layer) {
-      WARN("NET/IB : Can't merge net devices with different link_layer. i=%d remMeta.ndevs=%d link_layer=%d rem_link_layer=%d",
+      WARN("NET/IB : Can't connect net devices with different link_layer. i=%d remMeta.ndevs=%d link_layer=%d rem_link_layer=%d",
       i, remMeta.ndevs, link_layer, remMeta.devs[i].link_layer);
       return ncclInternalError;
     }
@@ -1309,7 +1353,7 @@ ib_connect:
     comm->remSizesFifo.addr = remMeta.fifoAddr;
   }
 
-  for (int i=0; i < comm->base.ndevs; i++) {
+  for (int i=0; i < comm->base.vProps.ndevs; i++) {
     NCCLCHECKGOTO(wrap_ibv_reg_mr(comm->remSizesFifo.mrs+i, comm->devs[i].base.pd, &comm->remSizesFifo.elems, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
   }
   comm->base.nRemDevs = remMeta.ndevs;
@@ -1327,6 +1371,8 @@ ib_connect:
     if (remQpInfo->ece_supported)
       NCCLCHECKGOTO(wrap_ibv_set_ece(qp, &remQpInfo->ece, &remQpInfo->ece_supported), ret, fail);
 
+    ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN;
+    remDevInfo->mtu = std::min(remDevInfo->mtu, ibDev->portAttr.active_mtu);
     NCCLCHECKGOTO(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false), ret, fail);
     NCCLCHECKGOTO(ncclIbRtsQp(qp), ret, fail);
   }
@@ -1341,6 +1387,8 @@ ib_connect:
     }
   }
 
+  comm->base.nDataQps = std::max(comm->base.vProps.ndevs, comm->base.nRemDevs);
+
   comm->base.ready = 1;
   stage->state = ncclIbCommStateConnected;
   stage->offset = 0;
@@ -1359,6 +1407,50 @@ fail:
   goto exit;
 }
 
+NCCL_PARAM(IbWarnRailLocal, "IB_WARN_RAIL_LOCAL", 0);
+
+// Compares the physical device lists behind two virtual NICs. vProps1 is reported as the local
+// side and vProps2 as the remote side. Purely diagnostic: when NCCL_IB_WARN_RAIL_LOCAL is set and
+// the longer list is not fully matched by the other one, both lists are logged.
+// Always returns ncclSuccess.
+ncclResult_t ncclIbCheckVProps(ncclNetVDeviceProps_t* vProps1, ncclNetVDeviceProps_t* vProps2) {
+  // "min"/"max" refer to the number of sub-devices; on a tie vProps1 is treated as max.
+  bool firstIsSmaller = vProps1->ndevs < vProps2->ndevs;
+  ncclNetVDeviceProps_t* minVProps = firstIsSmaller ? vProps1 : vProps2;
+  ncclNetVDeviceProps_t* maxVProps = firstIsSmaller ? vProps2 : vProps1;
+
+  // Count the device indexes present on both sides. Only the count is used below, so nothing is
+  // stored and a list with repeated indexes cannot overrun a fixed-size devs[] array.
+  int nShared = 0;
+  for (int i = 0; i < minVProps->ndevs; i++) {
+    for (int j = 0; j < maxVProps->ndevs; j++) {
+      if (maxVProps->devs[j] == minVProps->devs[i]) nShared++;
+    }
+  }
+
+  // In the case that at least one side has a fused NIC but there are no matching physical NICs, we should check if the user wants this
+  if (ncclParamIbWarnRailLocal() && nShared < maxVProps->ndevs) {
+    ncclNetVDeviceProps_t* sides[2] = { vProps1, vProps2 };
+    char lists[2][128]; // [0] = local (vProps1), [1] = remote (vProps2)
+    // Format each side as a comma-separated list of device indexes (empty when ndevs is 0).
+    for (int s = 0; s < 2; s++) {
+      size_t cursor = 0;
+      lists[s][0] = '\0';
+      for (int i = 0; i < sides[s]->ndevs && cursor < sizeof(lists[s]); i++) {
+        // Advance by what snprintf reports rather than by a fixed stride: a device index can
+        // have more than one digit, and a fixed stride would then misplace the next entry.
+        int written = snprintf(lists[s]+cursor, sizeof(lists[s])-cursor, "%s%d",
+                               i ? "," : "", sides[s]->devs[i]);
+        if (written < 0) break;
+        cursor += (size_t)written;
+      }
+    }
+    INFO(NCCL_NET, "NET/IB : There are mismatched physical devices between local (%s) and remote (%s). To disable this warning, set NCCL_IB_WARN_RAIL_LOCAL=0", lists[0], lists[1]);
+  }
+
+  return ncclSuccess;
+}
+
 NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0);
 
 ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
@@ -1369,7 +1461,9 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle
   int ready;
   *recvComm = NULL;
 
-  if (stage->state == ncclIbCommStateAccept) goto ib_accept_check;
+  if (stage->state == ncclIbCommStateAccept)   goto ib_accept_check;
+  if (stage->state == ncclIbCommStateRecvDevList) goto ib_recv_dev_list;
+  if (stage->state == ncclIbCommStateSendDevList) goto ib_send_dev_list;
   if (stage->state == ncclIbCommStateRecv) goto ib_recv;
   if (stage->state == ncclIbCommStateSend) goto ib_send;
   if (stage->state == ncclIbCommStatePendingReady) goto ib_recv_ready;
@@ -1385,14 +1479,49 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle
   NCCLCHECKGOTO(ncclSocketInit(&rComm->base.sock), ret, fail);
   NCCLCHECKGOTO(ncclSocketAccept(&rComm->base.sock, &lComm->sock), ret, fail);
 
+  // Alloc stage->buffer here to be used for all following steps
+  struct ncclIbConnectionMetadata remMeta;
+  stage->offset = 0;
+  NCCLCHECKGOTO(ncclIbMalloc((void**)&stage->buffer, sizeof(remMeta)), ret, fail); // Leave through fail, as this call did before it was moved here
+
 ib_accept_check:
   NCCLCHECKGOTO(ncclSocketReady(&rComm->base.sock, &ready), ret, fail);
   if (!ready) return ncclSuccess;
-
-  struct ncclIbConnectionMetadata remMeta;
-  stage->state = ncclIbCommStateRecv;
+  stage->state = ncclIbCommStateRecvDevList;
   stage->offset = 0;
-  NCCLCHECKGOTO(ncclIbMalloc((void**)&stage->buffer, sizeof(remMeta)), ret, fail);
+
+// In the case of mismatched nDevs, we will make sure that both sides of a logical connection have the same number of RC qps
+ib_recv_dev_list:
+  NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, stage->buffer, sizeof(ncclNetVDeviceProps_t), &stage->offset), ret, fail);
+  if (stage->offset != sizeof(ncclNetVDeviceProps_t)) return ncclSuccess;
+  ncclNetVDeviceProps_t remoteVProps;
+  memcpy(&remoteVProps, stage->buffer, sizeof(ncclNetVDeviceProps_t));
+  if (lComm->dev >= ncclNMergedIbDevs) {
+    WARN("NET/IB : Trying to use non-existant virtual device %d", lComm->dev);
+    ret = ncclInternalError; goto fail; // Same error code as before, routed through fail like the other checks
+  }
+
+  // Reduce the physical device list and store in the connection base
+  struct ncclIbMergedDev* mergedDev;
+  mergedDev = ncclIbMergedDevs + lComm->dev;
+  NCCLCHECKGOTO(ncclIbCheckVProps(&mergedDev->vProps, &remoteVProps), ret, fail);
+  rComm->base.vProps = mergedDev->vProps;
+  memcpy(stage->buffer, &rComm->base.vProps, sizeof(ncclNetVDeviceProps_t));
+  rComm->base.isSend = false;
+  int localNqps, remoteNqps;
+  localNqps  = ncclParamIbQpsPerConn() * rComm->base.vProps.ndevs; // We must have at least 1 qp per-device
+  remoteNqps = ncclParamIbQpsPerConn() * remoteVProps.ndevs;
+  rComm->base.nqps = remoteNqps > localNqps ? remoteNqps : localNqps; // Select max nqps (local or remote)
+
+  stage->offset = 0;
+  stage->state = ncclIbCommStateSendDevList;
+
+ib_send_dev_list:
+  NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->base.sock, stage->buffer, sizeof(ncclNetVDeviceProps_t), &stage->offset), ret, fail);
+  if (stage->offset != sizeof(ncclNetVDeviceProps_t)) return ncclSuccess;
+
+  stage->offset = 0;
+  stage->state = ncclIbCommStateRecv;
 
 ib_recv:
   NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, stage->buffer, sizeof(remMeta), &stage->offset), ret, fail);
@@ -1403,7 +1532,6 @@ ib_recv:
 
   // IB setup
   // Pre-declare variables because of goto
-  struct ncclIbMergedDev* mergedDev;
   struct ncclIbDev* ibDev;
   int ibDevN;
   struct ncclIbRecvCommDev* rCommDev;
@@ -1411,21 +1539,18 @@ ib_recv:
   struct ncclIbQp* qp;
 
   mergedDev = ncclIbMergedDevs + lComm->dev;
-  rComm->base.ndevs = mergedDev->ndevs;
-  rComm->base.nqps  = ncclParamIbQpsPerConn() * rComm->base.ndevs; // We must have at least 1 qp per-device
-  rComm->base.isSend = false;
-
   rComm->base.nRemDevs = remMeta.ndevs;
-  if (rComm->base.nRemDevs != rComm->base.ndevs) {
-    WARN("NET/IB : Local mergedDev %s has a different number of devices=%d as remote %s %d",
-      mergedDev->devName, rComm->base.ndevs, remMeta.devName, rComm->base.nRemDevs);
+  if (rComm->base.nRemDevs != rComm->base.vProps.ndevs) {
+    INFO(NCCL_NET, "NET/IB : Local mergedDev %s has a different number of devices=%d as remote %s %d",
+      mergedDev->devName, rComm->base.vProps.ndevs, remMeta.devName, rComm->base.nRemDevs);
   }
 
   // Metadata to send back to requestor (sender)
   struct ncclIbConnectionMetadata meta;
-  for (int i = 0; i < rComm->base.ndevs; i++) {
+  memset(&meta, 0, sizeof(meta));
+  for (int i = 0; i < rComm->base.vProps.ndevs; i++) {
     rCommDev = rComm->devs + i;
-    ibDevN = mergedDev->devs[i];
+    ibDevN = rComm->base.vProps.devs[i];
     NCCLCHECKGOTO(ncclIbInitCommDevBase(ibDevN, &rCommDev->base, &rComm->base.stats), ret, fail);
     ibDev = ncclIbDevs + ibDevN;
     NCCLCHECKGOTO(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &rCommDev->base.gidInfo.localGidIndex), ret, fail);
@@ -1456,7 +1581,7 @@ ib_recv:
     ibDev = ncclIbDevs + ibDevN;
     NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, qp), ret, fail);
     qp->devIndex = devIndex;
-    devIndex = (devIndex + 1) % rComm->base.ndevs;
+    devIndex = (devIndex + 1) % rComm->base.vProps.ndevs;
 
     // Set the ece (enhanced connection establishment) on this QP before RTR
     if (remMeta.qpInfo[q].ece_supported) {
@@ -1469,23 +1594,22 @@ ib_recv:
       // Store this in our own qpInfo for returning to the requestor
       if (meta.qpInfo[q].ece_supported)
         NCCLCHECKGOTO(wrap_ibv_query_ece(qp->qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported), ret, fail);
+    } else {
+      meta.qpInfo[q].ece_supported = 0;
     }
 
-    bool override_tc = (q == 0) ? true : false;
-    NCCLCHECKGOTO(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, override_tc), ret, fail);
+    NCCLCHECKGOTO(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, true), ret, fail);
     NCCLCHECKGOTO(ncclIbRtsQp(qp->qp), ret, fail);
   }
 
   rComm->flushEnabled = ((ncclIbGdrSupport() == ncclSuccess || ncclIbDmaBufSupport(lComm->dev) == ncclSuccess)
                             && (ncclParamIbGdrFlushDisable() == 0)) ? 1 : 0;
 
-  for (int i = 0; i < mergedDev->ndevs; i++) {
+  for (int i = 0; i < rComm->base.vProps.ndevs; i++) {
     rCommDev = rComm->devs + i;
-    ibDevN = rCommDev->base.ibDevN;
-    ibDev = ncclIbDevs + ibDevN;
+    ibDev = ncclIbDevs + rCommDev->base.ibDevN;
 
     // Retain remote fifo info and prepare my RDMA ops
-    rCommDev->fifoRkey = remMeta.devs[i].fifoRkey;
     rComm->remFifo.addr = remMeta.fifoAddr;
     NCCLCHECKGOTO(wrap_ibv_reg_mr(&rCommDev->fifoMr, rCommDev->base.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
     rCommDev->fifoSge.lkey = rCommDev->fifoMr->lkey;
@@ -1510,15 +1634,12 @@ ib_recv:
     }
 
     // Fill Handle
-    meta.devs[i].lid        = ibDev->portAttr.lid;
-    meta.devs[i].link_layer = rCommDev->base.gidInfo.link_layer = ibDev->portAttr.link_layer;
-    meta.devs[i].ib_port    = ibDev->portNum;
+    meta.devs[i].lid                            = ibDev->portAttr.lid;
+    meta.devs[i].link_layer                     = rCommDev->base.gidInfo.link_layer = ibDev->portAttr.link_layer;
+    meta.devs[i].ib_port                        = ibDev->portNum;
     meta.devs[i].gid.global.subnet_prefix       = rCommDev->base.gidInfo.localGid.global.subnet_prefix;
     meta.devs[i].gid.global.interface_id        = rCommDev->base.gidInfo.localGid.global.interface_id;
-
-    // Adjust the MTU
-    remMeta.devs[i].mtu    = (enum ibv_mtu) std::min(remMeta.devs[i].mtu, ibDev->portAttr.active_mtu);
-    meta.devs[i].mtu      = remMeta.devs[i].mtu;
+    meta.devs[i].mtu                            = ibDev->portAttr.active_mtu;
 
     // Prepare sizes fifo
     NCCLCHECKGOTO(wrap_ibv_reg_mr(&rComm->devs[i].sizesFifoMr, rComm->devs[i].base.pd, rComm->sizesFifo, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
@@ -1530,9 +1651,9 @@ ib_recv:
     meta.qpInfo[q].qpn      = rComm->base.qps[q].qp->qp_num;
     meta.qpInfo[q].devIndex = rComm->base.qps[q].devIndex;
   }
-
-  meta.ndevs = rComm->base.ndevs;
+  meta.ndevs = rComm->base.vProps.ndevs;
   strncpy(meta.devName, mergedDev->devName, MAX_MERGED_DEV_NAME);
+  rComm->base.nDataQps = std::max(rComm->base.vProps.ndevs, rComm->base.nRemDevs);
 
   stage->state = ncclIbCommStateSend;
   stage->offset = 0;
@@ -1662,7 +1783,7 @@ ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, ui
   assert(size > 0);
   struct ncclIbNetCommBase* base = (struct ncclIbNetCommBase*) comm;
   struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) malloc(sizeof(struct ncclIbMrHandle));
-  for (int i = 0; i < base->ndevs; i++) {
+  for (int i = 0; i < base->vProps.ndevs; i++) {
     // Each ncclIbNetCommDevBase is at different offset in send and recv netComms
     struct ncclIbNetCommDevBase* devComm = ncclIbGetNetCommDevBase(base, i);
     NCCLCHECKGOTO(ncclIbRegMrDmaBufInternal(devComm, data, size, type, offset, fd, mhandleWrapper->mrs + i), ret, fail);
@@ -1706,9 +1827,11 @@ returning:
 }
 
 ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) {
+  if (mhandle == NULL) return ncclSuccess;
+
   struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandle;
   struct ncclIbNetCommBase* base = (struct ncclIbNetCommBase*) comm;
-  for (int i = 0; i < base->ndevs; i++) {
+  for (int i = 0; i < base->vProps.ndevs; i++) {
     // Each ncclIbNetCommDevBase is at different offset in send and recv netComms
     struct ncclIbNetCommDevBase* devComm = ncclIbGetNetCommDevBase(base, i);
     NCCLCHECK(ncclIbDeregMrInternal(devComm, mhandleWrapper->mrs[i]));
@@ -1773,7 +1896,7 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
 
   // Multi-QP: make sure IB writes are multiples of 128B so that LL and LL128 protocols still work
   const int align = 128;
-  int nqps = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.ndevs;
+  int nqps = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.nDataQps;
   for (int i = 0; i < nqps; i++) {
     int qpIndex = comm->base.qpIndex;
     ncclIbQp* qp = comm->base.qps + qpIndex;
@@ -1822,7 +1945,7 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
+ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) {
   struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
   if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->base.ready == 0"); return ncclInternalError; }
   if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; }
@@ -1852,7 +1975,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh
       char line[SOCKET_NAME_MAXLEN + 1];
       union ncclSocketAddress addr;
       ncclSocketGetAddr(&comm->base.sock, &addr);
-      WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %d addr %lx rkeys[0]=%x",
+      WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %ld addr %lx rkeys[0]=%x",
         r, nreqs, tag, ncclSocketToString(&addr, line), slots[r].size, slots[r].addr, slots[r].rkeys[0]);
       return ncclInternalError;
     }
@@ -1868,7 +1991,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh
     req->send.offset = 0;
 
     // Populate events
-    int nEvents = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.ndevs;
+    int nEvents = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.nDataQps;
     int qpIndex = comm->base.qpIndex;
     // Count down
     while (nEvents > 0) {
@@ -1883,7 +2006,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh
     }
 
     // Store all lkeys
-    for (int i = 0; i < comm->base.ndevs; i++) {
+    for (int i = 0; i < comm->base.vProps.ndevs; i++) {
       req->send.lkeys[i] = mhandleWrapper->mrs[i]->lkey;
     }
 
@@ -1909,7 +2032,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int* sizes, int* tags, void** mhandles, struct ncclIbRequest* req) {
+ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, size_t* sizes, int* tags, void** mhandles, struct ncclIbRequest* req) {
   struct ibv_send_wr wr;
   memset(&wr, 0, sizeof(wr));
 
@@ -1921,14 +2044,14 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int
   // Select the next devIndex (local) and QP to use for posting this CTS message
   // Since QPs are initialized by striping across devIndex, we can simply assign this to the same value
   ncclIbQp* ctsQp = comm->base.qps + comm->base.devIndex;
-  comm->base.devIndex = (comm->base.devIndex + 1) % comm->base.ndevs;
+  comm->base.devIndex = (comm->base.devIndex + 1) % comm->base.vProps.ndevs;
 
   for (int i=0; i<n; i++) {
     localElem[i].addr = (uint64_t)data[i];
     struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandles[i];
 
     // Send all applicable rkeys
-    for (int j = 0; j < comm->base.ndevs; j++)
+    for (int j = 0; j < comm->base.vProps.ndevs; j++)
       localElem[i].rkeys[j] = mhandleWrapper->mrs[j]->rkey;
 
     localElem[i].nreqs = n;
@@ -1986,7 +2109,7 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
+ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) {
   struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
   if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0"); return ncclInternalError; }
   if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; }
@@ -1999,7 +2122,7 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* ta
   req->sock = &comm->base.sock;
   req->nreqs = n;
 
-  for (int i = 0; i < comm->base.ndevs; i++) {
+  for (int i = 0; i < comm->base.vProps.ndevs; i++) {
     req->devBases[i] = &comm->devs[i].base;
   }
 
@@ -2011,7 +2134,7 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* ta
 
   TIME_START(1);
   // Select either all QPs, or one qp per-device
-  const int nqps = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.ndevs;
+  const int nqps = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.nDataQps;
 
   // Post recvs
   struct ibv_recv_wr* bad_wr;
@@ -2047,7 +2170,7 @@ ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void**
   struct ncclIbMrHandle* mhandle = (struct ncclIbMrHandle*) mhandles[last];
 
   // We don't know which devIndex the recv was on, so we flush on all devices
-  for (int i = 0; i < comm->base.ndevs; i++) {
+  for (int i = 0; i < comm->base.vProps.ndevs; i++) {
     struct ibv_send_wr wr;
     memset(&wr, 0, sizeof(wr));
     wr.wr_id = req - comm->base.reqs;
@@ -2078,7 +2201,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
   *done = 0;
   while (1) {
     NCCLCHECK(ncclIbStatsCheckFatalCount(&r->base->stats,__func__));
-    if (r->events[0] == 0 && r->events[1] == 0) {
+    if (r->events[0] == 0 && r->events[1] == 0 && r->events[2] == 0 && r->events[3] == 0) {
       TRACE(NCCL_NET, "r=%p done", r);
       *done = 1;
       if (sizes && r->type == NCCL_NET_IB_REQ_RECV) {
@@ -2112,13 +2235,13 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
             char remoteGidString[INET6_ADDRSTRLEN] = "";
             const char* localGidStr = NULL, *remoteGidStr = NULL;
             if (r->devBases[i]->gidInfo.link_layer == IBV_LINK_LAYER_ETHERNET) {
-              localGidStr = inet_ntop(AF_INET6, &r->devBases[i]->gidInfo.localGid, localGidString, sizeof(localGidString));
-              remoteGidStr = inet_ntop(AF_INET6, &r->base->remDevs[i].remoteGid, remoteGidString, sizeof(remoteGidString));
+              localGidStr = ibvGetGidStr(&r->devBases[i]->gidInfo.localGid, localGidString, sizeof(localGidString));
+              remoteGidStr = ibvGetGidStr(&r->base->remDevs[i].remoteGid, remoteGidString, sizeof(remoteGidString));
             }
 
             char line[SOCKET_NAME_MAXLEN+1];
             char *hcaName = r->devBases[i]->pd->context->device->name;
-            WARN("NET/IB: Got completion from peer %s with status=%d opcode=%d len=%d vendor err %d (%s)%s%s%s%s hca %s",
+            WARN("NET/IB: Got completion from peer %s with status=%d opcode=%d len=%u vendor err %u (%s)%s%s%s%s hca %s",
                 ncclSocketToString(&addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err, reqTypeStr[r->type],
                 localGidStr ?  " localGid ":"", localGidString, remoteGidStr ? " remoteGids":"", remoteGidString, hcaName);
             return ncclRemoteError;
@@ -2130,7 +2253,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
 
           #ifdef ENABLE_TRACE
           char line[SOCKET_NAME_MAXLEN+1];
-          TRACE(NCCL_NET, "Got completion from peer %s with status=%d opcode=%d len=%d wr_id=%ld r=%p type=%d events={%d,%d}, i=%d",
+          TRACE(NCCL_NET, "Got completion from peer %s with status=%d opcode=%d len=%u wr_id=%lu r=%p type=%d events={%d,%d}, i=%d",
               ncclSocketToString(&addr, line), wc->status, wc->opcode,wc->byte_len, wc->wr_id, req, req->type, req->events[0], req->events[1], i);
           #endif
           if (req && req->type == NCCL_NET_IB_REQ_SEND) {
@@ -2174,7 +2297,7 @@ ncclResult_t ncclIbCloseSend(void* sendComm) {
     for (int q = 0; q < comm->base.nqps; q++)
       if (comm->base.qps[q].qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->base.qps[q].qp));
 
-    for (int i = 0; i < comm->base.ndevs; i++) {
+    for (int i = 0; i < comm->base.vProps.ndevs; i++) {
       struct ncclIbSendCommDev* commDev = comm->devs + i;
       if (commDev->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(commDev->fifoMr));
       if (comm->remSizesFifo.mrs[i] != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->remSizesFifo.mrs[i]));
@@ -2194,7 +2317,7 @@ ncclResult_t ncclIbCloseRecv(void* recvComm) {
     for (int q = 0; q < comm->base.nqps; q++)
       if (comm->base.qps[q].qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->base.qps[q].qp));
 
-    for (int i = 0; i < comm->base.ndevs; i++) {
+    for (int i = 0; i < comm->base.vProps.ndevs; i++) {
       struct ncclIbRecvCommDev* commDev = comm->devs + i;
       if (comm->flushEnabled) {
         if (commDev->gpuFlush.qp.qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(commDev->gpuFlush.qp.qp));
@@ -2237,6 +2360,11 @@ ncclNet_t ncclNetIb = {
   ncclIbCloseRecv,
   ncclIbCloseListen,
   NULL /* getDeviceMr */,
-  NULL /* irecvConsumed */
+  NULL /* irecvConsumed */,
+  ncclIbMakeVDevice
 };
 
+/*
+  ncclIbSetProperties,
+  ncclIbRefreshDevices
+*/
diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc
index 73a5d55b00..235dee865a 100644
--- a/src/transport/net_socket.cc
+++ b/src/transport/net_socket.cc
@@ -44,6 +44,7 @@ ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction) {
       ncclNetIfs = ncclFindInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS);
       if (ncclNetIfs <= 0) {
         WARN("NET/Socket : no interface found");
+        pthread_mutex_unlock(&ncclNetSocketLock);
         return ncclInternalError;
       } else {
         #define MAX_LINE_LEN (2047)
@@ -76,7 +77,7 @@ static ncclResult_t ncclNetSocketGetSpeed(char* devName, int* speed) {
   ncclResult_t ret = ncclSuccess;
   *speed = 0;
   char speedPath[PATH_MAX];
-  sprintf(speedPath, "/sys/class/net/%s/speed", devName);
+  snprintf(speedPath, sizeof(speedPath), "/sys/class/net/%s/speed", devName);
   int fd = -1;
   SYSCHECKSYNC(open(speedPath, O_RDONLY), "open", fd);
   if (fd != -1) {
@@ -102,6 +103,7 @@ ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) {
   props->guid = dev;
   props->ptrSupport = NCCL_PTR_HOST;
   props->regIsGlobal = 0;
+  props->forceFlush = 0;
   NCCLCHECK(ncclNetSocketGetSpeed(props->name, &props->speed));
   props->latency = 0; // Not set
   props->port = 0;
@@ -109,6 +111,7 @@ ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) {
   props->maxRecvs = 1;
   props->netDeviceType    = NCCL_NET_DEVICE_HOST;
   props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES;
   return ncclSuccess;
 }
 
@@ -297,6 +300,7 @@ fail:
 
 ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm) {
   if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev
+    WARN("NET/Socket : ncclNetSocketListen dev=%d ncclNetIfs=%d", dev, ncclNetIfs);
     return ncclInternalError;
   }
   ncclResult_t ret = ncclSuccess;
@@ -558,16 +562,16 @@ ncclResult_t ncclNetSocketRegMr(void* comm, void* data, size_t size, int type, v
 }
 ncclResult_t ncclNetSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; }
 
-ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
+ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { // Gets a send request for data/size from ncclNetSocketGetRequest; tag and mhandle are not used
   struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)sendComm;
-  NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_SEND, data, size, (struct ncclNetSocketRequest**)request));
+  NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_SEND, data, (int) size, (struct ncclNetSocketRequest**)request)); // NOTE(review): size_t is narrowed to int here; presumably callers stay within the advertised maxP2pBytes - confirm
   return ncclSuccess;
 }
 
-ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
+ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { // Gets a receive request for data[0]/sizes[0]; only n == 1 is accepted, tags and mhandles are not used
   struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)recvComm;
   if (n != 1) return ncclInternalError;
-  NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], sizes[0], (struct ncclNetSocketRequest**)request));
+  NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], (int)sizes[0], (struct ncclNetSocketRequest**)request)); // NOTE(review): size_t is narrowed to int here; presumably callers stay within the advertised maxP2pBytes - confirm
   return ncclSuccess;
 }
 
@@ -632,5 +636,6 @@ ncclNet_t ncclNetSocket = {
   ncclNetSocketClose,
   ncclNetSocketCloseListen,
   NULL /* getDeviceMr */,
-  NULL /* irecvConsumed */
+  NULL /* irecvConsumed */,
+  NULL /* mergeDevices */
 };
diff --git a/src/transport/nvls.cc b/src/transport/nvls.cc
index aa9c486b14..582c30a353 100644
--- a/src/transport/nvls.cc
+++ b/src/transport/nvls.cc
@@ -108,11 +108,12 @@ ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, size_t size, CUmemGenericAll
   return ncclSuccess;
 }
 
-ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) {
+ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) { // Unbinds dev from *mcHandler, unmaps and frees the [ptr, ptr+size) range, then releases the handle
   CUCHECK(cuMulticastUnbind(*mcHandler, dev, 0/*mcOffset*/, size));
   CUCHECK(cuMemUnmap(ptr, size));
   CUCHECK(cuMemAddressFree(ptr, size));
   CUCHECK(cuMemRelease(*mcHandler));
+  INFO(NCCL_NVLS, "rank %d - NVLS deregistered buffer %p on device %d, size %ld", comm->rank, (void*)ptr, dev, size); // comm is read only for the rank shown in this log line
   return ncclSuccess;
 }
 
@@ -450,11 +451,11 @@ setup:
 
     if (comm->localRank == 0) {
       shmPath[0] = '\0';
-      NCCLCHECKGOTO(ncclShmOpen(shmPath, (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, comm->localRanks - 1, &comm->nvlsResources->nvlsShmemHandle), res, fail);
+      NCCLCHECKGOTO(ncclShmOpen(shmPath, sizeof(shmPath), (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, comm->localRanks - 1, &comm->nvlsResources->nvlsShmemHandle), res, fail);
       NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, fail);
     } else {
       NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, fail);
-      NCCLCHECKGOTO(ncclShmOpen(shmPath, (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, -1, &comm->nvlsResources->nvlsShmemHandle), res, fail);
+      NCCLCHECKGOTO(ncclShmOpen(shmPath, sizeof(shmPath), (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, -1, &comm->nvlsResources->nvlsShmemHandle), res, fail);
     }
     /* need 2 pools and a shared counter for shmem-based collectives */
     comm->nvlsResources->nvlsShmem.cnt[0] = (size_t*)nvlsShmem;
@@ -495,7 +496,7 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
   return ncclSuccess;
 }
 
-ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t buffSize, CUdeviceptr *regAddr, bool *regUsed) {
+ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t buffSize, CUdeviceptr *regAddr, int *regUsed) {
   ncclResult_t ret = ncclSuccess;
   struct ncclReg *regRecord = NULL;
   CUdeviceptr regPtr = 0;
@@ -601,43 +602,33 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
   }
 
   *regAddr = (uintptr_t)regPtr + regData[comm->localRank].offset;
-  *regUsed = true;
+  *regUsed = 1;
 exit:
   free(regData);
   return ret;
 fail:
-  *regUsed = false;
+  *regUsed = 0;
   goto exit;
 }
 
-ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) {
+static ncclResult_t nvlsRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, struct ncclReg *sendRegRecord, struct ncclReg *recvRegRecord, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) {
   ncclResult_t ret = ncclSuccess;
-  bool localRegBufUsed = false;
+  int regBufUsed = 0;
   struct localRegData *regData = NULL;
   bool sendNeedReg = false, recvNeedReg = false;
   CUdeviceptr regSendPtr = 0;
   CUdeviceptr regRecvPtr = 0;
-  struct ncclReg *sendRegRecord = NULL;
-  struct ncclReg *recvRegRecord = NULL;
-
-  *outRegBufUsed = false;
 
   NCCLCHECKGOTO(ncclCalloc(&regData, comm->localRanks * 2), ret, fail);
 
-  if (sendbuff) {
-    NCCLCHECKGOTO(ncclRegFind(comm, sendbuff, sendbuffSize, &sendRegRecord), ret, fail);
-    if (sendRegRecord) {
-      memcpy(&regData[comm->localRank * 2].reg, sendRegRecord, sizeof(struct ncclReg));
-      regData[comm->localRank * 2].offset = (uintptr_t)sendbuff - sendRegRecord->addr;
-    }
+  if (sendRegRecord) {
+    memcpy(&regData[comm->localRank * 2].reg, sendRegRecord, sizeof(struct ncclReg));
+    regData[comm->localRank * 2].offset = (uintptr_t)sendbuff - sendRegRecord->addr;
   }
 
-  if (recvbuff) {
-    NCCLCHECKGOTO(ncclRegFind(comm, recvbuff, recvbuffSize, &recvRegRecord), ret, fail);
-    if (recvRegRecord) {
-      memcpy(&regData[comm->localRank * 2 + 1].reg, recvRegRecord, sizeof(struct ncclReg));
-      regData[comm->localRank * 2 + 1].offset = (uintptr_t)recvbuff - recvRegRecord->addr;
-    }
+  if (recvRegRecord) {
+    memcpy(&regData[comm->localRank * 2 + 1].reg, recvRegRecord, sizeof(struct ncclReg));
+    regData[comm->localRank * 2 + 1].offset = (uintptr_t)recvbuff - recvRegRecord->addr;
   }
 
   NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regData + comm->localRank * 2, regData, sizeof(struct localRegData) * 2), ret, fail);
@@ -682,229 +673,127 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send
   }
 
   if ((!sendNeedReg || sendbuff == NULL) && (!recvNeedReg || recvbuff == NULL)) {
-    localRegBufUsed = true;
-    INFO(NCCL_NVLS, "rank %d reuse local-registered NVLS sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr);
+    regBufUsed = 1;
+    INFO(NCCL_REG, "rank %d reuse registered NVLS sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr);
     goto exit;
   }
 
   /* Start Registration. Not found registered buffers, then check whether both send and recv buffer locate
    * in register request cache. */
-  if (sendNeedReg && sendbuff) {
-    tryRegisterBuffer(comm, (uintptr_t)sendbuff, sendbuffSize, &regSendPtr, &localRegBufUsed);
-    if (localRegBufUsed == false) goto fail;
+  if (sendNeedReg && sendbuff && sendbuffSize > 0) {
+    tryRegisterBuffer(comm, (uintptr_t)sendbuff, sendbuffSize, &regSendPtr, &regBufUsed);
+    if (regBufUsed == 0) goto fail;
   }
 
-  if (recvNeedReg && recvbuff) {
-    tryRegisterBuffer(comm, (uintptr_t)recvbuff, recvbuffSize, &regRecvPtr, &localRegBufUsed);
-    if (localRegBufUsed == false) goto fail;
+  if (recvNeedReg && recvbuff && recvbuffSize > 0) {
+    tryRegisterBuffer(comm, (uintptr_t)recvbuff, recvbuffSize, &regRecvPtr, &regBufUsed);
+    if (regBufUsed == 0) goto fail;
   }
 
-  INFO(NCCL_NVLS, "rank %d successfully local-registered NVLS sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr);
+  INFO(NCCL_REG, "rank %d successfully registered NVLS sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr);
 
 exit:
   *outRegBufSend = (void*)regSendPtr;
   *outRegBufRecv = (void*)regRecvPtr;
-  *outRegBufUsed = localRegBufUsed;
+  *outRegBufUsed = regBufUsed;
   free(regData);
   return ncclSuccess;
 fail:
-  localRegBufUsed = false;
+  regBufUsed = 0;
+  WARN("rank %d failed to NVLS register sendbuff %p sendbuffSize %ld recvbuff %p recvbuffSize %ld", comm->rank, sendbuff, sendbuffSize, recvbuff, recvbuffSize);
   goto exit;
 }
 
+ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) {
+  struct ncclReg *sendRegRecord = NULL;
+  struct ncclReg *recvRegRecord = NULL;
+  bool sendIsValid = false;
+  bool recvIsValid = false;
+  // Looks up the registration records of sendbuff/recvbuff and, when both sides are usable, hands them to nvlsRegisterBuffer.
+  *outRegBufUsed = 0; // stays 0 unless nvlsRegisterBuffer below overwrites it
+  if (sendbuff) {
+    NCCLCHECK(ncclRegFind(comm, sendbuff, sendbuffSize, &sendRegRecord));
+    NCCLCHECK(ncclRegLocalIsValid(sendRegRecord, &sendIsValid));
+  } else {
+    sendIsValid = true; // no send buffer: nothing to validate on this side
+  }
+  if (recvbuff) {
+    NCCLCHECK(ncclRegFind(comm, recvbuff, recvbuffSize, &recvRegRecord));
+    NCCLCHECK(ncclRegLocalIsValid(recvRegRecord, &recvIsValid));
+  } else {
+    recvIsValid = true; // no recv buffer: nothing to validate on this side
+  }
+  // outRegBufSend/outRegBufRecv are only written by nvlsRegisterBuffer, i.e. only when both sides are valid.
+  if (sendIsValid && recvIsValid)
+    NCCLCHECK(nvlsRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, sendRegRecord, recvRegRecord, outRegBufUsed, outRegBufSend, outRegBufRecv));
+
+  return ncclSuccess;
+}
+
 struct ncclNvlsCleanupCallback {
   struct ncclCommCallback base;
-  CUmemGenericAllocationHandle mcHandle;
-  CUdeviceptr ptr;
-  int dev;
-  size_t size;
+  struct ncclReg *reg; // registration record that cleanupNvls passes to ncclCommGraphDeregister
+  struct ncclComm *comm; // communicator that cleanupNvls passes alongside reg
 };
 
 static ncclResult_t cleanupNvls(struct ncclComm* comm, struct ncclCommCallback* cb) {
   struct ncclNvlsCleanupCallback* obj = (struct ncclNvlsCleanupCallback*)cb;
-  NCCLCHECK(ncclNvlsDeregBuffer(&obj->mcHandle, obj->ptr, obj->dev, obj->size));
-  INFO(NCCL_NVLS, "rank %d - deregistered buffer %p on device %d, size %ld", comm->rank, (void*)obj->ptr, obj->dev, obj->size);
+  NCCLCHECK(ncclCommGraphDeregister(obj->comm, obj->reg)); // uses the comm stored in the callback, not the comm argument. NOTE(review): if NCCLCHECK returns early on error, obj is not freed - confirm this is acceptable
   free(obj);
   return ncclSuccess;
 }
 
 ncclResult_t ncclNvlsGraphRegisterBuffer(
     struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize,
-    bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv,
+    int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv,
     struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, int* nCleanupQueueEltsAdded
   ) {
-  ncclResult_t ret = ncclSuccess;
-  bool localRegBufUsed = false;
   struct ncclNvlsCleanupCallback* sendRecord = NULL;
   struct ncclNvlsCleanupCallback* recvRecord = NULL;
-  CUdeviceptr regSendPtr = 0;
-  CUdeviceptr regRecvPtr = 0;
-  CUmulticastObjectProp mcprop;
-  CUmemAllocationProp ucprop;
-  char shareableHandle[NVLS_HANDLE_SIZE];
-  CUmemGenericAllocationHandle sendMcHandle, recvMcHandle;
-  size_t sendGran = 0, recvGran = 0;
-  bool *regBufFlags = NULL;
-  struct graphRegData *rdata = NULL;
-  const void *baseSend = NULL;
-  const void *baseRecv = NULL;
-  size_t baseSendSize = 1;
-  size_t baseRecvSize = 1;
-  size_t ucgran;
+  void *baseSend = NULL; // base address reported by cuMemGetAddressRange for sendbuff
+  void *baseRecv = NULL; // base address reported by cuMemGetAddressRange for recvbuff
+  size_t baseSendSize = 0; // size of the range starting at baseSend
+  size_t baseRecvSize = 0; // size of the range starting at baseRecv
+  struct ncclReg *sendRegRecord = NULL;
+  struct ncclReg *recvRegRecord = NULL;
 
-  *outRegBufUsed = false;
-  NCCLCHECKGOTO(ncclCalloc(&regBufFlags, comm->localRanks), ret, fail);
-  NCCLCHECKGOTO(ncclCalloc(&rdata, comm->localRanks), ret, fail);
-
-  if (sendbuffSize > 0 || recvbuffSize > 0) {
-    /* retrieve base pointer and size */
-    if (CUPFN(cuMemGetAddressRange) == nullptr) goto fail;
-    if (sendbuff != NULL)
-      CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)sendbuff), ret, fail);
-    if (recvbuff != NULL)
-      CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &baseRecvSize, (CUdeviceptr)recvbuff), ret, fail);
-
-    memset(&ucprop, 0, sizeof(CUmemAllocationProp));
-    ucprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
-    ucprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-    ucprop.location.id = comm->cudaDev;
-    ucprop.requestedHandleTypes = ncclCuMemHandleType;
-    CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail);
-
-    localRegBufUsed = ((uint64_t)baseSend % ucgran != 0 || (uint64_t)baseRecv % ucgran != 0) ? false : true;
-    regBufFlags[comm->localRank] = localRegBufUsed;
-    NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, regBufFlags, sizeof(bool)), ret, fail);
-    for (int i = 0; i < comm->localRanks; ++i)
-      if (regBufFlags[i] == false) goto fail;
-
-    memset(&mcprop, 0, sizeof(CUmulticastObjectProp));
-    mcprop.numDevices = comm->localRanks;
-    mcprop.handleTypes = ncclCuMemHandleType;
-    mcprop.flags = 0;
-
-    if (sendbuff != NULL) {
-      mcprop.size = baseSendSize;
-      CUCHECKGOTO(cuMulticastGetGranularity(&sendGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
-
-      /* check send buffer offset and size */
-      rdata[comm->localRank].offset = (uintptr_t)sendbuff - (uintptr_t)baseSend;
-      rdata[comm->localRank].size = baseSendSize;
-      NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, rdata, sizeof(struct graphRegData)), ret, fail);
-      baseSendSize = rdata[0].size;
-      for (int i = 1; i < comm->localRanks; ++i) {
-        if (rdata[0].offset != rdata[i].offset) goto fail;
-        if (baseSendSize > rdata[i].size) baseSendSize = rdata[i].size;
-      }
-      if (baseSendSize % sendGran != 0) goto fail;
-
-      mcprop.size = baseSendSize;
-
-      /* register sendbuff */
-      if (comm->localRank == 0) {
-        NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &sendMcHandle, shareableHandle), ret, fail);
-        NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
-      } else {
-        NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
-        NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &sendMcHandle), ret, fail);
-      }
-
-      CUCHECKGOTO(cuMulticastAddDevice(sendMcHandle, comm->nvlsResources->dev), ret, fail);
-      CUCHECKGOTO(cuMulticastBindAddr(sendMcHandle, 0, (CUdeviceptr)baseSend, baseSendSize, 0), ret, fail);
-
-      // Create a VA for the NVLS
-      CUCHECKGOTO(cuMemAddressReserve(&regSendPtr, baseSendSize, sendGran, 0U, 0), ret, fail);
-      // Map the VA locally
-      CUCHECKGOTO(cuMemMap(regSendPtr, baseSendSize, 0, sendMcHandle, 0), ret, fail);
-      CUCHECKGOTO(cuMemSetAccess(regSendPtr, baseSendSize, &comm->nvlsResources->accessDesc, 1), ret, fail);
-
-      sendRecord = (struct ncclNvlsCleanupCallback*)malloc(sizeof(struct ncclNvlsCleanupCallback));
-      sendRecord->base.fn = cleanupNvls;
-      sendRecord->mcHandle = sendMcHandle;
-      sendRecord->ptr = regSendPtr;
-      sendRecord->dev = comm->nvlsResources->dev;
-      sendRecord->size = baseSendSize;
-    }
-
-    if (recvbuff != NULL) {
-      mcprop.size = baseRecvSize;
-      CUCHECKGOTO(cuMulticastGetGranularity(&recvGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
-
-      rdata[comm->localRank].offset = (uintptr_t)recvbuff - (uintptr_t)baseRecv;
-      rdata[comm->localRank].size = baseRecvSize;
-      NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, rdata, sizeof(struct graphRegData)), ret, fail);
-      baseRecvSize = rdata[0].size;
-      for (int i = 1; i < comm->localRanks; ++i) {
-        if (rdata[0].offset != rdata[i].offset) goto fail;
-        if (baseRecvSize > rdata[i].size) baseRecvSize = rdata[i].size;
-      }
-      if (baseRecvSize % recvGran != 0) goto fail;
-
-      mcprop.size = baseRecvSize;
-      if (comm->localRank == 0) {
-        NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &recvMcHandle, shareableHandle), ret, fail);
-        NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
-      } else {
-        NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
-        NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &recvMcHandle), ret, fail);
-      }
-
-      CUCHECKGOTO(cuMulticastAddDevice(recvMcHandle, comm->nvlsResources->dev), ret, fail);
-      CUCHECKGOTO(cuMulticastBindAddr(recvMcHandle, 0, (CUdeviceptr)baseRecv, baseRecvSize, 0), ret, fail);
-
-      // Create a VA for the NVLS
-      CUCHECKGOTO(cuMemAddressReserve(&regRecvPtr, baseRecvSize, recvGran, 0U, 0), ret, fail);
-      // Map the VA locally
-      CUCHECKGOTO(cuMemMap(regRecvPtr, baseRecvSize, 0, recvMcHandle, 0), ret, fail);
-      CUCHECKGOTO(cuMemSetAccess(regRecvPtr, baseRecvSize, &comm->nvlsResources->accessDesc, 1), ret, fail);
-
-      recvRecord = (struct ncclNvlsCleanupCallback*)malloc(sizeof(struct ncclNvlsCleanupCallback));
-      recvRecord->base.fn = cleanupNvls;
-      recvRecord->mcHandle = recvMcHandle;
-      recvRecord->ptr = regRecvPtr;
-      recvRecord->dev = comm->nvlsResources->dev;
-      recvRecord->size = baseRecvSize;
-    }
-
-    localRegBufUsed = true;
+  *outRegBufUsed = 0;
+  if (sendbuff) {
+    CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)sendbuff));
+    NCCLCHECK(ncclCommGraphRegister(comm, baseSend, baseSendSize, (void**)&sendRegRecord)); // registers the whole [baseSend, baseSend+baseSendSize) range, not just the user slice
   }
 
-exit:
-  if (localRegBufUsed == false) {
-    if (sendRecord) {
-      ncclNvlsDeregBuffer(&sendRecord->mcHandle, sendRecord->ptr, sendRecord->dev, sendRecord->size);
-      free(sendRecord);
-    }
+  if (recvbuff) {
+    CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &baseRecvSize, (CUdeviceptr)recvbuff));
+    NCCLCHECK(ncclCommGraphRegister(comm, baseRecv, baseRecvSize, (void**)&recvRegRecord)); // same as above for the receive side
+  }
 
-    if (recvRecord) {
-      // Yes, it's a dead code.  That's fine...
-      // coverity[dead_error_begin]
-      ncclNvlsDeregBuffer(&recvRecord->mcHandle, recvRecord->ptr, recvRecord->dev, recvRecord->size);
-      free(recvRecord);
-    }
-  } else {
-    if (sendRecord) {
-      *outRegBufSend = (void*)((uintptr_t)regSendPtr + (uintptr_t)sendbuff - (uintptr_t)baseSend);
+  NCCLCHECK(nvlsRegisterBuffer(comm, baseSend, baseRecv, baseSendSize, baseRecvSize, sendRegRecord, recvRegRecord, outRegBufUsed, outRegBufSend, outRegBufRecv));
+  // NOTE(review): the removed code returned regPtr + (sendbuff - baseSend); base addresses are passed here, so confirm the returned pointers still account for that offset.
+  if (*outRegBufUsed) {
+    if (sendRegRecord) {
+      NCCLCHECK(ncclCalloc(&sendRecord, 1)); // checked allocation (was an unchecked malloc); the record is released with free() in cleanupNvls
+      sendRecord->base.fn = cleanupNvls;
+      sendRecord->reg = sendRegRecord;
+      sendRecord->comm = comm;
       ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)sendRecord);
       *nCleanupQueueEltsAdded += 1;
     }
 
-    if (recvRecord) {
-      *outRegBufRecv = (void*)((uintptr_t)regRecvPtr + (uintptr_t)recvbuff - (uintptr_t)baseRecv);
+    if (recvRegRecord) {
+      NCCLCHECK(ncclCalloc(&recvRecord, 1)); // checked allocation (was an unchecked malloc); the record is released with free() in cleanupNvls
+      recvRecord->base.fn = cleanupNvls;
+      recvRecord->reg = recvRegRecord;
+      recvRecord->comm = comm;
       ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)recvRecord);
       *nCleanupQueueEltsAdded += 1;
     }
-
-    INFO(NCCL_NVLS, "rank %d successfully graph-registered sendbuff %p, recvbuff %p, sendbuff size %ld (register size %ld, sendGran %ld), recvbuff size %ld (register size %ld, recvGran %ld), reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, baseSendSize, sendGran, recvbuffSize, baseRecvSize, recvGran, (void*)regSendPtr, (void*)regRecvPtr);
+  } else {
+    if (sendbuff) NCCLCHECK(ncclCommGraphDeregister(comm, sendRegRecord)); // registration ended up unused: undo it right away
+    if (recvbuff) NCCLCHECK(ncclCommGraphDeregister(comm, recvRegRecord)); // registration ended up unused: undo it right away
   }
 
-  *outRegBufUsed = localRegBufUsed;
-  free(regBufFlags);
-  free(rdata);
-  /* always return success. */
   return ncclSuccess;
-fail:
-  localRegBufUsed = false;
-  goto exit;
 }
 
 #else
@@ -936,19 +825,19 @@ ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm) {
 
 ncclResult_t ncclNvlsGraphRegisterBuffer(
     struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize,
-    bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv,
+    int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, // stub in the #else branch: only *outRegBufUsed is written (false, i.e. 0)
     struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, int* nCleanupQueueEltsAdded
   ) {
   *outRegBufUsed = false;
   return ncclSuccess;
 }
 
-ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) {
+ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { // stub in the #else branch: reports "not registered" and leaves the output pointers untouched
   *outRegBufUsed = false;
   return ncclSuccess;
 }
 
-ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) {
+ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) { // stub in the #else branch: ignores all arguments and does nothing
   return ncclSuccess;
 }
 
diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
index 6569ae175e..3ae514e450 100644
--- a/src/transport/p2p.cc
+++ b/src/transport/p2p.cc
@@ -91,6 +91,8 @@ struct p2pCuMemProxyInfo {
 
 #include <sys/types.h>
 
+NCCL_PARAM(LegacyCudaRegister, "LEGACY_CUDA_REGISTER", 0); // default 0: ipcRegisterBuffer only exports buffers through cudaIpcGetMemHandle when this is non-zero
+
 /* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */
 static int busIdToCudaDev(int64_t busId) {
   int ndev;
@@ -120,21 +122,9 @@ extern int64_t ncclParamMNNVLEnable();
 ncclResult_t p2pCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
   initCeOperation();
 
-  // MNNVL support
-  if (comm->MNNVL && info1->hostHash != info2->hostHash) {
-    NCCLCHECK(ncclTopoCheckMNNVL(comm->topo, info1, info2, ret));
-    if (*ret) return ncclSuccess;
-  }
-
-  // Rule out different nodes / isolated containers
-  if (info1->hostHash != info2->hostHash || info1->shmDev != info2->shmDev) {
-    *ret = 0;
-    return ncclSuccess;
-  }
-
   // Check topology / p2p level.
   int intermediateRank;
-  NCCLCHECK(ncclTopoCheckP2p(comm->topo, info1->rank, info2->rank, ret, NULL, &intermediateRank));
+  NCCLCHECK(ncclTopoCheckP2p(comm, comm->topo, info1->rank, info2->rank, ret, NULL, &intermediateRank));
   if (*ret == 0) return ncclSuccess;
   if (intermediateRank != -1) {
     if (useMemcpy) *ret = 0;
@@ -149,6 +139,12 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph
     return ncclSuccess;
   }
 
+  if (info1->hostHash != comm->peerInfo[comm->rank].hostHash ||
+      info1->hostHash != info2->hostHash) {
+    // If either peer is non-local then we are done.
+    return ncclSuccess;
+  }
+
   // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
   int cudaDev1 = busIdToCudaDev(info1->busId);
   int cudaDev2 = busIdToCudaDev(info2->busId);
@@ -313,11 +309,11 @@ NCCL_PARAM(P2pDirectDisable, "P2P_DIRECT_DISABLE", 0);
 
 #define P2P_SAME_PID(MYINFO, PEERINFO) ((MYINFO->hostHash == PEERINFO->hostHash) && (MYINFO->pidHash == PEERINFO->pidHash))
 
-static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* read, int* intermediateRank) {
+static ncclResult_t p2pGetInfo(struct ncclComm* comm, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* read, int* intermediateRank) {
   int p2p;
   // Queries the topology to see if the GPUs are Ampere and
   // connected via NVLink, if so we enable P2P Read by default
-  NCCLCHECK(ncclTopoCheckP2p(topo, info1->rank, info2->rank, &p2p, read, intermediateRank));
+  NCCLCHECK(ncclTopoCheckP2p(comm, comm->topo, info1->rank, info2->rank, &p2p, read, intermediateRank));
 
   int readEnable = ncclParamP2pReadEnable();
   if (readEnable != -2) *read = readEnable;
@@ -367,7 +363,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
   NCCLCHECK(ncclCalloc(&resources, 1));
   send->transportResources = resources;
   int useRead, intermediateRank;
-  NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
+  NCCLCHECK(p2pGetInfo(comm, myInfo, peerInfo, &useRead, &intermediateRank));
   if (useMemcpy) useRead = 0;
 
   static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
@@ -386,7 +382,6 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
     info->rank = myInfo->rank;
     if (P2P_SAME_PID(myInfo, peerInfo) && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
       resources->type = P2P_DIRECT;
-      send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
       INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/direct pointer%s",
           channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, useReadStr);
     } else {
@@ -402,8 +397,8 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
         INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/IPC%s%s",
              channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, useReadStr, useMemcpy ? "/CE" : "");
       }
-      send->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
     }
+    send->conn.flags |= info->read ? NCCL_P2P_READ : NCCL_P2P_WRITE;
   } else {
     resources->type = P2P_INTERMEDIATE;
     info->rank = intermediateRank;
@@ -437,7 +432,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
   NCCLCHECK(ncclCalloc(&resources, 1));
   recv->transportResources = resources;
   int useRead, intermediateRank;
-  NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
+  NCCLCHECK(p2pGetInfo(comm, myInfo, peerInfo, &useRead, &intermediateRank));
 
   static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
   struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
@@ -454,7 +449,6 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
     info->rank = myInfo->rank;
     if (P2P_SAME_PID(myInfo, peerInfo) && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
       resources->type = P2P_DIRECT;
-      recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
     } else {
       if (ncclCuMemEnable()) {
         // cuMem API support
@@ -465,8 +459,8 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
         // Legacy CUDA IPC
         resources->type = P2P_IPC;
       }
-      recv->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
     }
+    recv->conn.flags |= info->read ? NCCL_P2P_READ : NCCL_P2P_WRITE;
   } else {
     resources->type = P2P_INTERMEDIATE;
     info->rank = intermediateRank;
@@ -807,9 +801,8 @@ static ncclResult_t p2pSendProxyProgress(struct ncclProxyState* proxyState, stru
   return ncclSuccess;
 }
 
-ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut) {
-  ncclResult_t ret = ncclSuccess;
-  struct ncclReg *regRecord = NULL;
+static ncclResult_t ipcRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, struct ncclReg* regRecord, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut, bool* isLegacyIpc) {
+ncclResult_t ret = ncclSuccess;
   struct ncclIpcRegInfo* newInfo = NULL;
   uintptr_t* peerRmtAddrs = NULL;
   bool legacyIpcCap = false;
@@ -820,121 +813,149 @@ ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, si
   *regBufFlag = 0;
   *offsetOut = 0;
   *peerRmtAddrsOut = NULL;
-  if (comm && userbuff && buffSize > 0 && nPeers > 0) {
-    NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, &regRecord), ret, fail);
-    if (regRecord) {
-      // buffer was registered by by users, we need to start to register or reuse it
-      int peerLocalRank;
-      for (int p = 0; p < nPeers; p++) {
-        int peerRank = peerRanks[p];
-        peerLocalRank = comm->rankToLocalRank[peerRank];
-        if (regRecord->ipcInfos[peerLocalRank]) {
-          // We already have IPC info for peerLocalRank, no need to register it, we can reuse it
-          *regBufFlag = 1;
-          INFO(NCCL_REG, "rank %d - IPC local reuse buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p", comm->rank, userbuff, buffSize, (void*)regRecord->addr, regRecord->pages * comm->regCache.pageSize, peerRank, regRecord->ipcInfos[peerLocalRank]->impInfo.rmtRegAddr);
-        } else {
-          // Register buffer with peerLocalRank
-          struct ncclProxyConnector* proxyConn = NULL;
-          struct p2pIpcExpInfo ipcInfo;
+  if (isLegacyIpc) *isLegacyIpc = false;
+  if (regRecord) {
+    // buffer was registered by users, we need to start to register or reuse it
+    int peerLocalRank;
+    for (int p = 0; p < nPeers; p++) {
+      int peerRank = peerRanks[p];
+      peerLocalRank = comm->rankToLocalRank[peerRank];
+      if (regRecord->ipcInfos[peerLocalRank]) {
+        // We already have IPC info for peerLocalRank, no need to register it, we can reuse it
+        *regBufFlag = 1;
+        if (isLegacyIpc) *isLegacyIpc = regRecord->ipcInfos[peerLocalRank]->impInfo.legacyIpcCap;
+        INFO(NCCL_REG, "rank %d - IPC reuse buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p", comm->rank, userbuff, buffSize, (void*)regRecord->addr, regRecord->pages * comm->regCache.pageSize, peerRank, regRecord->ipcInfos[peerLocalRank]->impInfo.rmtRegAddr);
+      } else {
+        // Register buffer with peerLocalRank
+        struct ncclProxyConnector* proxyConn = NULL;
+        struct p2pIpcExpInfo ipcInfo;
 
-          if (baseAddr == NULL) {
-            CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail);
-            CUCHECKGOTO(cuPointerGetAttribute((void*)&legacyIpcCap, CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, (CUdeviceptr)baseAddr), ret, fail);
-          }
-          if (comm->gproxyConn[peerRank].initialized == false)
-            NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, peerRank, &comm->gproxyConn[peerRank]), ret, fail);
-          proxyConn = &comm->gproxyConn[peerRank];
+        if (baseAddr == NULL) {
+          CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail);
+          CUCHECKGOTO(cuPointerGetAttribute((void*)&legacyIpcCap, CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, (CUdeviceptr)baseAddr), ret, fail);
+        }
+        if (comm->gproxyConn[peerRank].initialized == false)
+          NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, peerRank, &comm->gproxyConn[peerRank]), ret, fail);
+        proxyConn = &comm->gproxyConn[peerRank];
 
-          ipcInfo.legacyIpcCap = legacyIpcCap;
-          // Get the mem handle for that buffer. It may have been allocated through cudaMalloc in which case we'll
-          // get the CUDA legacy mem handle, or through cuMem*.
-          if (ipcInfo.legacyIpcCap) {
-            // legacy export
-            if (comm->directMode) goto fail;
+        // Get the mem handle for that buffer. It may have been allocated through cudaMalloc in which case we'll
+        // get the CUDA legacy mem handle, or through cuMem*.
+        if (ncclCuMemEnable()) {
+          CUmemGenericAllocationHandle handle;
+          if (CUPFN(cuMemRetainAllocationHandle(&handle, baseAddr)) != CUDA_SUCCESS) {
+            // if cuMem* export fails, retry legacy export
+            if (comm->directMode || !ncclParamLegacyCudaRegister()) goto fail;
             CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
-          } else if (ncclCuMemEnable()) {
-            CUmemGenericAllocationHandle handle;
-            if (CUPFN(cuMemRetainAllocationHandle(&handle, baseAddr)) != CUDA_SUCCESS) {
-              // if cuMem* export fails, retry legacy export
-              if (comm->directMode) goto fail;
-              CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
-              ipcInfo.legacyIpcCap = true;
+            ipcInfo.legacyIpcCap = true;
+            if (isLegacyIpc) *isLegacyIpc = true;
+          } else {
+            ipcInfo.legacyIpcCap = false;
+            if (isLegacyIpc) *isLegacyIpc = false;
+            // cuMem* export to file descriptor or fabric handle
+            if (proxyConn->sameProcess) {
+              memcpy(&ipcInfo.ipcDesc.memHandle, &handle, sizeof(CUmemGenericAllocationHandle));
             } else {
-              // cuMem* export to file descriptor or fabric handle
-              if (proxyConn->sameProcess) {
-                memcpy(&ipcInfo.ipcDesc.memHandle, &handle, sizeof(CUmemGenericAllocationHandle));
+              if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
+                int expFd = -1;
+                CUCHECKGOTO(cuMemExportToShareableHandle(&expFd, handle, ncclCuMemHandleType, 0), ret, fail);
+                NCCLCHECKGOTO(ncclProxyClientQueryFdBlocking(comm, proxyConn, expFd, &ipcInfo.impFd), ret, fail);
+                SYSCHECKGOTO(close(expFd), "close", ret, fail);
               } else {
-                if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
-                  int expFd = -1;
-                  CUCHECKGOTO(cuMemExportToShareableHandle(&expFd, handle, ncclCuMemHandleType, 0), ret, fail);
-                  NCCLCHECKGOTO(ncclProxyClientQueryFdBlocking(comm, proxyConn, expFd, &ipcInfo.impFd), ret, fail);
-                  SYSCHECKGOTO(close(expFd), "close", ret, fail);
-                } else {
-                  // Allow this to silently fail for cases where the user buff cannot be registered
-                  if (CUPFN(cuMemExportToShareableHandle(&ipcInfo.ipcDesc.cuDesc.handle, handle, ncclCuMemHandleType, 0)) != CUDA_SUCCESS) {
-                    CUCHECKGOTO(cuMemRelease(handle), ret, fail);
-                    goto fail;
-                  }
+                // Allow this to silently fail for cases where the user buff cannot be registered
+                if (CUPFN(cuMemExportToShareableHandle(&ipcInfo.ipcDesc.cuDesc.handle, handle, ncclCuMemHandleType, 0)) != CUDA_SUCCESS) {
+                  CUCHECKGOTO(cuMemRelease(handle), ret, fail);
+                  goto fail;
                 }
               }
-              CUCHECKGOTO(cuMemRelease(handle), ret, fail);
             }
-          } else {
-            // nothing works, just return
-            goto fail;
+            CUCHECKGOTO(cuMemRelease(handle), ret, fail);
           }
-
-          void* rmtRegAddr = NULL;
-          ipcInfo.size = baseSize;
-          ipcInfo.offset = regRecord->addr - (uintptr_t)baseAddr;
-          // Now ipcInfo contains all necessary registration info. Start to register buffer on proxy side
-          // and get the remote register address back.
-          if (proxyConn)
-            NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail);
-          if (rmtRegAddr) {
-            NCCLCHECKGOTO(ncclCalloc(&newInfo, 1), ret, fail);
-            assert(regRecord->ipcInfos[peerLocalRank] == NULL);
-            regRecord->state |= IPC_REG_COMPLETE;
-            newInfo->peerRank = peerRank;
-            newInfo->baseAddr = baseAddr;
-            newInfo->impInfo.rmtRegAddr = rmtRegAddr;
-            newInfo->impInfo.offset = ipcInfo.offset;
-            newInfo->impInfo.legacyIpcCap = ipcInfo.legacyIpcCap;
-            newInfo->ipcProxyconn = proxyConn;
-            regRecord->ipcInfos[peerLocalRank] = newInfo;
-            if (regRecord->regIpcAddrs.hostPeerRmtAddrs == NULL) {
-              NCCLCHECKGOTO(ncclCalloc(&regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks), ret, fail);
-            }
-            regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank] = (uintptr_t)rmtRegAddr;
-            needUpdate = true;
-            *regBufFlag = 1;
-            INFO(NCCL_REG, "rank %d - IPC local register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr);
-          }
-        }
-      }
-
-      if (*regBufFlag) {
-        if (type == NCCL_IPC_COLLECTIVE) {
-          // for collective, store registered remote buffers into dev memory for future reference
-          if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL || needUpdate) {
-            NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
-            if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL)
-              NCCLCHECKGOTO(ncclCudaCallocAsync(&regRecord->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
-            if (needUpdate)
-              NCCLCHECKGOTO(ncclCudaMemcpyAsync(regRecord->regIpcAddrs.devPeerRmtAddrs, regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
-            NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail);
-            NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail);
-          }
-          peerRmtAddrs = regRecord->regIpcAddrs.devPeerRmtAddrs;
+        } else if (legacyIpcCap) {
+          // legacy export
+          if (comm->directMode || !ncclParamLegacyCudaRegister()) goto fail;
+          CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
+          ipcInfo.legacyIpcCap = true;
+          if (isLegacyIpc) *isLegacyIpc = true;
         } else {
-          assert(nPeers == 1);
-          // p2p always returns remote addr here since remote buffer addr is passed in ncclDevWorkP2p struct
-          peerRmtAddrs = (uintptr_t*)regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank];
+          // no usable IPC export path for this buffer; give up on registering it
+          goto fail;
+        }
+
+        void* rmtRegAddr = NULL;
+        ipcInfo.size = baseSize;
+        ipcInfo.offset = regRecord->addr - (uintptr_t)baseAddr;
+        // Now ipcInfo contains all necessary registration info. Start to register buffer on proxy side
+        // and get the remote register address back.
+        if (proxyConn)
+          NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail);
+        if (rmtRegAddr) {
+          NCCLCHECKGOTO(ncclCalloc(&newInfo, 1), ret, fail);
+          assert(regRecord->ipcInfos[peerLocalRank] == NULL);
+          regRecord->state |= IPC_REG_COMPLETE;
+          newInfo->peerRank = peerRank;
+          newInfo->baseAddr = baseAddr;
+          newInfo->impInfo.rmtRegAddr = rmtRegAddr;
+          newInfo->impInfo.offset = ipcInfo.offset;
+          newInfo->impInfo.legacyIpcCap = ipcInfo.legacyIpcCap;
+          newInfo->ipcProxyconn = proxyConn;
+          regRecord->ipcInfos[peerLocalRank] = newInfo;
+          if (regRecord->regIpcAddrs.hostPeerRmtAddrs == NULL) {
+            NCCLCHECKGOTO(ncclCalloc(&regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks), ret, fail);
+          }
+          regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank] = (uintptr_t)rmtRegAddr;
+          needUpdate = true;
+          *regBufFlag = 1;
+          INFO(NCCL_REG, "rank %d - IPC register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr);
         }
-        *offsetOut = (uintptr_t)userbuff - regRecord->addr;
-        *peerRmtAddrsOut = peerRmtAddrs;
       }
     }
+
+    if (*regBufFlag) {
+      if (type == NCCL_IPC_COLLECTIVE) {
+        // for collective, store registered remote buffers into dev memory for future reference
+        if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL || needUpdate) {
+          NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
+          if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL)
+            NCCLCHECKGOTO(ncclCudaCallocAsync(&regRecord->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
+          if (needUpdate)
+            NCCLCHECKGOTO(ncclCudaMemcpyAsync(regRecord->regIpcAddrs.devPeerRmtAddrs, regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
+          NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail);
+          NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail);
+        }
+        peerRmtAddrs = regRecord->regIpcAddrs.devPeerRmtAddrs;
+      } else {
+        assert(nPeers == 1);
+        // p2p always returns remote addr here since remote buffer addr is passed in ncclDevWorkP2p struct
+        peerRmtAddrs = (uintptr_t*)regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank];
+      }
+      *offsetOut = (uintptr_t)userbuff - regRecord->addr;
+      *peerRmtAddrsOut = peerRmtAddrs;
+    }
+  }
+exit:
+  return ret;
+fail:
+  *regBufFlag = 0;
+  *offsetOut = 0;
+  *peerRmtAddrsOut = NULL;
+  if (newInfo) free(newInfo);
+  WARN("rank %d failed to IPC register userbuff %p buffSize %ld nPeers %d isLegacyIpc %p", comm->rank, userbuff, buffSize, nPeers, isLegacyIpc);
+  goto exit;
+}
+
+ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut) {
+  ncclResult_t ret = ncclSuccess;
+  struct ncclReg *regRecord = NULL;
+  bool isValid = false;
+
+  *regBufFlag = 0;
+  *offsetOut = 0;
+  *peerRmtAddrsOut = NULL;
+  if (comm && userbuff && buffSize > 0 && nPeers > 0) {
+    NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, &regRecord), ret, fail);
+    NCCLCHECKGOTO(ncclRegLocalIsValid(regRecord, &isValid), ret, fail);
+    if (isValid)
+      NCCLCHECKGOTO(ipcRegisterBuffer(comm, userbuff, buffSize, peerRanks, nPeers, type, regRecord, regBufFlag, offsetOut, peerRmtAddrsOut, NULL), ret, fail);
   }
 
 exit:
@@ -943,147 +964,56 @@ fail:
   *regBufFlag = 0;
   *offsetOut = 0;
   *peerRmtAddrsOut = NULL;
-  if (newInfo) free(newInfo);
   goto exit;
 }
 
 struct ncclIpcCleanupCallback {
   struct ncclCommCallback base;
-  bool isAddrs;
-  union {
-    struct ncclIpcRegInfo regInfo;
-    struct ncclPeerRegIpcAddr regIpcAddrs;
-  };
+  struct ncclComm *comm;
+  struct ncclReg *reg;
 };
 
 static ncclResult_t cleanupIpc(struct ncclComm* comm, struct ncclCommCallback* cb) {
   struct ncclIpcCleanupCallback* obj = (struct ncclIpcCleanupCallback*)cb;
-  if (obj->isAddrs) {
-    if (obj->regIpcAddrs.hostPeerRmtAddrs)
-      free(obj->regIpcAddrs.hostPeerRmtAddrs);
-    if (obj->regIpcAddrs.devPeerRmtAddrs)
-      NCCLCHECK(ncclCudaFree(obj->regIpcAddrs.devPeerRmtAddrs));
-  } else {
-    NCCLCHECK(ncclIpcDeregBuffer(comm, &obj->regInfo));
-  }
+  NCCLCHECK(ncclCommGraphDeregister(obj->comm, obj->reg));
   free(obj);
   return ncclSuccess;
 }
 
 ncclResult_t ncclIpcGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut, void* cleanupQueuePtr, int* nCleanupQueueElts) {
   ncclResult_t ret = ncclSuccess;
-  struct ncclProxyConnector* proxyConn = NULL;
-  struct p2pIpcExpInfo ipcInfo;
   void* baseAddr;
   size_t baseSize;
   struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue = reinterpret_cast<struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>*>(cleanupQueuePtr);
-  uintptr_t* peerRmtAddrs = NULL;
-  struct ncclIpcCleanupCallback* addrsRecord = NULL;
+  bool isLegacyIpc = false;
+  struct ncclReg *regRecord = NULL;
 
   *regBufFlag = 0;
-  CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail);
-  CUCHECKGOTO(cuPointerGetAttribute((void*)&ipcInfo.legacyIpcCap, CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, (CUdeviceptr)baseAddr), ret, fail);
-
-  if (type == NCCL_IPC_COLLECTIVE) {
-    // collective needs host memory array to hold all remote buffer addrs.
-    // We need to put this into graph release queue
-    NCCLCHECKGOTO(ncclCalloc(&addrsRecord, 1), ret, fail);
-    addrsRecord->base.fn = cleanupIpc;
-    addrsRecord->isAddrs = true;
-    NCCLCHECKGOTO(ncclCalloc(&addrsRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks), ret, fail);
-  } else {
-    assert(nPeers == 1);
-    // p2p does not need anything, just returning the remote buffer is enough, but for now, we register
-    // peer one by one so nPeers must be 1
-  }
-
-  for (int p = 0; p < nPeers; ++p) {
-    int peerRank = peerRanks[p];
-    if (comm->gproxyConn[peerRank].initialized == false)
-      NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, peerRank, &comm->gproxyConn[peerRank]), ret, fail);
-    proxyConn = &comm->gproxyConn[peerRank];
-    // Same as local registration. Get the mem handle for that buffer. It may have been allocated through
-    // cudaMalloc in which case we'll get the CUDA legacy mem handle, or through cuMem*.
-    if (ipcInfo.legacyIpcCap) {
-      if (comm->directMode) goto fail;
-      CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
-    } else if (ncclCuMemEnable()) {
-      // cuMem* export
-      CUmemGenericAllocationHandle handle;
-      if (pfn_cuMemRetainAllocationHandle(&handle, baseAddr) != CUDA_SUCCESS) {
-        if (comm->directMode) goto fail;
-        CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
-        ipcInfo.legacyIpcCap = true;
-      } else {
-        if (proxyConn->sameProcess) {
-          memcpy(&ipcInfo.ipcDesc.memHandle, &handle, sizeof(CUmemGenericAllocationHandle));
-        } else {
-          if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
-            int expFd = -1;
-            CUCHECKGOTO(cuMemExportToShareableHandle(&expFd, handle, ncclCuMemHandleType, 0), ret, fail);
-            if (proxyConn->sameProcess) {
-              ipcInfo.impFd = expFd;
-            } else {
-              NCCLCHECKGOTO(ncclProxyClientQueryFdBlocking(comm, proxyConn, expFd, &ipcInfo.impFd), ret, fail);
-              SYSCHECKGOTO(close(expFd), "close", ret, fail);
-            }
-          } else {
-            CUCHECKGOTO(cuMemExportToShareableHandle(&ipcInfo.ipcDesc.cuDesc.handle, handle, ncclCuMemHandleType, 0), ret, fail);
-          }
-        }
-        CUCHECKGOTO(cuMemRelease(handle), ret, fail);
-      }
-    } else {
-      goto fail;
-    }
-
-    void* rmtRegAddr = NULL;
-    ipcInfo.size = baseSize;
-    ipcInfo.offset = 0;
-    NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(struct p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail);
-    if (rmtRegAddr) {
+  *offsetOut = 0;
+  *peerRmtAddrsOut = NULL;
+  if (comm && userbuff && buffSize > 0 && nPeers > 0) {
+    CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail);
+    NCCLCHECKGOTO(ncclCommGraphRegister(comm, baseAddr, baseSize, (void**)&regRecord), ret, fail);
+    NCCLCHECKGOTO(ipcRegisterBuffer(comm, userbuff, buffSize, peerRanks, nPeers, type, regRecord, regBufFlag, offsetOut, peerRmtAddrsOut, &isLegacyIpc), ret, fail);
+    if (*regBufFlag) {
       struct ncclIpcCleanupCallback* record;
       NCCLCHECKGOTO(ncclCalloc(&record, 1), ret, fail);
       record->base.fn = cleanupIpc;
-      record->isAddrs = false;
-      record->regInfo.peerRank = peerRank;
-      record->regInfo.baseAddr = baseAddr;
-      record->regInfo.impInfo.rmtRegAddr = rmtRegAddr;
-      record->regInfo.impInfo.offset = 0;
-      record->regInfo.impInfo.legacyIpcCap = ipcInfo.legacyIpcCap;
-      record->regInfo.ipcProxyconn = proxyConn;
-      // store the remote address into host addr array
-      if (type == NCCL_IPC_COLLECTIVE)
-        addrsRecord->regIpcAddrs.hostPeerRmtAddrs[comm->rankToLocalRank[peerRank]] = (uintptr_t)rmtRegAddr;
-      else
-        peerRmtAddrs = (uintptr_t*)rmtRegAddr;
-      *regBufFlag = 1;
-      if (ipcInfo.legacyIpcCap)
-        ncclIntruQueueEnqueue(&comm->legacyRegCleanupQueue, &record->base);
-      else
-        ncclIntruQueueEnqueue(cleanupQueue, &record->base);
-      if (nCleanupQueueElts) *nCleanupQueueElts += 1;
-      INFO(NCCL_REG, "rank %d - IPC graph register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, baseAddr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - (uintptr_t)baseAddr);
+      record->comm = comm;
+      record->reg = regRecord;
+      if (isLegacyIpc) {
+        ncclIntruQueueEnqueue(&comm->legacyRegCleanupQueue, (struct ncclCommCallback*)record);
+      } else {
+        ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)record);
+        if (nCleanupQueueElts) *nCleanupQueueElts += 1;
+      }
+    } else {
+      NCCLCHECKGOTO(ncclCommGraphDeregister(comm, regRecord), ret, fail);
     }
   }
 
-  if (type == NCCL_IPC_COLLECTIVE) {
-    // allocate the dev addr array and copy all previously stored addrs into it.
-    NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
-    NCCLCHECKGOTO(ncclCudaCallocAsync(&addrsRecord->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
-    NCCLCHECKGOTO(ncclCudaMemcpyAsync(addrsRecord->regIpcAddrs.devPeerRmtAddrs, addrsRecord->regIpcAddrs.hostPeerRmtAddrs, comm->nRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
-    NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail);
-    NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail);
-    peerRmtAddrs = addrsRecord->regIpcAddrs.devPeerRmtAddrs;
-    if (ipcInfo.legacyIpcCap)
-      ncclIntruQueueEnqueue(&comm->legacyRegCleanupQueue, &addrsRecord->base);
-    else
-      ncclIntruQueueEnqueue(cleanupQueue, &addrsRecord->base);
-  }
-  *offsetOut = (uintptr_t)userbuff - (uintptr_t)baseAddr;
-  *peerRmtAddrsOut = peerRmtAddrs;
-
 exit:
+  // coverity[leaked_storage:FALSE] => normally, record is added to the cleanupQueue
   return ret;
 fail:
   *regBufFlag = 0;
diff --git a/src/transport/shm.cc b/src/transport/shm.cc
index 9be95fd803..d2d6906e82 100644
--- a/src/transport/shm.cc
+++ b/src/transport/shm.cc
@@ -454,6 +454,7 @@ static ncclResult_t shmRecvProxyProgress(struct ncclProxyState* proxyState, stru
 }
 
 static ncclResult_t shmSendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+  ncclResult_t result = ncclSuccess;
   struct shmRequest* req = (struct shmRequest*)reqBuff;
   /* check message size */
   if (reqSize != sizeof(struct shmRequest)) return ncclInternalError;
@@ -463,13 +464,18 @@ static ncclResult_t shmSendProxySetup(struct ncclProxyConnection* connection, st
   struct shmProxyInfo* proxyInfo;
 
   NCCLCHECK(ncclCalloc(&proxyInfo, 1));
-  NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr));
+  NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail);
   memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t));
   connection->transportResources = proxyInfo;
-  return ncclSuccess;
+exit:
+  return result;
+fail:
+  free(proxyInfo);
+  goto exit;
 }
 
 static ncclResult_t shmRecvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+  ncclResult_t result = ncclSuccess;
   struct shmRequest* req = (struct shmRequest*)reqBuff;
   /* check message size */
   if (reqSize != sizeof(struct shmRequest)) return ncclInternalError;
@@ -479,10 +485,14 @@ static ncclResult_t shmRecvProxySetup(struct ncclProxyConnection* connection, st
   struct shmProxyInfo* proxyInfo;
 
   NCCLCHECK(ncclCalloc(&proxyInfo, 1));
-  NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr));
+  NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail);
   memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t));
   connection->transportResources = proxyInfo;
-  return ncclSuccess;
+exit:
+  return result;
+fail:
+  free(proxyInfo);
+  goto exit;
 }
 
 static void initCeOperation() {
@@ -534,7 +544,7 @@ ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool l
   } else {
     char shmPath[SHM_PATH_MAX] = { '\0' };
     desc->shmli.shmSize = size;
-    NCCLCHECK(ncclShmOpen(shmPath, size, hptr, dptr, 1, &desc->shmli.handle));
+    NCCLCHECK(ncclShmOpen(shmPath, sizeof(shmPath), size, hptr, dptr, 1, &desc->shmli.handle));
     memcpy(desc->shmli.shmSuffix, shmPath + sizeof("/dev/shm/nccl-") - 1, sizeof(desc->shmli.shmSuffix));
     desc->legacy = true;
     INFO(NCCL_SHM, "MMAP allocated shareable host buffer %s size %zi ptr %p", shmPath, desc->shmli.shmSize, *hptr);
@@ -542,7 +552,7 @@ ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool l
 #else /* CUDART_VERSION >= 12020 */
   char shmPath[SHM_PATH_MAX] = { '\0' };
   desc->shmli.shmSize = size;
-  NCCLCHECK(ncclShmOpen(shmPath, size, hptr, dptr, 1, &desc->shmli.handle));
+  NCCLCHECK(ncclShmOpen(shmPath, sizeof(shmPath), size, hptr, dptr, 1, &desc->shmli.handle));
   memcpy(desc->shmli.shmSuffix, shmPath + sizeof("/dev/shm/nccl-") - 1, sizeof(desc->shmli.shmSuffix));
   desc->legacy = true;
   INFO(NCCL_SHM, "MMAP allocated shareable host buffer %s size %zi ptr %p", shmPath, size, *hptr);
@@ -618,15 +628,15 @@ ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_
     INFO(NCCL_SHM, "CUMEM imported shareable host buffer from tpProxyRank %d size %zi ptr %p, granularity %ld", desc->shmci.tpProxyRank, desc->shmci.size, descOut->shmci.ptr, granularity);
   } else {
     char shmPath[SHM_PATH_MAX];
-    sprintf(shmPath, "/dev/shm/nccl-%s", desc->shmli.shmSuffix);
-    NCCLCHECK(ncclShmOpen(shmPath, desc->shmli.shmSize, hptr, dptr, -1, &descOut->shmli.handle));
+    snprintf(shmPath, sizeof(shmPath), "/dev/shm/nccl-%s", desc->shmli.shmSuffix);
+    NCCLCHECK(ncclShmOpen(shmPath, sizeof(shmPath), desc->shmli.shmSize, hptr, dptr, -1, &descOut->shmli.handle));
     descOut->legacy = true;
     INFO(NCCL_SHM, "MMAP imported shareable host buffer %s size %zi ptr %p", shmPath, desc->shmli.shmSize, *hptr);
   }
 #else /* CUDART_VERSION >= 12020 */
   char shmPath[SHM_PATH_MAX];
-  sprintf(shmPath, "/dev/shm/nccl-%s", desc->shmli.shmSuffix);
-  NCCLCHECK(ncclShmOpen(shmPath, desc->shmli.shmSize, hptr, dptr, -1, &descOut->shmli.handle));
+  snprintf(shmPath, sizeof(shmPath), "/dev/shm/nccl-%s", desc->shmli.shmSuffix);
+  NCCLCHECK(ncclShmOpen(shmPath, sizeof(shmPath), desc->shmli.shmSize, hptr, dptr, -1, &descOut->shmli.handle));
   descOut->legacy = true;
   INFO(NCCL_SHM, "MMAP imported shareable host buffer %s size %zi ptr %p", shmPath, desc->shmli.shmSize, *hptr);
 #endif