2.24.3-1

Network user buffer support for collectives * Leverage user buffer registration to achieve zero-copy inter-node communications for Ring, NVLS and Collnet Add RAS subsystem * Create a RAS thread keeping track of all NCCL communicators. * Add a ncclras tool contacting the RAS thread and getting a report. Add fp8 support * Add support for e5m2 and e4m3 8-bit floating point operations. * Use Tree/PAT algorithms when possible for better numerical stability. Add NIC fusion * Add a NET API to ask the network plugin to fuse a set of interfaces together. * Fuse multiple NICs under the same PCI switch as a single, larger NIC. Socket connection failure retry * Retry in case of socket connection failure (unreachable host) * Avoid "Software caused connection abort" errors on retries QP connection failure retry * Retry in case of IB QP connection failure during ibv_modify_qp. NET API improvements * Allow plugins to force a flush in case data and completion ordering is not guaranteed. * Indicate when completion is not needed (e.g. for the LL128 protocol), allowing plugins to skip generating a completion. * Allow for full offload of allgather operations when using one GPU per node. NCCL_ALGO/NCCL_PROTO strict enforcement * Extend NCCL_ALGO/NCCL_PROTO syntax to be able to specify ALGO/PROTO filters for each collective operation. * Strictly enforce the ALGO/PROTO filters, no longer fall back on the ring algorithm when the filtering leaves no option and error out instead. Enable CUMEM host allocations * Use cumem functions for host memory allocation by default. Improved profiler plugin API * Avoid dependencies with NCCL includes. * Add information on whether the buffer is registered or not Adjust PAT tuning * Improve transition between PAT and ring at scale. Fix hangs when running with different CPU architectures * Detect when we use a mix of GPU architectures * Ensure Algo/Proto decisions are made based on that unified state. 
Fix FD leak in UDS * Fix a leak when mapping buffers intra-node with cumem IPCs. Fix crash when mixing buffer registration and graph buffer registration. * Separate local and graph registration to avoid crashes when we free buffers. Fix user buffer registration with dmabuf * Make ncclSend/ncclRecv communication with buffer registration functional on network plugins relying on dmabuf for buffer registration. Fix crash in IB code caused by uninitialized fields. Fix non-blocking ncclSend/ncclRecv * Fix case where ncclSend/ncclRecv would return ncclSuccess in non-blocking mode even though the operation was not enqueued onto the stream. * Issue #1495 Various compiler tweaks and fixes * PR #758 Fix typo in ncclTopoPrintGraph * Issue #1468
2024-12-18 08:26:06 -08:00
parent 2ea4ee94bf
commit 6aae379278
97 changed files with 12588 additions and 3127 deletions
@@ -60,9 +60,9 @@ of newer ones.
 The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions
 from old API versions. It also provides error codes in `err.h`.

-# API (v6)
+# API (v9)

-Below is the main `ncclNet_v6` struct. Each function is explained in later sections.
+Below is the main `ncclNet_v9` struct. Each function is explained in later sections.

 ```
 typedef struct {
@@ -73,7 +73,7 @@ typedef struct {
  // Return the number of adapters.
  ncclResult_t (*devices)(int* ndev);
  // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
  // Create a receiving object and provide a handle to connect to it. The
  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
  // between ranks to create a connection.
@@ -82,24 +82,26 @@ typedef struct {
  // This call must not block for the connection to be established, and instead
  // should return successfully with sendComm == NULL with the expectation that
  // it will be called again until sendComm != NULL.
-  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v9_t** sendDevComm);
  // Finalize connection establishment after remote peer has called connect.
  // This call must not block for the connection to be established, and instead
  // should return successfully with recvComm == NULL with the expectation that
  // it will be called again until recvComm != NULL.
-  ncclResult_t (*accept)(void* listenComm, void** recvComm);
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v9_t** recvDevComm);
  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
-  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
  /* DMA-BUF support */
  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
  ncclResult_t (*deregMr)(void* comm, void* mhandle);
  // Asynchronous send to a peer.
  // May return request == NULL if the call cannot be performed (or would block)
-  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request);
  // Asynchronous recv from a peer.
  // May return request == NULL if the call cannot be performed (or would block)
-  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request);
  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
  // visible to the GPU
  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
@@ -110,7 +112,17 @@ typedef struct {
  ncclResult_t (*closeSend)(void* sendComm);
  ncclResult_t (*closeRecv)(void* recvComm);
  ncclResult_t (*closeListen)(void* listenComm);
-} ncclNet_v6_t;
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+
+  // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
+  // what index this new vNIC exists at
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props);
+} ncclNet_v9_t;
 ```

 ## Error codes
@@ -136,11 +148,19 @@ not need to rely on CUDA, this should not be common.
 NCCL will call the `init` function first, then query the number of network devices with the
 `devices` function, getting each network device properties with `getProperties`.

+If NCCL wishes to initialize virtual devices, used in NIC fusion currently, it can call `makeVDevice`
+specifying a list of physical devices (the original devices listed from `devices`) it wishes to
+merge together. If the plugin does not support NIC fusion, it can set `makeVDevice` to null.
+
 To establish a connection between two network devices, NCCL will first call `listen` on the
 receiving side, pass the returned handle to the sender side of the connection, and call `connect`
 with that handle. Finally, `accept` will be called on the receiving side to finalize the connection
 establishment.

+`connect` and `accept` can receive an optional device comm pointer (`sendDevComm` and `recvDevComm`
+respectively) from the caller, if the caller wishes to make use of device networking. This parameter
+may be ignored by the plugin if it does not support device-side networking.
+
 Once the connection is established, communication will be done using the functions `isend`,
 `irecv` and `test`. Prior to calling `isend` or `irecv`, NCCL will call the `regMr` function on
 all buffers to allow RDMA NICs to prepare buffers. `deregMr` will be used to unregister buffers.
@@ -219,6 +239,12 @@ different offset within the original buffer, with a smaller size, etc), then der
 The call to ncclCommDeregister should call the final deregMr() and effectively remove the mapping
 on the network adapter.

+The `forceFlush` field can request the NCCL core to call flush for all transfers. By default,
+flushes are only called when the GPU architecture or PCI topology would not guarantee correct
+PCI ordering. Plugins can set it to one if the NIC operates in a mode where e.g. the data and the
+completion paths use different PCI links and therefore need a call to flush() to guarantee
+ordering.
+
 The `speed` field indicates the speed of the network port in Mbps (10^6 bits per second). This is
 important to ensure proper optimization of flows within the node.

@@ -234,6 +260,17 @@ The `maxComms` field indicates the maximum number of connections we can create.
 The `maxRecvs` field indicates the maximum number for grouped receive operations (see grouped
 receive).

+The `netDeviceType` indicates which type of device networking this plugin supports. The currently supported
+options are `NCCL_NET_DEVICE_HOST` and `NCCL_NET_DEVICE_UNPACK`.
+
+The `netDeviceVersion` indicates the version of device networking this plugin supports. Currently, this must match the associated netDeviceVersion of this netDeviceType compiled into NCCL core. Net device functionality is built as part of NCCL core's device code.
+
+The `maxP2pBytes` and `maxCollBytes` fields indicate the maximum size the plugin can handle for
+point-to-point and collective calls. This will tell the NCCL core to cut large operations into
+multiple smaller chunks if needed.
+
+`vProps` is the list of devices that have been fused into the current device. Each entry is an index pointing to the child device.
+
 ### Connection establishment

 Connections are used in an unidirectional manner. There is therefore a sender side and a receiver
@@ -332,6 +369,12 @@ handled by a single request handle.
 The sizes provided to `irecv` can (and will) be larger than the size of the `isend` operation.
 The contrary (receive size being lower than the send size) is an error, however.

+NCCL sets the request pointer in `irecv` to `NCCL_NET_OPTIONAL_RECV_COMPLETION` when it is using
+the LL or LL128 protocols. In these cases, NCCL polls on a flag embedded in the data to detect completion
+of the irecv and is resilient to redundant network writes. This allows the plugin to optimize request
+completions on such irecvs (for example, complete the request immediately). The plugin is still
+expected to set a valid request pointer on return which NCCL can poll to check for completion.
+
 Note: for a given connection, send/receive operations should always match in the order they were
 posted. Tags provided for receive operations are only used to assign a given send operation to one
 of the buffers of the first (multi-)receive in the queue, not to allow for out-of-order tag
@@ -12,6 +12,8 @@
 #include "err.h"

 #define NCCL_NET_HANDLE_MAXSIZE 128
+#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) //1TB
+#define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1

 #define NCCL_PTR_HOST 0x1
 #define NCCL_PTR_CUDA 0x2
@@ -20,6 +22,7 @@
 // Maximum number of requests per comm object
 #define NCCL_NET_MAX_REQUESTS 32

+#include "net_v9.h"
 #include "net_v8.h"
 #include "net_v7.h"
 #include "net_v6.h"
@@ -25,6 +25,7 @@ typedef struct {
 } ncclNetDeviceHandle_v7_t;

 typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
-typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t;
+typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
+typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t;

 #endif
@@ -23,8 +23,6 @@ typedef struct {
  int netDeviceVersion;            // Version number for network offload
 } ncclNetProperties_v8_t;

-typedef ncclNetProperties_v8_t ncclNetProperties_t;
-
 typedef struct {
  // Name of the network (mainly for logs)
  const char* name;
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NCCL_NET_V9_H_
+#define NCCL_NET_V9_H_
+
+#include "net_device.h"
+
+#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4
+#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V9
+typedef struct { // List of devices making up one (possibly fused) network device.
+  int ndevs; // Number of devices listed in devs[].
+  int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; // Each entry is an index pointing to a child device.
+} ncclNetVDeviceProps_v9_t;
+typedef ncclNetVDeviceProps_v9_t ncclNetVDeviceProps_t;
+
+typedef struct { // Per-device properties filled in by the plugin's getProperties().
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int regIsGlobal;                 // regMr is not tied to a particular comm
+  int forceFlush;                  // Force a flush on receives
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+  ncclNetVDeviceProps_v9_t vProps; // Devices fused into this device (indices of the child devices)
+  size_t maxP2pBytes;              // Max transfer size for point-to-point operations
+  size_t maxCollBytes;             // Max transfer size for collective operations
+} ncclNetProperties_v9_t;
+
+typedef ncclNetProperties_v9_t ncclNetProperties_t;
+
+typedef struct { // v9 network plugin interface: a name plus the plugin's entry points.
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v9_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v9_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU. Note that sizes is still int* here, whereas irecv takes size_t*.
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+
+  // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
+  // what index this new vNIC exists at. A plugin that does not support NIC fusion can set this to NULL.
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props);
+} ncclNet_v9_t;
+
+#endif // end include guard
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2024, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -7,15 +7,15 @@
 #include "net.h"

 #define __hidden __attribute__ ((visibility("hidden")))
+#define NCCL_PLUGIN_MAX_RECVS 1

 int max_requests = NCCL_NET_MAX_REQUESTS;

 __hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction) { return ncclSuccess; }
 __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; }
-
 __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; }
 __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; }
-__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v8_t* props) {
+__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) {
  // Below are default values, if unsure don't change.

  props->name = "Example";
@@ -27,6 +27,8 @@ __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v8_t* props
  props->ptrSupport = NCCL_PTR_HOST;
  // If you regMr has a fast registration cache, set to 1. If set to 0, user buffer registration may be disabled.
  props->regIsGlobal = 0;
+  // Force flush after receive. Needed if the control path and data path use a different path to the GPU
+  props->forceFlush = 0;
  // Speed in *Mbps*. 100000 means 100G
  props->speed = 100000;
  // Port number, used in conjunction with guid
@@ -36,20 +38,27 @@ __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v8_t* props
  // Maximum number of comm objects we can create.
  props->maxComms = 1024*1024;
  // Maximum number of receive operations taken by irecv().
-  props->maxRecvs = 1;
+  props->maxRecvs = NCCL_PLUGIN_MAX_RECVS;
  // Coupling with NCCL network device-side code.
-  props->netDeviceType = 0;
+  props->netDeviceType = NCCL_NET_DEVICE_HOST;
  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
-  return ncclInternalError;
+  // Used to tell NCCL core whether this is a virtual device fusing multiple physical devices.
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  // maximum transfer sizes the plugin can handle
+  props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES;
+  props->maxCollBytes = NCCL_MAX_NET_SIZE_BYTES;
+  return ncclSuccess;
 }
+
 __hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; }
-__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm) { return ncclInternalError; }
-__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** recvDevComm) { return ncclInternalError; }
 __hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) { return ncclInternalError; }
 __hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; }
 __hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;}
-__hidden ncclResult_t pluginIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { return ncclInternalError; }
-__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { return ncclInternalError; }
+__hidden ncclResult_t pluginIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { return ncclInternalError; }
+__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { return ncclInternalError; }
 __hidden ncclResult_t pluginIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { return ncclInternalError; }
 __hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return ncclInternalError; }
 __hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; }
@@ -57,10 +66,11 @@ __hidden ncclResult_t pluginCloseRecv(void* recvComm) { return ncclInternalError
 __hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalError; }
 __hidden ncclResult_t pluginIrecvConsumed(void* recvComm, int n, void* request) { return ncclInternalError; }
 __hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_mhandle) { return ncclInternalError; }
+__hidden ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { return ncclInternalError; } // Stub: always returns ncclInternalError, neither argument is touched.

 #define PLUGIN_NAME "Plugin"

-const ncclNet_v8_t ncclNetPlugin_v8 = {
+ncclNet_v9_t ncclNetPlugin_v9 = {
  .name = PLUGIN_NAME,
  .init = pluginInit,
  .devices = pluginDevices,
@@ -80,8 +90,60 @@ const ncclNet_v8_t ncclNetPlugin_v8 = {
  .closeListen = pluginCloseListen,
  .getDeviceMr = pluginGetDeviceMr,
  .irecvConsumed = pluginIrecvConsumed,
+  .makeVDevice   = pluginMakeVDevice,
 };

+__hidden ncclResult_t pluginGetProperties_v8(int dev, ncclNetProperties_v8_t* props_v8) { // v8 wrapper: query the current properties, then copy them field by field into props_v8.
+  ncclNetProperties_t props;
+  ncclResult_t ret = pluginGetProperties(dev, &props);
+  if (ret != ncclSuccess) return ret; // Propagate the failure; props_v8 is left unmodified.
+  props_v8->name = props.name;
+  props_v8->pciPath = props.pciPath;
+  props_v8->guid = props.guid;
+  props_v8->ptrSupport = props.ptrSupport;
+  props_v8->regIsGlobal = props.regIsGlobal;
+  props_v8->speed = props.speed;
+  props_v8->latency = props.latency;
+  props_v8->port = props.port;
+  props_v8->maxComms = props.maxComms;
+  props_v8->maxRecvs = props.maxRecvs;
+  props_v8->netDeviceType = props.netDeviceType;
+  props_v8->netDeviceVersion = props.netDeviceVersion;
+  return ncclSuccess; // forceFlush, vProps, maxP2pBytes and maxCollBytes are not copied here.
+}
+
+// v8 wrapper for isend: this entry point takes the size as an int, while
+// pluginIsend takes a size_t. Returns ncclInternalError for a negative size,
+// otherwise whatever pluginIsend returns.
+__hidden ncclResult_t pluginIsend_v8(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
+  // A negative int would silently widen to a huge size_t, so reject it up front.
+  if (size < 0) return ncclInternalError;
+  return pluginIsend(sendComm, data, (size_t)size, tag, mhandle, request);
+}
+
+// v8 wrapper for irecv: this entry point takes the sizes as int, while
+// pluginIrecv takes size_t. Converts the n sizes and forwards the call.
+// Returns ncclInternalError if n does not fit the conversion buffer or if a
+// size is negative, otherwise whatever pluginIrecv returns.
+__hidden ncclResult_t pluginIrecv_v8(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
+  // sizesOut holds NCCL_PLUGIN_MAX_RECVS entries; a larger n would write past its end.
+  if (n < 0 || n > NCCL_PLUGIN_MAX_RECVS) return ncclInternalError;
+  size_t sizesOut[NCCL_PLUGIN_MAX_RECVS];
+  for (int i=0; i<n; i++) {
+    // A negative int would silently widen to a huge size_t.
+    if (sizes[i] < 0) return ncclInternalError;
+    sizesOut[i] = (size_t)sizes[i];
+  }
+  // Forward the caller's n (not a hard-coded 1) so it always matches the entries filled above.
+  return pluginIrecv(recvComm, n, data, sizesOut, tags, mhandles, request);
+}
+
+const ncclNet_v8_t ncclNetPlugin_v8 = { // v8 plugin table; getProperties, isend and irecv go through the _v8 wrappers.
+  .name = PLUGIN_NAME,
+  .init = pluginInit,
+  .devices = pluginDevices,
+  .getProperties = pluginGetProperties_v8, // copies the current properties into the v8 struct
+  .listen = pluginListen,
+  .connect = pluginConnect,
+  .accept = pluginAccept,
+  .regMr = pluginRegMr,
+  .regMrDmaBuf = pluginRegMrDmaBuf,
+  .deregMr = pluginDeregMr,
+  .isend = pluginIsend_v8, // int size -> size_t
+  .irecv = pluginIrecv_v8, // int* sizes -> size_t* sizes
+  .iflush = pluginIflush,
+  .test = pluginTest,
+  .closeSend = pluginCloseSend,
+  .closeRecv = pluginCloseRecv,
+  .closeListen = pluginCloseListen,
+  .getDeviceMr = pluginGetDeviceMr,
+  .irecvConsumed = pluginIrecvConsumed,
+ };
+
 __hidden ncclResult_t pluginGetProperties_v7(int dev, ncclNetProperties_v7_t* props_v7) {
  ncclNetProperties_t props;
  ncclResult_t ret = pluginGetProperties(dev, &props);
@@ -91,6 +153,7 @@ __hidden ncclResult_t pluginGetProperties_v7(int dev, ncclNetProperties_v7_t* pr
  props_v7->guid = props.guid;
  props_v7->ptrSupport = props.ptrSupport;
  props_v7->speed = props.speed;
+  props_v7->latency = props.latency;
  props_v7->port = props.port;
  props_v7->maxComms = props.maxComms;
  props_v7->maxRecvs = props.maxRecvs;
@@ -114,8 +177,8 @@ const ncclNet_v7_t ncclNetPlugin_v7 = {
  .regMr = pluginRegMr_v7,
  .regMrDmaBuf = pluginRegMrDmaBuf,
  .deregMr = pluginDeregMr,
-  .isend = pluginIsend,
-  .irecv = pluginIrecv,
+  .isend = pluginIsend_v8,
+  .irecv = pluginIrecv_v8,
  .iflush = pluginIflush,
  .test = pluginTest,
  .closeSend = pluginCloseSend,
@@ -134,6 +197,7 @@ __hidden ncclResult_t pluginGetProperties_v6(int dev, ncclNetProperties_v6_t* pr
  props_v6->guid = props.guid;
  props_v6->ptrSupport = props.ptrSupport;
  props_v6->speed = props.speed;
+  props_v6->latency = props.latency;
  props_v6->port = props.port;
  props_v6->maxComms = props.maxComms;
  props_v6->maxRecvs = props.maxRecvs;
@@ -154,8 +218,8 @@ const ncclNet_v6_t ncclNetPlugin_v6 = {
  .regMr = pluginRegMr_v7,
  .regMrDmaBuf = pluginRegMrDmaBuf,
  .deregMr = pluginDeregMr,
-  .isend = pluginIsend,
-  .irecv = pluginIrecv,
+  .isend = pluginIsend_v8,
+  .irecv = pluginIrecv_v8,
  .iflush = pluginIflush,
  .test = pluginTest,
  .closeSend = pluginCloseSend,
@@ -174,8 +238,8 @@ const ncclNet_v5_t ncclNetPlugin_v5 = {
  .accept = pluginAccept_v6,
  .regMr = pluginRegMr_v7,
  .deregMr = pluginDeregMr,
-  .isend = pluginIsend,
-  .irecv = pluginIrecv,
+  .isend = pluginIsend_v8,
+  .irecv = pluginIrecv_v8,
  .iflush = pluginIflush,
  .test = pluginTest,
  .closeSend = pluginCloseSend,
@@ -198,11 +262,11 @@ static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* prop
  return ncclSuccess;
 }
 static ncclResult_t pluginIsend_v4(void *sendComm, void* data, int size, void *mhandle, void** request) {
-  return pluginIsend(sendComm, data, size, 0, mhandle, request);
+  return pluginIsend_v8(sendComm, data, size, 0, mhandle, request);
 }
 static ncclResult_t pluginIrecv_v4(void* recvComm, void* data, int size, void* mhandle, void** request) {
  int tag = 0;
-  return pluginIrecv(recvComm, 1, &data, &size, &tag, &mhandle, request);
+  return pluginIrecv_v8(recvComm, 1, &data, &size, &tag, &mhandle, request);
 }
 static ncclResult_t pluginIflush_v4(void* recvComm, void* data, int size, void* mhandle, void** request) {
  return pluginIflush(recvComm, 1, &data, &size, &mhandle, request);
@@ -14,6 +14,7 @@

 #define MAX_CHANNELS                     32
 #define MAX_STEPS                        16
+#define MAX_OPS                          16 // Up to 64K ranks for PAT

 #define PROXY_OP_SEND_STATE_OFFSET       (ncclProfilerProxyOpSendPosted)
 #define PROXY_OP_RECV_STATE_OFFSET       (ncclProfilerProxyOpRecvPosted)
@@ -86,7 +87,7 @@ struct taskEventBase {
  int rank;                         // rank of the operation in NCCL communicator
  const char* name;                 // FIXME: unused
  uint64_t commHash;                // communicator identifier
-  uint8_t func;                     // ncclFunc*
+  const char* func;                 // ncclFunc*
  int refCount;                     // number of references for this operation
  struct group* parent;             // parent event group
  struct taskEventBase* next;       // next top level event in group
@@ -102,16 +103,14 @@ struct collective {
  size_t count;
  size_t trafficBytes;
  int root;
-  uint8_t datatype;
+  const char* datatype;
  uint8_t nMaxChannels;
-  uint8_t algo;
-  uint8_t proto;
-  int op;
+  const char* algo;
+  const char* proto;
  int nWarps;
-  int isCollnet;
-  int isNvls;
-  struct proxyOp send[MAX_CHANNELS];// array of send proxy operation events
-  struct proxyOp recv[MAX_CHANNELS];// array of recv proxy operation events
+  struct proxyOp send[MAX_CHANNELS][MAX_OPS];// array of send proxy operation events
+  struct proxyOp recv[MAX_CHANNELS][MAX_OPS];// array of recv proxy operation events
+  int nProxyOps[MAX_CHANNELS];
 };

 struct p2p {
@@ -119,9 +118,9 @@ struct p2p {
  uint8_t func;
  void const* buff;
  size_t count;
-  uint8_t datatype;
+  const char* datatype;
  int peer;
-  struct proxyOp op;
+  struct proxyOp op[MAX_CHANNELS];
 };

 struct group {
@@ -13,6 +13,7 @@
 #include "common.h"
 #include "err.h"

+#include "profiler_v2.h"
 #include "profiler_v1.h"

 #endif // end include guard
@@ -9,16 +9,6 @@

 #include <stdint.h>

-enum {
-  ncclProfileGroup     = (1 << 0),  // group event type
-  ncclProfileColl      = (1 << 1),  // host collective call event type
-  ncclProfileP2p       = (1 << 2),  // host point-to-point call event type
-  ncclProfileProxyOp   = (1 << 3),  // proxy operation event type
-  ncclProfileProxyStep = (1 << 4),  // proxy step event type
-  ncclProfileProxyCtrl = (1 << 5),  // proxy control event type
-  ncclProfileNumEvents = (     6),
-};
-
 typedef struct {
  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
@@ -69,42 +59,8 @@ typedef struct {
  };
 } ncclProfilerEventDescr_v1_t;

-typedef enum {
-  ncclProfilerProxyOpSendPosted,
-  ncclProfilerProxyOpSendRemFifoWait,
-  ncclProfilerProxyOpSendTransmitted,
-  ncclProfilerProxyOpSendDone,
-  ncclProfilerProxyOpRecvPosted,
-  ncclProfilerProxyOpRecvReceived,
-  ncclProfilerProxyOpRecvTransmitted,
-  ncclProfilerProxyOpRecvDone,
-
-  /* Legacy proxy profiler states */
-  ncclProfilerProxyStepSendGPUWait,
-  ncclProfilerProxyStepSendWait,
-  ncclProfilerProxyStepRecvWait,
-  ncclProfilerProxyStepRecvFlushWait,
-  ncclProfilerProxyStepRecvGPUWait,
-
-  /* Legacy proxy control states */
-  ncclProfilerProxyCtrlIdle,
-  ncclProfilerProxyCtrlActive,
-  ncclProfilerProxyCtrlSleep,
-  ncclProfilerProxyCtrlWakeup,
-  ncclProfilerProxyCtrlAppend,
-  ncclProfilerProxyCtrlAppendEnd,
-} ncclProfilerEventState_v1_t;
-
-typedef union {
-  struct {
-    size_t transSize;
-    int steps;
-  } proxyOp;
-
-  struct {
-    int appendedProxyOps;
-  } proxyCtrl;
-} ncclProfilerEventStateArgs_v1_t;
+typedef ncclProfilerEventState_v2_t ncclProfilerEventState_v1_t;
+typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_v1_t;

 typedef struct {
  const char* name;
@@ -142,9 +98,4 @@ typedef struct {
  ncclResult_t (*finalize)(void* context);
 } ncclProfiler_v1_t;

-typedef ncclProfilerEventDescr_v1_t ncclProfilerEventDescr_t;
-typedef ncclProfilerEventState_v1_t ncclProfilerEventState_t;
-typedef ncclProfilerEventStateArgs_v1_t ncclProfilerEventStateArgs_t;
-typedef ncclProfiler_v1_t ncclProfiler_t;
-
 #endif
@@ -0,0 +1,146 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_PROFILER_V2_H_
+#define NCCL_PROFILER_V2_H_
+
+#include <stdint.h>
+
+enum { // event type bits; OR-ed together to form the event activation mask
+  ncclProfileGroup     = (1 << 0),  // group event type
+  ncclProfileColl      = (1 << 1),  // host collective call event type
+  ncclProfileP2p       = (1 << 2),  // host point-to-point call event type
+  ncclProfileProxyOp   = (1 << 3),  // proxy operation event type
+  ncclProfileProxyStep = (1 << 4),  // proxy step event type
+  ncclProfileProxyCtrl = (1 << 5),  // proxy control event type
+};
+
+typedef struct { // v2 event descriptor handed to the plugin's startEvent callback
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
+  int rank;                     // originating rank
+  union { // presumably the member to read is selected by `type` - confirm against NCCL core
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      const char* func;         // function name, passed as a string
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;     // datatype name, passed as a string
+      size_t trafficBytes;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      const char* algo;         // algorithm name, passed as a string
+      const char* proto;        // protocol name, passed as a string
+    } coll;
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      const char* func;         // function name, passed as a string
+      void* buff;
+      const char* datatype;     // datatype name, passed as a string
+      size_t count;
+      int peer;
+    } p2p;
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;               // non-zero for a send operation, zero for a receive
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+  };
+} ncclProfilerEventDescr_v2_t;
+
+typedef enum { // event states reported to the plugin through recordEventState
+  ncclProfilerProxyOpSendPosted,
+  ncclProfilerProxyOpSendRemFifoWait,
+  ncclProfilerProxyOpSendTransmitted,
+  ncclProfilerProxyOpSendDone,
+  ncclProfilerProxyOpRecvPosted,
+  ncclProfilerProxyOpRecvReceived,
+  ncclProfilerProxyOpRecvTransmitted,
+  ncclProfilerProxyOpRecvDone,
+
+  /* Legacy proxy profiler states */
+  ncclProfilerProxyStepSendGPUWait,
+  ncclProfilerProxyStepSendWait,
+  ncclProfilerProxyStepRecvWait,
+  ncclProfilerProxyStepRecvFlushWait,
+  ncclProfilerProxyStepRecvGPUWait,
+
+  /* Legacy proxy control states */
+  ncclProfilerProxyCtrlIdle,
+  ncclProfilerProxyCtrlActive,
+  ncclProfilerProxyCtrlSleep,
+  ncclProfilerProxyCtrlWakeup,
+  ncclProfilerProxyCtrlAppend,
+  ncclProfilerProxyCtrlAppendEnd,
+} ncclProfilerEventState_v2_t;
+
+typedef union { // optional attribute updates passed alongside a state to recordEventState
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v2_t;
+
+typedef struct {
+  const char* name; // name of the profiler plugin
+
+  // init - initialize the profiler plugin
+  // Output
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  //                     (returned to the caller through the void** argument)
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_v2_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eState    : event state transition
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v2_t;
+
+typedef ncclProfilerEventDescr_v2_t ncclProfilerEventDescr_t;
+typedef ncclProfilerEventState_v2_t ncclProfilerEventState_t;
+typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_t;
+typedef ncclProfiler_v2_t ncclProfiler_t;
+
+#endif
@@ -21,11 +21,18 @@
 static int initialized;             // initialization counter for profiler
 static double startTime;            // profiler start time

-static int groupPoolSize = 16;
-static int collPoolSize = 16;
-static int p2pPoolSize = 1024;
-static int proxyCtrlPoolSize = 16;
-static int detachPoolSize = 128;
+static const int defaultEActivationMask = ncclProfileColl | ncclProfileP2p;
+static const int defaultGroupPoolSize = 16;
+static const int defaultCollPoolSize = 16;
+static const int defaultP2pPoolSize = 1024;
+static const int defaultProxyCtrlPoolSize = 16;
+static const int defaultDetachPoolSize = 128;
+
+static int groupPoolSize;
+static int collPoolSize;
+static int p2pPoolSize;
+static int proxyCtrlPoolSize;
+static int detachPoolSize;
 static int detachPoolBase;
 static int detachPoolIndex;
 static int detachPoolDone;
@@ -56,25 +63,25 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask)
  pthread_mutex_lock(&lock);
  if (__atomic_fetch_add(&initialized, 1, __ATOMIC_RELAXED) == 0) {
    // first thread initializes event mask, environment and detach pool
-    __atomic_store_n(eActivationMask, ncclProfileColl | ncclProfileP2p, __ATOMIC_RELAXED);
-    if (getenv("NCCL_PROFILE_EVENT_MASK")) {
-      __atomic_store_n(eActivationMask, atoi(getenv("NCCL_PROFILE_EVENT_MASK")), __ATOMIC_RELAXED);
-    }
-    if (getenv("NCCL_PROFILE_GROUP_POOL_SIZE")) {
-      groupPoolSize = atoi(getenv("NCCL_PROFILE_GROUP_POOL_SIZE"));
-    }
-    if (getenv("NCCL_PROFILE_COLL_POOL_SIZE")) {
-      collPoolSize = atoi(getenv("NCCL_PROFILE_COLL_POOL_SIZE"));
-    }
-    if (getenv("NCCL_PROFILE_P2P_POOL_SIZE")) {
-      p2pPoolSize = atoi(getenv("NCCL_PROFILE_P2P_POOL_SIZE"));
-    }
-    if (getenv("NCCL_PROFILE_PROXY_CTRL_POOL_SIZE")) {
-      proxyCtrlPoolSize = atoi(getenv("NCCL_PROFILE_PROXY_CTRL_POOL_SIZE"));
-    }
-    if (getenv("NCCL_PROFILE_PROXY_DETACH_POOL_SIZE")) {
-      detachPoolSize = atoi(getenv("NCCL_PROFILE_PROXY_DETACH_POOL_SIZE"));
-    }
+    const char* str;
+    str = getenv("NCCL_PROFILE_EVENT_MASK");
+    __atomic_store_n(eActivationMask, str ? atoi(str) : defaultEActivationMask, __ATOMIC_RELAXED);
+
+    str = getenv("NCCL_PROFILE_GROUP_POOL_SIZE");
+    groupPoolSize = str ? atoi(str) : defaultGroupPoolSize;
+
+    str = getenv("NCCL_PROFILE_COLL_POOL_SIZE");
+    collPoolSize = str ? atoi(str) : defaultCollPoolSize;
+
+    str = getenv("NCCL_PROFILE_P2P_POOL_SIZE");
+    p2pPoolSize = str ? atoi(str) : defaultP2pPoolSize;
+
+    str = getenv("NCCL_PROFILE_PROXY_CTRL_POOL_SIZE");
+    proxyCtrlPoolSize = str ? atoi(str) : defaultProxyCtrlPoolSize;
+
+    str = getenv("NCCL_PROFILE_PROXY_DETACH_POOL_SIZE");
+    detachPoolSize = str ? atoi(str) : defaultDetachPoolSize;
+
    // detach pool is used to store PXN proxyOps and is shared among threads
    detachPool = (struct proxyOp *)calloc(detachPoolSize, sizeof(*detachPool));
    if (detachPool == NULL) {
@@ -107,6 +114,13 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask)
  ctx->proxyCtrlPool = (struct proxyCtrl *)calloc(proxyCtrlPoolSize, sizeof(*ctx->proxyCtrlPool));
  if (ctx->proxyCtrlPool == NULL) goto fail;

+  // Print event pool sizes for debugging
+  //fprintf(stdout, "Profiler: Group pool size (bytes): %lu\n", sizeof(struct group)*groupPoolSize);
+  //fprintf(stdout, "Profiler: Coll  pool size (bytes): %lu\n", sizeof(struct collective)*collPoolSize);
+  //fprintf(stdout, "Profiler: P2p   pool size (bytes): %lu\n", sizeof(struct p2p)*p2pPoolSize);
+  //fprintf(stdout, "Profiler: Proxy pool size (bytes): %lu\n", sizeof(struct proxyCtrl)*proxyCtrlPoolSize);
+  //fprintf(stdout, "Profiler: PXN   pool size (bytes): %lu\n", sizeof(struct proxyOp)*detachPoolSize);
+
  *context = ctx;
  return ncclSuccess;

@@ -154,7 +168,7 @@ __hidden ncclResult_t exampleProfilerFinalize(void* context) {
  free(ctx);

  // last thread cleans up shared detach pool
-  if (__atomic_fetch_sub(&initialized, 1, __ATOMIC_RELAXED) - 1 == 0) {
+  if (__atomic_sub_fetch(&initialized, 1, __ATOMIC_RELAXED) == 0) {
    start = (detachPoolIndex - detachPoolSize >= 0) ? detachPoolIndex - detachPoolSize : 0;
    end = detachPoolIndex;
    for (int i = start; i < end; i++) {
@@ -171,7 +185,7 @@ __hidden ncclResult_t exampleProfilerFinalize(void* context) {

 __hidden void updateEvent(void* handle);

-__hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr) {
+__hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) {
  *eHandle = NULL;
  struct context* ctx = (struct context *)context;
  if (eDescr->type == ncclProfileGroup) {
@@ -185,14 +199,15 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
        if (base->type == ncclProfileColl) {
          struct collective* c = (struct collective *)base;
          // reset event proxyOps & proxySteps
-          memset(c->send, 0, sizeof(struct proxyOp)*MAX_CHANNELS);
-          memset(c->recv, 0, sizeof(struct proxyOp)*MAX_CHANNELS);
+          memset(c->send, 0, sizeof(struct proxyOp)*MAX_CHANNELS*MAX_OPS);
+          memset(c->recv, 0, sizeof(struct proxyOp)*MAX_CHANNELS*MAX_OPS);
+          memset(c->nProxyOps, 0, sizeof(int)*MAX_CHANNELS);
          // release collective events in the group and return them to the collective pool
          __atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED);
        } else if (base->type == ncclProfileP2p) {
          struct p2p* p = (struct p2p *)base;
          // reset event proxyOp and proxySteps
-          memset(&p->op, 0, sizeof(struct proxyOp));
+          memset(&p->op, 0, sizeof(struct proxyOp)*MAX_CHANNELS);
          // release p2p events in the group and return them to the p2p pool
          __atomic_fetch_add(&ctx->p2pPoolBase, 1, __ATOMIC_RELAXED);
        }
@@ -203,7 +218,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
      return ncclSuccess;
    }
    event->type = ncclProfileGroup;
-    __atomic_store_n(&event->refCount, 1, __ATOMIC_RELAXED);
    event->ctx = ctx;
    event->groupId = groupId;
    event->startTs = gettime() - startTime;
@@ -238,14 +252,11 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
    event->count = eDescr->coll.count;
    event->root = eDescr->coll.root;
    event->datatype = eDescr->coll.datatype;
-    event->op = eDescr->coll.op;
    event->trafficBytes = eDescr->coll.trafficBytes;
    event->nMaxChannels = eDescr->coll.nMaxChannels;
    event->nWarps = eDescr->coll.nWarps;
    event->algo = eDescr->coll.algo;
    event->proto = eDescr->coll.proto;
-    event->isCollnet = eDescr->coll.isCollnet;
-    event->isNvls = eDescr->coll.isNvls;
    *eHandle = event;
    taskEventQueueEnqueue(parent, (struct taskEventBase *)event);
    // increment the group ref counter so the event will staty open
@@ -326,9 +337,13 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n

    if (eventBase->type == ncclProfileColl) {
      struct collective* parent = (struct collective *)eDescr->parentObj;
-      struct proxyOp* event = (eDescr->proxyOp.isSend) ? &parent->send[eDescr->proxyOp.channelId] : &parent->recv[eDescr->proxyOp.channelId];
+      int channelId = eDescr->proxyOp.channelId;
+      struct proxyOp* event = (eDescr->proxyOp.isSend) ?
+        &parent->send[channelId][parent->nProxyOps[channelId]++] :
+        &parent->recv[channelId][parent->nProxyOps[channelId]++];
+
      event->type = ncclProfileProxyOp;
-      event->channelId = eDescr->proxyOp.channelId;
+      event->channelId = channelId;
      event->pid = eDescr->proxyOp.pid;
      event->rank = eDescr->rank;
      event->peer = eDescr->proxyOp.peer;
@@ -338,13 +353,14 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
      event->parent = eventBase;
      event->startTs = gettime() - startTime;
      *eHandle = event;
-      __atomic_store_n(&parent->base.refCount, 1, __ATOMIC_RELAXED);
+      __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
      debugEvent(event, "ProxyOpStart");
    } else { // ncclProfileP2p
      struct p2p* parent = (struct p2p *)eDescr->parentObj;
-      struct proxyOp* event = &parent->op;
+      int channelId = eDescr->proxyOp.channelId;
+      struct proxyOp* event = &parent->op[channelId];
      event->type = ncclProfileProxyOp;
-      event->channelId = eDescr->proxyOp.channelId;
+      event->channelId = channelId;
      event->pid = eDescr->proxyOp.pid;
      event->rank = eDescr->rank;
      event->peer = eDescr->proxyOp.peer;
@@ -354,7 +370,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
      event->parent = eventBase;
      event->startTs = gettime() - startTime;
      *eHandle = event;
-      __atomic_store_n(&parent->base.refCount, 1, __ATOMIC_RELAXED);
+      __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
      debugEvent(event, "ProxyOpStart");
    }
 } else if (eDescr->type == ncclProfileProxyStep) {
@@ -379,7 +395,7 @@ void updateEvent(void* handle) {
  uint8_t type = *(uint8_t *)handle;
  if (type == ncclProfileGroup) {
    struct group* event = (struct group *)handle;
-    if (__atomic_fetch_sub(&event->refCount, 1, __ATOMIC_RELAXED) == 1) {
+    if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) {
      event->stopTs = gettime() - startTime;
      // return group event to the pool
      __atomic_fetch_add(&event->ctx->groupPoolBase, 1, __ATOMIC_RELAXED);
@@ -387,7 +403,7 @@ void updateEvent(void* handle) {
    debugEvent(event, "GroupStop");
  } else if (type == ncclProfileColl) {
    struct collective* event = (struct collective *)handle;
-    if (__atomic_fetch_sub(&event->base.refCount, 1, __ATOMIC_RELAXED) == 1) {
+    if (__atomic_sub_fetch(&event->base.refCount, 1, __ATOMIC_RELAXED) == 0) {
      event->base.stopTs = gettime() - startTime;
      debugEvent(event, "CollStop");
      updateEvent(event->base.parent);
@@ -396,7 +412,7 @@ void updateEvent(void* handle) {
    debugEvent(event, "CollStop");
  } else if (type == ncclProfileP2p) {
    struct p2p* event = (struct p2p *)handle;
-    if (__atomic_fetch_sub(&event->base.refCount, 1, __ATOMIC_RELAXED) == 1) {
+    if (__atomic_sub_fetch(&event->base.refCount, 1, __ATOMIC_RELAXED) == 0) {
      event->base.stopTs = gettime() - startTime;
      debugEvent(event, "P2pStop");
      updateEvent(event->base.parent);
@@ -408,7 +424,7 @@ void updateEvent(void* handle) {
    event->stopTs = gettime() - startTime;
    if (event->pid != pid) {
      // only for proxyOps that don't have a parent collective/p2p (i.e., PXN)
-      int done = __atomic_fetch_add(&detachPoolDone, 1, __ATOMIC_RELAXED) + 1;
+      int done = __atomic_add_fetch(&detachPoolDone, 1, __ATOMIC_RELAXED);
      if (done == detachPoolSize) {
        // reset the event completed (done) counter
        __atomic_store_n(&detachPoolDone, 0, __ATOMIC_RELAXED);
@@ -451,12 +467,20 @@ __hidden ncclResult_t exampleProfilerStopEvent(void* eHandle) {
    struct collective* event = (struct collective *)eHandle;
    event->base.stopTs = gettime() - startTime;
    return ncclSuccess;
+  } else if (type == ncclProfileP2p) {
+    // stopping the p2p event in NCCL core does not
+    // mean the p2p has completed. It means the p2p
+    // was submitted/enqueued so we need to keep the event open
+    struct p2p* event = (struct p2p *)eHandle;
+    event->base.stopTs = gettime() - startTime;
+    return ncclSuccess;
  }
+
  updateEvent(eHandle);
  return ncclSuccess;
 }

-__hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs) {
+__hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) {
  // the event handle might be null if we run out of events
  if (eHandle == NULL) return ncclSuccess;

@@ -482,7 +506,7 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile
  return ncclSuccess;
 }

-ncclProfiler_v1_t ncclProfiler_v1 = {
+ncclProfiler_t ncclProfiler_v2 = {
  "Example-profiler",
  exampleProfilerInit,
  exampleProfilerStartEvent,
@@ -11,56 +11,6 @@

 #define __hidden __attribute__ ((visibility("hidden")))

-__hidden const char* ncclFuncToString(int func) {
-  switch(func) {
-    case 0:
-      return "ncclBroadcast";
-    case 1:
-      return "ncclReduce";
-    case 2:
-      return "ncclAllGather";
-    case 3:
-      return "ncclReduceScatter";
-    case 4:
-      return "ncclAllReduce";
-    case 5:
-      return "ncclSendRecv";
-    case 6:
-      return "ncclSend";
-    case 7:
-      return "ncclRecv";
-  }
-  return NULL;
-}
-
-__hidden const char* ncclAlgoToString(int algo) {
-  switch(algo) {
-    case 0:
-      return "Tree";
-    case 1:
-      return "Ring";
-    case 2:
-      return "CollnetDirect";
-    case 3:
-      return "CollnetChain";
-    case 4:
-      return "Nvls";
-    case 5:
-      return "NvlsTree";
-  }
-}
-
-__hidden const char* ncclProtoToString(int proto) {
-  switch(proto) {
-    case 0:
-      return "LL";
-    case 1:
-      return "LL128";
-    case 2:
-      return "Simple";
-  }
-}
-
 // FIXME: chrome tracing asynchronous events (following used) allow event nesting for events that have same id and category
 // It appears that nesting more than three events causes issues. Therefore, every event is given an increasing id and a
 // category that matches the type of event (GROUP, COLL, P2P, PROXY, NET)
@@ -77,24 +27,24 @@ __hidden void printGroupEventTrailer(FILE* fh, struct group* event) {

 static __thread int collId;
 __hidden void printCollEventHeader(FILE* fh, struct collective* event) {
-  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": %d, \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nMaxChannels\": %d}},\n",
-          ncclFuncToString(event->base.func), collId, getpid(), 1, event->base.startTs, event->seqNumber, event->base.commHash, event->base.rank, event->count, event->datatype, ncclAlgoToString(event->algo), ncclProtoToString(event->proto), event->nMaxChannels);
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nMaxChannels\": %d}},\n",
+          event->base.func, collId, getpid(), 1, event->base.startTs, event->seqNumber, event->base.commHash, event->base.rank, event->count, event->datatype, event->algo, event->proto, event->nMaxChannels);
 }

 __hidden void printCollEventTrailer(FILE* fh, struct collective* event) {
  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
-          ncclFuncToString(event->base.func), collId++, getpid(), 1, event->base.stopTs);
+          event->base.func, collId++, getpid(), 1, event->base.stopTs);
 }

 static __thread int p2pId;
 __hidden void printP2pEventHeader(FILE* fh, struct p2p* event) {
-  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": %d}},\n",
-          ncclFuncToString(event->base.func), p2pId, getpid(), 1, event->base.startTs, event->base.commHash, event->base.rank, event->peer, event->count, event->datatype);
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": \"%s\"}},\n",
+          event->base.func, p2pId, getpid(), 1, event->base.startTs, event->base.commHash, event->base.rank, event->peer, event->count, event->datatype);
 }

 __hidden void printP2pEventTrailer(FILE* fh, struct p2p* event) {
  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
-          ncclFuncToString(event->base.func), p2pId++, getpid(), 1, event->base.stopTs);
+          event->base.func, p2pId++, getpid(), 1, event->base.stopTs);
 }

 static __thread int proxyOpId;
@@ -250,14 +200,18 @@ void printEvent(FILE* fh, void* handle) {
    struct collective* c = (struct collective *)handle;
    printCollEventHeader(fh, c);
    for (int i = 0; i < MAX_CHANNELS; i++) {
-      printEvent(fh, &c->send[i]);
-      printEvent(fh, &c->recv[i]);
+      for (int j = 0; j < c->nProxyOps[i]; j++) {
+        printEvent(fh, &c->send[i][j]);
+        printEvent(fh, &c->recv[i][j]);
+      }
    }
    printCollEventTrailer(fh, c);
  } else if (type == ncclProfileP2p) {
    struct p2p* p = (struct p2p *)handle;
    printP2pEventHeader(fh, p);
-    printEvent(fh, &p->op);
+    for (int i = 0; i < MAX_CHANNELS; i++) {
+      printEvent(fh, &p->op[i]);
+    }
    printP2pEventTrailer(fh, p);
  } else if (type == ncclProfileProxyOp) {
    struct proxyOp* p = (struct proxyOp *)handle;
@@ -67,6 +67,7 @@ typedef struct {
  //   - numPipeOps: number of operations in the group
  //   - numAlgo: number of algorithms in collCostTable
  //   - numProto: number of protocols in collCostTable
+  //   - regBuff: can register user buffer
  //
  // Outputs:
  //   - nChannels: number of channels (hence SMs) to be used.
@@ -82,15 +83,15 @@ typedef struct {
  // Unset fields will be set automatically by NCCL.
  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
                              int numPipeOps, float** collCostTable, int numAlgo, int numProto,
-                              int* nChannels);
+                              int regBuff, int* nChannels);

  // Terminates the plugin and cleans up any resources that the plugin allocated.
  // context: tuner context object
  ncclResult_t (*destroy)(void* context);
-} ncclTuner_v3_t;
+} ncclTuner_v4_t;

-typedef ncclTuner_v3_t ncclTuner_t;
+typedef ncclTuner_v4_t ncclTuner_t;

-#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3"
+#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4"

 #endif
@@ -12,10 +12,11 @@ __hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t

 __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
                              int numPipeOps, float** collCostTable, int numAlgo, int numProto,
-                              int* nChannels) {
+                              int regBuff, int* nChannels) {
  // Update NCCL core generated cost table. Updated table will be evaluated by NCCL to pick the best algo/proto combo
-  if (collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) {
-    collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0;
+  float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
+  if (table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) {
+    table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0;
  }
  *nChannels = 1;
  return ncclSuccess;
@@ -25,7 +26,7 @@ __hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; }

 #define PLUGIN_NAME "Example"

-const ncclTuner_v3_t ncclTunerPlugin_v3 = {
+const ncclTuner_v4_t ncclTunerPlugin_v4 = {
  .name = PLUGIN_NAME,
  .init = pluginInit,
  .getCollInfo = pluginGetCollInfo,
@@ -12,6 +12,7 @@ DEBUG ?= 0
 ASAN ?= 0
 UBSAN ?= 0
 TRACE ?= 0
+WERROR ?= 0
 PROFAPI ?= 1
 NVTX ?= 1
 RDMA_CORE ?= 0
@@ -115,6 +116,10 @@ ifeq ($(NVTX), 0)
 CXXFLAGS  += -DNVTX_DISABLE
 endif

+ifneq ($(WERROR), 0)
+CXXFLAGS  += -Werror
+endif
+
 ifneq ($(KEEP), 0)
 NVCUFLAGS += -keep
 endif
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
-NCCL_MINOR   := 23
-NCCL_PATCH   := 4
+NCCL_MINOR   := 24
+NCCL_PATCH   := 3
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
@@ -7,17 +7,22 @@ include ../makefiles/common.mk
 include ../makefiles/version.mk

 ##### src files
-INCEXPORTS  := nccl.h nccl_net.h
+INCEXPORTS  := nccl.h
 LIBSRCFILES := \
 	bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
-	init.cc init_nvtx.cc net.cc proxy.cc transport.cc register.cc \
+	init.cc init_nvtx.cc net.cc proxy.cc transport.cc \
 	$(wildcard graph/*.cc) \
 	$(wildcard misc/*.cc) \
-	$(wildcard transport/*.cc)
+	$(wildcard transport/*.cc) \
+	$(wildcard register/*.cc) \
+	$(filter-out ras/client.cc,$(wildcard ras/*.cc))
+BINSRCFILES := ras/client.cc

 ##### lib files
 LIBNAME     := libnccl.so
 STATICLIBNAME := libnccl_static.a
+##### binaries
+BINNAME := ncclras
 ##### pkgconfig files
 PKGCONFIGFILE := nccl.pc
 ##### dirs
@@ -26,11 +31,12 @@ INCDIR := $(BUILDDIR)/include
 LIBDIR := $(BUILDDIR)/lib
 OBJDIR := $(BUILDDIR)/obj
 PKGDIR := $(BUILDDIR)/lib/pkgconfig
+BINDIR := $(BUILDDIR)/bin
 ##### target files
 CUDARTLIB  ?= cudart_static

+# Use compatibility shim only with static cudart; see https://github.com/NVIDIA/nccl/issues/658
 ifeq ($(CUDARTLIB), cudart_static)
-	# Use compatibility shim only with static cudart; see https://github.com/NVIDIA/nccl/issues/658
 	LIBSRCFILES += enhcompat.cc
 endif

@@ -40,18 +46,21 @@ LIBTARGET  := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
 STATICLIBTARGET := $(STATICLIBNAME)
 PKGTARGET  := $(PKGCONFIGFILE)
 LIBOBJ     := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o)
-DEPFILES   := $(LIBOBJ:%.o=%.d)
+BINOBJ     := $(BINSRCFILES:%.cc=$(OBJDIR)/%.o)
+DEPFILES   := $(LIBOBJ:%.o=%.d) $(BINOBJ:%.o=%.d)
 LDFLAGS    += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl

 DEVMANIFEST := $(BUILDDIR)/obj/device/manifest

 ##### rules
-build : lib staticlib
+build : lib staticlib binary

 lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) $(PKGDIR)/$(PKGTARGET)

 staticlib : $(LIBDIR)/$(STATICLIBTARGET)

+binary : $(BINDIR)/$(BINNAME)
+
 $(DEVMANIFEST): ALWAYS_REBUILD $(INCTARGETS)
 	$(MAKE) -C ./device

@@ -85,6 +94,11 @@ $(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVMANIFEST)
 	mkdir -p $(LIBDIR)
 	ar cr $@ $(LIBOBJ) $$(cat $(DEVMANIFEST))

+$(BINDIR)/$(BINNAME): $(BINOBJ)
+	@printf "Linking    %-35s > %s\n" $(BINNAME) $@
+	mkdir -p $(BINDIR)
+	$(CXX) $(CXXFLAGS) $^ -o $@
+
 $(PKGDIR)/nccl.pc : nccl.pc.in
 	mkdir -p $(PKGDIR)
 	@printf "Generating %-35s > %s\n" $< $@
@@ -121,15 +135,17 @@ $(OBJDIR)/%.o : %.cc $(INCTARGETS)

 clean :
 	$(MAKE) -C device clean
-	rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR}
+	rm -rf ${BINDIR} ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR}

 install : build
 	mkdir -p $(PREFIX)/lib
 	mkdir -p $(PREFIX)/lib/pkgconfig
 	mkdir -p $(PREFIX)/include
+	mkdir -p $(PREFIX)/bin
 	cp -P -v $(BUILDDIR)/lib/lib* $(PREFIX)/lib/
 	cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/
 	cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
+	cp -v $(BUILDDIR)/bin/ncclras $(PREFIX)/bin/

 FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|gdrwrap.h|nccl.h')
 # Note that formatting.mk defines a new target so in order to not overwrite the default target,
@@ -13,6 +13,7 @@
 #include <sys/types.h>
 #include "proxy.h"
 #include "param.h"
+#include "ras.h"

 #define BOOTSTRAP_N_CHECK_ABORT           10000
 #define BOOTSTRAP_TAG_CONNECT             (0x1 << 31)
@@ -110,13 +111,13 @@ ncclResult_t bootstrapNetInit() {
        if (nIfs <= 0) {
          WARN("Bootstrap : no socket interface found");
          pthread_mutex_unlock(&bootstrapNetLock);
-          return ncclInternalError;
+          return ncclInvalidUsage;
        }
      }
      char line[SOCKET_NAME_MAXLEN+MAX_IF_NAME_SIZE+2];
-      sprintf(line, " %s:", bootstrapNetIfName);
+      snprintf(line, sizeof(line), " %s:", bootstrapNetIfName);
      ncclSocketToString(&bootstrapNetIfAddr, line+strlen(line));
-      INFO(NCCL_BOOTSTRAP, "Bootstrap : Using%s", line);
+      INFO(NCCL_BOOTSTRAP, "Bootstrap: Using%s", line);
      bootstrapNetInitDone = 1;
    }
    pthread_mutex_unlock(&bootstrapNetLock);
@@ -152,7 +153,7 @@ static ncclResult_t netIsend(ncclNet_t* net, void* sendComm, void* data, int siz
                             int* done) {
  if (*done) return ncclSuccess;
  if (!*sendReq) {
-    NCCLCHECK(net->isend(sendComm, data, size, tag, dataHandle, sendReq));
+    NCCLCHECK(net->isend(sendComm, data, (size_t)size, tag, dataHandle, sendReq));
  }
  if (*sendReq) {
    NCCLCHECK(net->test(*sendReq, done, NULL));
@@ -166,7 +167,8 @@ static ncclResult_t netIrecv(ncclNet_t* net, void* recvComm, void* data, int siz
                             int* done) {
  if (*done) return ncclSuccess;
  if (!*recvReq) {
-    NCCLCHECK(net->irecv(recvComm, 1, &data, &size, &tag, &dataHandle, recvReq));
+    size_t size64 = size; 
+    NCCLCHECK(net->irecv(recvComm, 1, &data, &size64, &tag, &dataHandle, recvReq));
  }
  if (*recvReq) {
    NCCLCHECK(net->test(*recvReq, done, NULL));
@@ -302,7 +304,7 @@ static void* bootstrapRoot(void* rargs) {
      // if the number of root > 1, we will receive one extra info from the first local_id of the next root
      n2send = nRankFromRoot(iroot, nranks, nroots);
      nrecv = n2send + ((nroots > 1) ? 1 : 0);
-      NCCLCHECKGOTO(ncclCalloc(&rankInfo, nrecv * sizeof(union ringConnectInfo)), res, out);
+      NCCLCHECKGOTO(ncclCalloc(&rankInfo, nrecv), res, out);
      NCCLCHECKGOTO(ncclCalloc(&rankAddressesRoot, nrecv), res, out);
    }

@@ -492,29 +494,37 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
        struct netIf userIfs[MAX_OOB_DEVS];
        int nUserIfs = parseStringList(userIfEnv, userIfs, MAX_OOB_DEVS);
        // loop over the device and return the first one matching
-        int devId = 0;
        int nDev = 0;
        NCCLCHECK(comm->ncclNet->devices(&nDev));
+        int devId = 0;
        while (devId < nDev) {
          ncclNetProperties_t props;
          comm->ncclNet->getProperties(devId, &props);
          // check against user specified HCAs/ports
-          bool found = matchIfList(props.name, props.port, userIfs, nUserIfs, searchExact) ^ searchNot;
-          if (found) {
+          if (matchIfList(props.name, props.port, userIfs, nUserIfs, searchExact) ^ searchNot) {
+            // All plain physical devices have been initialized at this point
            devOOB = devId;
            break;
          }
          devId++;
        }
        if (devOOB == -1) {
-          WARN("no device found matching NCCL_OOB_NET_IFNAME=%s, ignoring", userIfEnv);
-          goto noEnv;
+          if (!searchNot)
+            WARN("no device found matching %s%s, verify NCCL_OOB_NET_IFNAME", searchExact ? "exactly " : "", userIfEnv);
+          else
+            WARN("no device found after excluding %s%s, verify NCCL_OOB_NET_IFNAME", searchExact ? "exactly " : "", userIfEnv);
+          pthread_mutex_unlock(&bootstrapNetLock);
+          return ncclInvalidArgument;
        }
      } else {
-      noEnv:
        // default choice is device 0
        devOOB = 0;
      }
+      // display info on the chosen device
+      ncclNetProperties_t props;
+      ncclResult_t res = comm->ncclNet->getProperties(devOOB, &props);
+      bool hasProp = res == ncclSuccess;
+      INFO(NCCL_BOOTSTRAP, "Bootstrap: Using %s:%d", (hasProp) ? props.name : "N/A", (hasProp) ? props.port : -1);
    }
    pthread_mutex_unlock(&bootstrapNetLock);
  }
@@ -545,7 +555,8 @@ static ncclResult_t socketRingConnect(ncclSocketAddress* addr, struct ncclSocket
 }
 static ncclResult_t ringAllInfo(struct ncclComm* comm, struct bootstrapState* state,
                                union ncclSocketAddress* peerAddresss,
-                                union ncclSocketAddress* peerProxy, uint64_t* peerUDS) {
+                                union ncclSocketAddress* peerProxy, uint64_t* peerUDS,
+                                struct rasRankInit* rasRanks) {
  ncclResult_t res = ncclSuccess;
  int rank = comm->rank;
  int nRanks = comm->nRanks;
@@ -553,6 +564,7 @@ static ncclResult_t ringAllInfo(struct ncclComm* comm, struct bootstrapState* st
    union ncclSocketAddress peerAddress;
    union ncclSocketAddress peerProxy;
    uint64_t peerUDS;
+    struct rasRankInit rasRank;
  }* ringData = NULL;

  NCCLCHECK(ncclCalloc(&ringData, nRanks));
@@ -563,6 +575,8 @@ static ncclResult_t ringAllInfo(struct ncclComm* comm, struct bootstrapState* st
    memcpy(&(ringData[rank].peerProxy), peerProxy + rank, sizeof(union ncclSocketAddress));
  if (peerUDS)
    memcpy(&(ringData[rank].peerUDS), peerUDS + rank, sizeof(uint64_t));
+  if (rasRanks)
+    memcpy(&(ringData[rank].rasRank), rasRanks + rank, sizeof(*rasRanks));

  // allgather
  NCCLCHECKGOTO(bootstrapAllGather(state, ringData, sizeof(struct bootstrapRingData)), res, exit);
@@ -575,6 +589,8 @@ static ncclResult_t ringAllInfo(struct ncclComm* comm, struct bootstrapState* st
      memcpy(peerProxy + irank, &(ringData[irank].peerProxy), sizeof(union ncclSocketAddress));
    if (peerUDS)
      memcpy(peerUDS + irank, &(ringData[irank].peerUDS), sizeof(uint64_t));
+    if (rasRanks)
+      memcpy(rasRanks + irank, &(ringData[irank].rasRank), sizeof(*rasRanks));
  }

 exit:
@@ -598,7 +614,10 @@ fail:
 NCCL_PARAM(StaggerRate, "UID_STAGGER_RATE", 7000);
 NCCL_PARAM(StaggerThreshold, "UID_STAGGER_THRESHOLD", 256);

+NCCL_PARAM(RasEnable, "RAS_ENABLE", 1);
+
 ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
+  ncclResult_t result = ncclSuccess;
  int rank = comm->rank;
  int nranks = comm->nRanks;
  // char nextPeerHandle[NCCL_NET_HANDLE_MAXSIZE];
@@ -607,6 +626,8 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
  struct ncclSocket sock, listenSockRoot;
  struct extInfo info = {0};
  union ringConnectInfo nextPeer;
+  bool performRasAddRanks = true;
+  struct rasRankInit* rasRanks = nullptr;

  uint64_t timers[BOOTSTRAP_INIT_TIME_N] = {0};

@@ -696,23 +717,45 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
  // in case of failure, those resources will be free'd when calling bootstrapDestroy, so we can return immediatly
  NCCLCHECK(ncclCalloc(&state->peerProxyAddresses, nranks));
  NCCLCHECK(ncclCalloc(&proxySocket, 1));
-  NCCLCHECK(createListenSocket(comm, comm->magic, proxySocket, state->peerProxyAddresses + rank, ncclSocketTypeProxy));
+  NCCLCHECKGOTO(createListenSocket(comm, comm->magic, proxySocket, state->peerProxyAddresses + rank, ncclSocketTypeProxy), result, fail);

-  NCCLCHECK(ncclCalloc(&state->peerProxyAddressesUDS, nranks));
-  NCCLCHECK(getUDS(state->peerProxyAddressesUDS + rank));
+  NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddressesUDS, nranks), result, fail);
+  NCCLCHECKGOTO(getUDS(state->peerProxyAddressesUDS + rank), result, fail);

  // create a socket for others to reach out (P2P)
  union ncclSocketAddress peerSocketAddress;
-  NCCLCHECK(createListenSocket(comm, comm->magic, &STATE_LISTEN(state, peerSocket), &peerSocketAddress, ncclSocketTypeBootstrap));
-  NCCLCHECK(ncclCalloc(&state->peerP2pAddresses, nranks * sizeof(union ncclSocketAddress)));
+  NCCLCHECKGOTO(createListenSocket(comm, comm->magic, &STATE_LISTEN(state, peerSocket), &peerSocketAddress, ncclSocketTypeBootstrap), result, fail);
+  NCCLCHECKGOTO(ncclCalloc(&state->peerP2pAddresses, nranks), result, fail);
  memcpy(state->peerP2pAddresses + rank, &peerSocketAddress, sizeof(union ncclSocketAddress));

+  // Initialize RAS
+  if (ncclParamRasEnable() == 1) {
+    // The RAS thread will take care of freeing the memory allocated below.
+    NCCLCHECK(ncclCalloc(&rasRanks, nranks));
+    memcpy(&rasRanks[rank].addr, &bootstrapNetIfAddr, sizeof(rasRanks[rank].addr));
+    rasRanks[rank].pid = getpid();
+    rasRanks[rank].cudaDev = comm->cudaDev;
+    rasRanks[rank].nvmlDev = comm->nvmlDev;
+    if (ncclRasCommInit(comm, rasRanks+rank) != ncclSuccess) {
+      INFO(NCCL_INIT|NCCL_RAS, "Continuing in spite of a RAS initialization error");
+      // We should still participate in the ringAllInfo below as the peers will be waiting for us.
+      // Just make sure that the address is clearly invalid...
+      memset(rasRanks+rank, '\0', sizeof(*rasRanks));
+      performRasAddRanks = false;
+    }
+  }
+
  BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_TIME_RING]);
-  NCCLCHECK(ringAllInfo(comm, state, state->peerP2pAddresses, state->peerProxyAddresses, state->peerProxyAddressesUDS));
+  NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, state->peerProxyAddresses, state->peerProxyAddressesUDS, rasRanks), result, fail);
  BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_TIME_RING]);

  // Create the service proxy and get the UDS
-  NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS));
+  NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS), result, fail);
+
+  if (ncclParamRasEnable() == 1 && performRasAddRanks) {
+    if (ncclRasAddRanks(rasRanks, nranks) != ncclSuccess)
+      INFO(NCCL_INIT|NCCL_RAS, "Continuing in spite of a RAS initialization error");
+  }

  BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_TIME_TOTAL]);
  TRACE(NCCL_BOOTSTRAP, "rank %d nranks %d - DONE", rank, nranks);
@@ -722,8 +765,11 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
       timers[BOOTSTRAP_INIT_TIME_RECV] / 1e9,
       timers[BOOTSTRAP_INIT_TIME_RING] / 1e9,
       timers[BOOTSTRAP_INIT_TIME_DELAY] / 1e9);
-
-  return ncclSuccess;
+exit:
+  return result;
+fail:
+  free(proxySocket);
+  goto exit;
 }

 ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks) {
@@ -761,6 +807,11 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo
  union ncclSocketAddress peerSocketAddress;
  NCCLCHECK(createListenSocket(comm, comm->magic, &STATE_LISTEN(state, peerSocket), &peerSocketAddress, ncclSocketTypeBootstrap));

+  if (ncclParamRasEnable() == 1) {
+    if (ncclRasCommInit(comm, nullptr) != ncclSuccess)
+      INFO(NCCL_INIT|NCCL_RAS, "Continuing in spite of a RAS initialization error");
+  }
+
  // Get addr from next rank using the parent's connections
  NCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, BOOTSTRAP_TAG_COMMSPLIT, &info, sizeof(union ringConnectInfo)), ret, fail);
  NCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, BOOTSTRAP_TAG_COMMSPLIT, &nextPeer, sizeof(union ringConnectInfo)), ret, fail);
@@ -773,14 +824,14 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo
    NCCLCHECK(socketRingConnect(&nextPeer.addr, &STATE_RING(state, socket.send), &STATE_LISTEN(state, socket), &STATE_RING(state, socket.recv), comm->magic, state->abortFlag));
  }

-  NCCLCHECKGOTO(ncclCalloc(&state->peerP2pAddresses, nranks * sizeof(union ncclSocketAddress)), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&state->peerP2pAddresses, nranks), ret, fail);
  memcpy(state->peerP2pAddresses + rank, &peerSocketAddress, sizeof(union ncclSocketAddress));
  if (parent->config.splitShare) {
    /* map local rank to top parent local rank. */
    for (int i = 0; i < nranks; ++i) {
      comm->topParentRanks[i] = parent->topParentRanks[parentRanks[i]];
    }
-    NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, NULL, NULL), ret, fail);
+    NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, NULL, NULL, NULL), ret, fail);
  } else {
    NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddresses, nranks), ret, fail);
    NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddressesUDS, nranks), ret, fail);
@@ -788,7 +839,7 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo
    NCCLCHECKGOTO(ncclCalloc(&proxySocket, 1), ret, fail);
    NCCLCHECKGOTO(getUDS(state->peerProxyAddressesUDS + rank), ret, fail);
    NCCLCHECKGOTO(createListenSocket(comm, comm->magic, proxySocket, state->peerProxyAddresses + rank, ncclSocketTypeProxy), ret, fail);
-    NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, state->peerProxyAddresses, state->peerProxyAddressesUDS), ret, fail);
+    NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, state->peerProxyAddresses, state->peerProxyAddressesUDS, NULL), ret, fail);
    NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS), ret, fail);
  }

@@ -811,7 +862,7 @@ static ncclResult_t socketConnect(void* commState, int peer, int tag, struct ncc
  struct bootstrapState* state = (struct bootstrapState*)commState;

  struct socketAckInfo ack = (struct socketAckInfo){.rank = state->rank, .tag = tag};
-  NCCLCHECKGOTO(ncclSocketInit(sock, state->peerP2pAddresses + peer, state->magic, ncclSocketTypeBootstrap), ret, fail);
+  NCCLCHECKGOTO(ncclSocketInit(sock, state->peerP2pAddresses + peer, state->magic, ncclSocketTypeBootstrap, state->abortFlag), ret, fail);
  NCCLCHECKGOTO(ncclSocketConnect(sock), ret, fail);
  NCCLCHECKGOTO(socketSend(sock, &ack, sizeof(struct socketAckInfo)), ret, fail);
  return ncclSuccess;
@@ -44,9 +44,9 @@ const char* ncclDatatypeToString(ncclDataType_t type) {
  case ncclFloat16: return "ncclFloat16";
  case ncclFloat32: return "ncclFloat32";
  case ncclFloat64: return "ncclFloat64";
-#if defined(__CUDA_BF16_TYPES_EXIST__)
  case ncclBfloat16: return "ncclBfloat16";
-#endif
+  case ncclFloat8e4m3: return "ncclFloat8e4m3";
+  case ncclFloat8e5m2: return "ncclFloat8e5m2";
  default: return "Unknown";
  }
 }
@@ -87,8 +87,7 @@ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcoun
  struct ncclInfo info = { ncclFuncAllGather, "AllGather",
    sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
    ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
-  NCCLCHECK(ncclEnqueueCheck(&info));
-  return ncclSuccess;
+  return ncclEnqueueCheck(&info);
 }

 NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
@@ -111,8 +110,7 @@ ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
  struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
    sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
    ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
-  NCCLCHECK(ncclEnqueueCheck(&info));
-  return ncclSuccess;
+  return ncclEnqueueCheck(&info);
 }

 NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
@@ -133,16 +131,14 @@ ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, n
  struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
    sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
    BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
-  NCCLCHECK(ncclEnqueueCheck(&info));
-  return ncclSuccess;
+  return ncclEnqueueCheck(&info);
 }
 /* Deprecated original "in place" function, similar to MPI */
 NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream) {
-  NCCLCHECK(ncclBroadcast(buff, buff, count, datatype, root, comm, stream));
-  return ncclSuccess;
+  return ncclBroadcast(buff, buff, count, datatype, root, comm, stream);
 }

 NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
@@ -166,8 +162,7 @@ ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
  struct ncclInfo info = { ncclFuncReduce, "Reduce",
    sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
    REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
-  NCCLCHECK(ncclEnqueueCheck(&info));
-  return ncclSuccess;
+  return ncclEnqueueCheck(&info);
 }

 NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
@@ -189,8 +184,7 @@ ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recv
  struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
    sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
    REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
-  NCCLCHECK(ncclEnqueueCheck(&info));
-  return ncclSuccess;
+  return ncclEnqueueCheck(&info);
 }

 struct NvtxParamsSendRecv {
@@ -212,12 +206,7 @@ ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatyp
  struct ncclInfo info = { ncclFuncSend, "Send",
    NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
    1, 1 };
-  ncclResult_t ret;
-  NCCLCHECK(ncclGroupStart());
-  NCCLCHECKGOTO(ncclEnqueueCheck(&info), ret, exit);
-exit:
-  NCCLCHECK(ncclGroupEnd());
-  return ret;
+  return ncclEnqueueCheck(&info);
 }

 NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
@@ -230,10 +219,5 @@ ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int
  struct ncclInfo info = { ncclFuncRecv, "Recv",
    NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
    1, 1 };
-  ncclResult_t ret;
-  NCCLCHECK(ncclGroupStart());
-  NCCLCHECKGOTO(ncclEnqueueCheck(&info), ret, exit);
-exit:
-  NCCLCHECK(ncclGroupEnd());
-  return ret;
+  return ncclEnqueueCheck(&info);
 }
@@ -8,6 +8,7 @@
 #include "nccl_net.h"
 #include <stdlib.h>
 #include <stdarg.h>
+#include <stdio.h>
 #include <string.h>
 #include <strings.h>
 #include <sys/syscall.h>
@@ -89,6 +90,8 @@ static void ncclDebugInit() {
        mask = NCCL_REG;
      } else if (strcasecmp(subsys, "PROFILE") == 0) {
        mask = NCCL_PROFILE;
+      } else if (strcasecmp(subsys, "RAS") == 0) {
+        mask = NCCL_RAS;
      } else if (strcasecmp(subsys, "ALL") == 0) {
        mask = NCCL_ALL;
      }
@@ -224,6 +227,19 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
  }
 }

+NCCL_API(void, ncclResetDebugInit);
+void ncclResetDebugInit() {
+  // Undoes the state left by a previous ncclDebugInit() and runs it again.
+  // Call this after changing NCCL_DEBUG and related parameters in the environment.
+  __atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE); // value is discarded; NOTE(review): presumably kept only for its acquire ordering - confirm
+  if (ncclDebugFile != stdout) { // stdout itself is never closed here
+    fclose(ncclDebugFile);
+    ncclDebugFile = stdout;
+  }
+  ncclDebugLevel = -1; // NOTE(review): -1 presumably marks "not initialized" so that ncclDebugInit() redoes its setup - confirm
+  ncclDebugInit();
+}
+
 NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0);

 void ncclSetThreadName(pthread_t thread, const char *fmt, ...) {
@@ -9,64 +9,88 @@
 #include "primitives.h"

 namespace {
-  template<typename T, typename RedOp, typename Proto>
+  template<typename T, typename RedOp, typename Proto, bool isNetOffload = false>
  __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
    ncclRing *ring = &ncclShmem.channel.ring;
    const int *ringRanks = ring->userRanks;
    const int nranks = ncclShmem.comm.nRanks;
-    size_t count, partOffset, partCount, chunkCount;
+    ssize_t count, partOffset, partCount, chunkCount;
    ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &partOffset, &partCount, &chunkCount);
-    size_t offset;
-    size_t dataOffset;
+    ssize_t offset;
+    ssize_t dataOffset;
    int nelem;
    int rankDest;
-
+    int workNthreads;
    T *inputBuf = (T*)work->sendbuff;
    T *outputBuf = (T*)work->recvbuff;
-    // Coverity reports that the callee treats &ring->next as an array.  However, due to the use of
-    // FanSymmetric<1>, only the first element is ever accessed, so it's fine.
-    // coverity[callee_ptr_arith:FALSE]
-    Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0> prims
-      (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work);

-    for (size_t elemOffset = 0; elemOffset < partCount; elemOffset += chunkCount) {
-      /////////////// begin AllGather steps ///////////////
-      nelem = min(chunkCount, partCount - elemOffset);
-      dataOffset = partOffset + elemOffset;
+    // If isNetOffload == true, we only use 1 warp to drive Ring algo/network communication
+    // and the rest of warps proceed to copy src data into dst buffer in parallel when AG
+    // is not in-place.
+    if (isNetOffload) {
+      workNthreads = WARP_SIZE;
+      chunkCount = NCCL_MAX_NET_SIZE;
+    } else {
+      workNthreads = nthreads;
+    }

-      // step 0: push data to next GPU
-      rankDest = ringRanks[0];
-      offset = dataOffset + rankDest * count;
+    if (tid < workNthreads) {
+      // Coverity reports that the callee treats &ring->next as an array.  However, due to the use of
+      // FanSymmetric<1>, only the first element is ever accessed, so it's fine.
+      // coverity[callee_ptr_arith:FALSE]
+      Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0, isNetOffload> prims
+        (tid, workNthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work, NULL, isNetOffload ? NCCL_MAX_NET_SIZE : 0);
+      for (size_t elemOffset = 0; elemOffset < partCount; elemOffset += chunkCount) {
+        /////////////// begin AllGather steps ///////////////
+        nelem = min(chunkCount, partCount - elemOffset);
+        dataOffset = partOffset + elemOffset;

-      if (inputBuf + dataOffset == outputBuf + offset) { // In place
-        prims.directSend(dataOffset, offset, nelem);
-      } else {
-        prims.directCopySend(dataOffset, offset, nelem);
-      }
-
-      // k-2 steps: copy to next GPU
-      for (int j=1; j<nranks-1; ++j) {
-        rankDest = ringRanks[nranks-j];
+        // step 0: push data to next GPU
+        rankDest = ringRanks[0];
        offset = dataOffset + rankDest * count;

-        prims.directRecvCopyDirectSend(offset, nelem);
+        if ((inputBuf + dataOffset == outputBuf + offset) || isNetOffload) { // In place or onePPN
+          prims.directSend(dataOffset, offset, nelem);
+        } else {
+          prims.directCopySend(dataOffset, offset, nelem);
+        }
+
+        // k-2 steps: copy to next GPU
+        for (int j = 1; j < nranks - 1; ++j) {
+          rankDest = ringRanks[nranks - j];
+          offset = dataOffset + rankDest * count;
+          prims.directRecvCopyDirectSend(offset, offset, nelem);
+        }
+
+        // Make final copy from buffer to dest.
+        rankDest = ringRanks[1];
+        offset = dataOffset + rankDest * count;
+
+        // Final wait/copy.
+        prims.directRecv(offset, offset, nelem);
      }
-
-      // Make final copy from buffer to dest.
-      rankDest = ringRanks[1];
-      offset = dataOffset + rankDest * count;
-
-      // Final wait/copy.
-      prims.directRecv(offset, offset, nelem);
+    } else if (inputBuf != outputBuf + ringRanks[0] * count) {
+      inputBuf = inputBuf + partOffset;
+      outputBuf = outputBuf + partOffset + ringRanks[0] * count;
+      reduceCopy<COLL_UNROLL, RedOp, T, 0, 1, 1, 0, 1, 1, /*PreOpSrcs=*/0>
+        (tid - workNthreads, nthreads - workNthreads, work->redOpArg, &work->redOpArg, false, 1, (void**)&inputBuf, 1, (void**)&outputBuf, partCount);
    }
+    // We have to wait for all warps before we can proceed to the next work;
+    // otherwise, we can have contention if the next work uses the outputBuf
+    // of this work. We use barrier 14 to avoid conflicts with the prims
+    // barriers and __syncthreads().
+    if (isNetOffload) barrier_sync(14, nthreads);
  }
 }

 template<typename T, typename RedOp>
 struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
-    using Proto = ProtoSimple<ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS>;
-    runRing<T, RedOp, Proto>(tid, nthreads, work);
+    bool isNetOffload = work->isOneRPN && work->netRegUsed;
+    if (isNetOffload)
+      runRing<T, RedOp, ProtoSimple<1, 1>, true>(tid, nthreads, work);
+    else
+      runRing<T, RedOp, ProtoSimple<ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS>, false>(tid, nthreads, work);
  }
 };

@@ -96,7 +120,7 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE
    T *inputBuf = (T*)work->sendbuff;
    T *outputBuf = (T*)work->recvbuff;
    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
-      (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, false, false, 0, primsModePatAg);
+      (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, nullptr, 0, primsModePatAg);

    PatAGAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
    int last = 0;
@@ -137,6 +161,7 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
          nelem = min(chunkCount, channelCount - elemOffset);
          prims.gather(offset, nvls->nHeads * count, nelem, count, -1, 0);
        }
+        // coverity[overrun-call] => Coverity thinks prims.index can be greater than 1
      } else if (tid < tidEndBcast) {
        // Bcast through NVLS
        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
@@ -148,6 +173,7 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
          nelem = min(chunkCount, channelCount - elemOffset);
          prims.send(offset, nelem);
        }
+        // coverity[overrun-call] => Coverity thinks prims.index can be greater than 1
      }
    } else {
      /* direct allgather */
@@ -204,11 +230,11 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
      int part = ncclShmem.channelId - work->channelLo;
      char* inbuf = (char*)work->sendbuff;
      char* outbuf = (char*)work->recvbuff;
-      ssize_t sizePerRank = work->collnet.count*sizeof(T);
-      bool inPlace = (inbuf == outbuf + ncclShmem.comm.rank*sizePerRank);
+      ssize_t countPerRank = work->collnet.count*sizeof(T);
+      bool inPlace = (inbuf == outbuf + ncclShmem.comm.rank*countPerRank);

-      ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*sizePerRank);
-      ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank);
+      ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*countPerRank);
+      ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*countPerRank);
      int railAllSize = railAllEnd - railAllBeg;
      if (tid < nDsts) dstSizes[tid] = railAllSize;

@@ -221,15 +247,15 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
        if (rail == nRails) rail = 0;
      }
      do {
-        int node = railAllBeg/sizePerRank;
+        int node = railAllBeg/countPerRank;
        int railAllOffset = 0;
        while (railAllOffset < railAllSize) {
-          ssize_t railOneBeg = node*sizePerRank;
-          ssize_t railOneEnd = railOneBeg + sizePerRank;
+          ssize_t railOneBeg = node*countPerRank;
+          ssize_t railOneEnd = railOneBeg + countPerRank;
          ssize_t railOneOffset = (railAllBeg+railAllOffset) - railOneBeg;
          int delta = min(railAllEnd, railOneEnd) - (railAllBeg+railAllOffset);
          int rank = ncclShmem.comm.collNetDenseToUserRank[node*nRails + rail];
-          ssize_t userOneBeg = rank*sizePerRank + railOneOffset;
+          ssize_t userOneBeg = rank*countPerRank + railOneOffset;
          int outIsDst = (inPlace && rank == ncclShmem.comm.rank) ? 0 : 1;
          if (nSrcs != 0 && outIsDst+nDsts != 0) {
            reduceCopy<ncclCollUnroll(), RedOp, T,
@@ -238,11 +264,11 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
                     /*PreOpSrcs=*/0>
            (tid, tn, 0, nullptr, false,
             /*nSrcs=*/1, [=]__device__(int s/*==0*/) -> void* {
-               return work->regUsed && (recvDirectFlag & NCCL_DIRECT_READ) ? (char*)srcPtrs[src] + userOneBeg : (char*)srcPtrs[src] + railAllOffset;
+               return work->regUsed && (recvDirectFlag & NCCL_P2P_READ) ? (char*)srcPtrs[src] + userOneBeg : (char*)srcPtrs[src] + railAllOffset;
             },
             /*nDsts=*/outIsDst+nDsts, [=]__device__(int d) -> void* {
               return d < outIsDst ? outbuf + userOneBeg
-                                   : work->regUsed && (sendDirectFlag & NCCL_DIRECT_WRITE) ? (char*)dstPtrs[d-outIsDst] + userOneBeg
+                                   : work->regUsed && (sendDirectFlag & NCCL_P2P_WRITE) ? (char*)dstPtrs[d-outIsDst] + userOneBeg
                                   : (char*)dstPtrs[d-outIsDst] + railAllOffset;
             },
             delta);
@@ -262,8 +288,9 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
    const int nChannels = work->channelHi - work->channelLo + 1;
    struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
    int const &nNodes = ncclShmem.comm.nNodes;
-    ssize_t sizePerRank = work->collnet.count*sizeof(T);
+    ssize_t countPerRank = work->collnet.count;
    size_t chunkSize = work->collnet.chunkCount;
+    const int hasDn = (direct->down[0] >= 0) ? 1 : 0;
    bool isMultiRail = (direct->nHeads > 1);
    int nWarps1 = 1;
    int nWarps2 = (isMultiRail ? 2 : 1);
@@ -277,9 +304,12 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P

    int tn = nWarps1*WARP_SIZE;
    if (tid < tn) {
-      if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
+      if (work->netRegUsed) {
        if (tid == 0) {
-          int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
+          // If this rank has local peers (i.e., hasDn == true), we cannot offload all data to network.
+          // In this case, steps should be computed based on chunkSize and so on; otherwise, we just
+          // bump the step by 1 to kick off collnet progress.
+          int steps = hasDn ? (int)divUp(nNodes * countPerRank, nChannels * chunkSize) : 1;
          Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps);
        }
        __syncwarp();
@@ -288,11 +318,11 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
        Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
          prims(tid, tn, nullptr, &direct->out, work->sendbuff, nullptr,
            /*redOpArg=*/0, 0 * Proto::MaxGroupWidth, 1, 1);
-        for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
+        for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkSize) {
          ssize_t railAllBeg = railGridOffset + part * chunkSize;
-          ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * sizePerRank);
-          ssize_t railOneBeg = ncclShmem.comm.node * sizePerRank;
-          ssize_t railOneEnd = railOneBeg + sizePerRank;
+          ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * countPerRank);
+          ssize_t railOneBeg = ncclShmem.comm.node * countPerRank;
+          ssize_t railOneEnd = railOneBeg + countPerRank;
          ssize_t beg = max(railAllBeg, railOneBeg);
          ssize_t end = min(railAllEnd, railOneEnd);
          prims.send(beg - railOneBeg, max(ssize_t(0), end - beg));
@@ -304,10 +334,9 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P

    tn = nWarps2*WARP_SIZE;
    if (tid < tn) {
-      if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
+      if (work->netRegUsed && !hasDn) {
        if (tid == 0) {
-          int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
-          Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps);
+          Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, 1);
        }
        __syncwarp();
      } else {
@@ -315,7 +344,7 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
        Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
          prims(tid, tn, &direct->out, direct->heads + 1, nullptr, work->recvbuff,
            /*redOpArg=*/0, 1 * Proto::MaxGroupWidth, 0, 0, work);
-        for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
+        for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkSize) {
          Scatterer</*BcastSendNotRecv=*/true> scat;
          scat.work = work;
          scat.chunkSize = chunkSize;
@@ -333,7 +362,7 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/1, Proto, 0>
        prims(tid, tn, direct->heads+1, nullptr, nullptr, work->recvbuff,
              /*redOpArg=*/0, 2*Proto::MaxGroupWidth, 0, 0, work);
-      for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
+      for (ssize_t railGridOffset=0; railGridOffset < nNodes*countPerRank; railGridOffset += nChannels*chunkSize) {
        Scatterer</*BcastSendNotRecv=*/false> scat;
        scat.work = work;
        scat.chunkSize = chunkSize;
@@ -69,7 +69,7 @@ namespace {
        chunkOffset = chunk * chunkCount;
        offset = gridOffset + elemOffset + chunkOffset;
        nelem = (int)min(chunkCount, remCount - chunkOffset);
-        prims.directRecvCopyDirectSend(offset, nelem);
+        prims.directRecvCopyDirectSend(offset, offset, nelem);
      }

      // Make final copy from buffer to dest.
@@ -139,7 +139,7 @@ namespace {
        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
          offset = gridOffset + elemOffset;
          nelem = min(chunkCount, channelCount - elemOffset);
-          prims.directRecvCopyDirectSend(offset, nelem);
+          prims.directRecvCopyDirectSend(offset, offset, nelem);
        }
      }
    }
@@ -222,7 +222,7 @@ namespace {
        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
          offset = gridOffset + elemOffset;
          nelem = min(chunkCount, channelCount - elemOffset);
-          prims.directRecvCopyDirectSend(offset, nelem);
+          prims.directRecvCopyDirectSend(offset, offset, nelem);
        }
      }
    }
@@ -268,22 +268,30 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
    const int tidStartBcast = nThreadsGather;
    const int tidStartScatter = tidStartBcast + nThreadsBcast;
    const int tidStartReduce = tidStartScatter + nThreadsScatter;
-
    using Proto = ProtoSimple<1, 1>;

    if (tid >= tidStartScatter && tid < tidStartReduce && hasUp) {
      // Scatter
-      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
+      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
        prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, work->sendbuff, work->recvbuff,
-           work->redOpArg, 2*Proto::MaxGroupWidth, 1, 1);
+           work->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, work);
+      ssize_t offsetBase, peerOffset;
+      ssize_t maxNelems;
+      if (work->netRegUsed) {
+        offsetBase = bid * chunkSize;
+        maxNelems = size;  // size >= size - offset, so this bound never limits the min() below
+        peerOffset = nChannels * chunkSize;
+      } else {
+        offsetBase = bid * direct->nHeads * chunkSize;
+        maxNelems = direct->nHeads * chunkSize;
+        peerOffset = chunkSize;
+      }
+      // For the collnet user-buffer (UB) case, buffers are organized differently so that buffer access
+      // is contiguous across channels. This access pattern should stay consistent with the code in coll_net.cc
      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
-        int nelem = min(direct->nHeads*chunkSize, size-offset);
-        if (work->regUsed) {
-          prims.directScatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
-        } else {
-          prims.scatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
-        }
+        ssize_t offset = gridOffset + offsetBase;
+        ssize_t nelem = min(maxNelems, size - offset);
+        prims.scatter(offset, nelem, chunkSize, peerOffset, direct->headRank, direct->shift);
      }
      // Coverity complains about a possible overrun inside the destructor of "prims", but that's actually
      // a false positive.
@@ -291,24 +299,20 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
    } else if (tid >= tidStartReduce && direct->out != -1) {
      if (hasDn) {
        // Reduce, send to network
-        Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
+        Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/1, Proto, 0>
          prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, work->sendbuff, work->recvbuff,
-             work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
+             work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, work);
        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
-          int nelem = min(chunkSize, size-offset);
-          if (work->regUsed) {
-            prims.directRecvReduceSend(offset, nelem);
-          } else {
-            prims.recvReduceSend(offset, nelem);
-          }
+          ssize_t offset = work->netRegUsed ? gridOffset + (bid + direct->headRank * nChannels) * chunkSize
+                                    : gridOffset + (bid * direct->nHeads + direct->headRank) * chunkSize;
+          int nelem = min(chunkSize, size - offset);
+          prims.recvReduceDirectSend(offset, offset, nelem);
        }
      } else {
        // Directly send to network
-        if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
+        if (work->netRegUsed) {
          if (tid == tidStartReduce) {
-            int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
-            Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps);
+            Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, 1);
          }
          __syncwarp();
        } else {
@@ -316,8 +320,8 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
          prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, work->sendbuff, work->recvbuff,
             work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-            ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
-            int nelem = min(chunkSize, size-offset);
+            ssize_t offset = gridOffset + (bid * direct->nHeads + direct->headRank) * chunkSize;
+            int nelem = min(chunkSize, size - offset);
            prims.send(offset, nelem);
          }
        }
@@ -327,10 +331,21 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/1, Proto, 0>
        prims(tid, nThreadsGather, direct->up, NULL, work->sendbuff, work->recvbuff,
           work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, work);
+      ssize_t offsetBase, peerOffset;
+      ssize_t maxNelems;
+      if (work->netRegUsed) {
+        offsetBase = bid * chunkSize;
+        maxNelems = size;  // size >= size - offset, so this bound never limits the min() below
+        peerOffset = nChannels * chunkSize;
+      } else {
+        offsetBase = bid * direct->nHeads * chunkSize;
+        maxNelems = direct->nHeads * chunkSize;
+        peerOffset = chunkSize;
+      }
      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
-        int nelem = min(direct->nHeads*chunkSize, size-offset);
-        prims.directGather(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
+        ssize_t offset = gridOffset + offsetBase;
+        ssize_t nelem = min(maxNelems, size - offset);
+        prims.directGather(offset, nelem, chunkSize, peerOffset, direct->headRank, direct->shift);
      }
    } else if (tid >= tidStartBcast && tid < tidStartScatter && direct->out != -1) {
      if (hasDn) {
@@ -342,15 +357,15 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
          prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, work->sendbuff, work->recvbuff,
             work->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, work);
        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
-          int nelem = min(chunkSize, size-offset);
-          prims.recvCopyDirectSend(offset, nelem, /*postOp=*/true);
+          ssize_t offset = work->netRegUsed ? gridOffset + (bid + direct->headRank * nChannels) * chunkSize
+                                            : gridOffset + (bid * direct->nHeads + direct->headRank) * chunkSize;
+          int nelem = min(chunkSize, size - offset);
+          prims.directRecvCopyDirectSend(offset, offset, nelem, /*postOp=*/true);
        }
      } else {
-        if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
+        if (work->netRegUsed) {
          if (tid == tidStartBcast) {
-            int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
-            Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps);
+            Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, 1);
          }
          __syncwarp();
        } else {
@@ -394,8 +409,6 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
      ssize_t gridOffset, channelCount, chunkSize;
      ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkSize);
      const ssize_t loopCount = nvls->nHeads * chunkSize;
-      ssize_t offset;
-      int nelem;
      int remCount = channelCount%(nvls->nHeads*chunkSize);
      int lastChunkSize = alignUp(divUp(remCount, nvls->nHeads), 16384/sizeof(T));

@@ -407,8 +420,8 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
            work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
        for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
          if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize;
-          offset = gridOffset + elemOffset;
-          nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
+          ssize_t offset = gridOffset + elemOffset;
+          int nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
          prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
        }
      } else if (tid < tidEndGather) {
@@ -419,8 +432,8 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
            work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
        for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
          if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize;
-          offset = gridOffset + elemOffset;
-          nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
+          ssize_t offset = gridOffset + elemOffset;
+          int nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
          prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
        }
      } else if (tid < tidEndReduce) {
@@ -430,7 +443,8 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
          prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
            work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work);
        for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
-          ssize_t chunkOffset;
+          ssize_t chunkOffset, offset;
+          int nelem;
          if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize;
          chunkOffset = elemOffset + nvls->headRank * chunkSize;
          offset = gridOffset + chunkOffset;
@@ -456,6 +470,7 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
          int nelem = work->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
          prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
        }
+        // coverity[overrun-call] => Coverity thinks prims.index can be greater than 1
      } else if (tid < tidEndGather) {
        // Gather
        using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
@@ -464,38 +479,23 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
            work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
          ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
-          int nelem = work->regUsed ? 0 :min(nvls->nHeads * chunkSize, size - offset);
+          int nelem = work->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
          prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
        }
      } else if (tid < tidEndReduce && nvls->headRank != -1) {
-        if (!hasOut) {
-          // Reduce, broadcast through NVLS
-          using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
-          // Coverity complains about a possible overrun inside the class below, but that's actually
-          // a false positive.
-          // coverity[identity_transfer:FALSE]
-          Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
-            prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
-              work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work);
-          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-            ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
-            int nelem = min(chunkSize, size - offset);
-            prims.directRecvDirectSend(offset, offset, nelem);
-          }
-        } else {
-          // Reduce, send to network
-          using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
-          // Coverity complains about a possible overrun inside the class below, but that's actually
-          // a false positive.
-          // coverity[identity_transfer:FALSE]
-          Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
-            prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL,
-              work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, work);
-          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-            ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
-            int nelem = min(chunkSize, size - offset);
-            prims.directRecvDirectSend(offset, offset, nelem);
-          }
+        // Reduce, send to network
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
+        // Coverity complains about a possible overrun inside the class below, but that's actually
+        // a false positive.
+        // coverity[identity_transfer:FALSE]
+        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
+        prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, work->recvbuff,
+          work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, work);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          ssize_t offset = work->regUsed && work->netRegUsed ? gridOffset + (nvls->headRank * nChannels + bid) * chunkSize
+                                                             : gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
+          int nelem = min(chunkSize, size - offset);
+          prims.directRecvDirectSend(offset, offset, nelem);
        }
      } else if (tid < tidEndBcast && nvls->headRank != -1) {
        // Recv from network, broadcast
@@ -504,10 +504,11 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
        // a false positive.
        // coverity[identity_transfer:FALSE]
        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
-          prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL,
+          prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, work->recvbuff,
            work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work);
        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
+          ssize_t offset = work->regUsed && work->netRegUsed ? gridOffset + (nvls->headRank * nChannels + bid) * chunkSize
+                                                             : gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
          int nelem = min(chunkSize, size - offset);
          prims.directRecvDirectSend(offset, offset, nelem);
        }
@@ -660,10 +661,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PR

    if (tid < nthreadsSplit) {
      if (recv == -1) {
-        if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
+        if (work->netRegUsed) {
          if (groupTid == 0) {
-            int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
-            Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>::sendPeerNotify(send, connIndex, steps);
+            Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>::sendPeerNotify(send, connIndex, 1);
          }
          __syncwarp();
        } else {
@@ -673,8 +673,10 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PR
          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
            ssize_t offset = gridOffset + bid * int(chunkSize);
            int nelem = min(chunkSize, size - offset);
+            // coverity[overrun-call] => Coverity thinks prims.index can be greater than 1
            prims.directSend(offset, offset, nelem);
          }
+          // coverity[overrun-call] => Coverity thinks prims.index can be greater than 1
        }
      } else {
        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
@@ -683,18 +685,19 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PR
        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
          ssize_t offset = gridOffset + bid * int(chunkSize);
          int nelem = min(chunkSize, size - offset);
+          // coverity[overrun-call] => Coverity thinks prims.index can be greater than 1
          prims.directRecvReduceDirectSend(offset, offset, nelem);
        }
+        // coverity[overrun-call] => Coverity thinks prims.index can be greater than 1
      }
    }
    else {
      if (recv == nranks) {
        // I'm the first in the broadcast chain, I need to perform the division (postOp)
        if (send == -1) {
-          if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
+          if (work->netRegUsed) {
            if (groupTid == 0) {
-              int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
-              Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>::recvPeerNotify(recv, connIndex, steps);
+              Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>::recvPeerNotify(recv, connIndex, 1);
            }
            __syncwarp();
          } else {
@@ -720,7 +723,7 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PR
          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
            ssize_t offset = gridOffset + bid * int(chunkSize);
            int nelem = min(chunkSize, size - offset);
-            prims.directRecvCopyDirectSend(offset, nelem, /*postOp*/true);
+            prims.directRecvCopyDirectSend(offset, offset, nelem, /*postOp*/true);
          }
        }
      } else {
@@ -740,7 +743,7 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PR
          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
            ssize_t offset = gridOffset + bid*int(chunkSize);
            int nelem = min(chunkSize, size-offset);
-            prims.directRecvCopyDirectSend(offset, nelem);
+            prims.directRecvCopyDirectSend(offset, offset, nelem);
          }
        }
      }
@@ -15,37 +15,49 @@ namespace {
    const int rank = ring->userRanks[0];
    const int nextRank = ring->userRanks[1];
    const int root = work->root;
-    size_t chunkCount;
-    size_t channelCount;
-    size_t gridOffset;
-    ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
+    ssize_t chunkCount;
+    ssize_t channelCount;
+    ssize_t gridOffset;
+    ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
    size_t offset;
    int nelem;
+    int workNthreads;
+    bool isNetOffload = work->isOneRPN && work->netRegUsed;

    T *inputBuf = (T*)work->sendbuff;
    T *outputBuf = (T*)work->recvbuff;
-    // Coverity reports that the callee treats &ring->next as an array.  However, due to the use of
-    // FanSymmetric<1>, only the first element is ever accessed, so it's fine.
-    // coverity[callee_ptr_arith:FALSE]
-    Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0>
-      prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work);
+    workNthreads = isNetOffload ? WARP_SIZE : nthreads;

-    for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
-      offset = gridOffset + elemOffset;
-      nelem = min(chunkCount, channelCount - elemOffset);
+    if (tid < workNthreads) {
+      // Coverity reports that the callee treats &ring->next as an array.  However, due to the use of
+      // FanSymmetric<1>, only the first element is ever accessed, so it's fine.
+      // coverity[callee_ptr_arith:FALSE]
+      Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0>
+        prims(tid, workNthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work);

-      if (rank == root) {
-        if (inputBuf == outputBuf) {
-          prims.directSend(offset, offset, nelem);
+      for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
+        offset = gridOffset + elemOffset;
+        nelem = min(chunkCount, channelCount - elemOffset);
+
+        if (rank == root) {
+          if (inputBuf == outputBuf || isNetOffload) {
+            prims.directSend(offset, offset, nelem);
+          } else {
+            prims.directCopySend(offset, offset, nelem);
+          }
+        } else if (nextRank == root) {
+          prims.directRecv(offset, offset, nelem);
        } else {
-          prims.directCopySend(offset, offset, nelem);
+          prims.directRecvCopyDirectSend(offset, offset, nelem);
        }
-      } else if (nextRank == root) {
-        prims.directRecv(offset, offset, nelem);
-      } else {
-        prims.directRecvCopyDirectSend(offset, nelem);
      }
+    } else if (inputBuf != outputBuf && rank == root) {
+      inputBuf = inputBuf + gridOffset;
+      outputBuf = outputBuf + gridOffset;
+      reduceCopy<COLL_UNROLL, RedOp, T, 0, 1, 1, 0, 1, 1, /*PreOpSrcs=*/0>
+        (tid - workNthreads, nthreads - workNthreads, work->redOpArg, &work->redOpArg, false, 1, (void**)&inputBuf, 1, (void**)&outputBuf, channelCount);
    }
+    if (isNetOffload) barrier_sync(14, nthreads);
  }
 }

@@ -396,6 +396,9 @@ __device__ void ncclDevFunc_Nop();
    ncclKernelMain<specializedFnId, RunWorkBatch<coll, ty, redop<ty>, algo, proto>>(&args4K.args); \
  }

+#define DEFINE_ncclDevKernel_nop(suffix, coll, redop, ty, algo, proto, specializedFnId) \
+  __global__ void ncclDevKernel_##suffix(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {}
+
 #define DEFINE_ncclDevFunc(suffix, coll, redop, ty, algo, proto) \
  __device__ void ncclDevFunc_##suffix() { \
    RunWorkBatch<coll, ty, redop<ty>, algo, proto>().run(); \
@@ -65,19 +65,23 @@ __device__ __forceinline__ void reduceCopyPacks(
  uintptr_t minSrcs[MinSrcs + !MinSrcs];
  uintptr_t minDsts[MinDsts + !MinDsts];
  #pragma unroll
-  for (int s=0; s < MinSrcs; s++)
+  for (int s=0; s < MinSrcs; s++) {
    minSrcs[s] = cvta_to_global(srcPtrFn(s)) + threadBytesBehind;
+  }
+
  #pragma unroll
-  for (int d=0; d < MinDsts; d++)
+  for (int d=0; d < MinDsts; d++) {
    // Yes, for some template arguments this code will be unreachable.  That's fine.
    // coverity[dead_error_line]
    minDsts[d] = cvta_to_global(dstPtrFn(d)) + threadBytesBehind;
+  }

  // We dictate loop termination condition according to whether partial hunks
  // can be handled or not.
  while (Unroll==1 ? (BytePerPack <= threadBytesAhead) : (0 < nHunksAhead)) {
    BytePack<BytePerPack> acc[Unroll];

+    // minSrcs[0] cannot be nullptr so we always process it
    { RedFn preFn(0 < PreOpSrcs ? preOpArgs[0] : 0);
      #pragma unroll Unroll
      for (int u=0; u < Unroll; u++) {
@@ -163,7 +167,8 @@ __device__ __forceinline__ void reduceCopyPacks(
      }
    }
    for (int d=MinDsts; (MinDsts < MaxDsts) && (d < MaxDsts) && (d < nDsts); d++) {
-      uintptr_t dst = cvta_to_global(dstPtrFn(d)) + threadBytesBehind;
+      uintptr_t dstPtr = cvta_to_global(dstPtrFn(d));
+      uintptr_t dst = dstPtr + threadBytesBehind;
      #pragma unroll Unroll
      for (int u=0; u < Unroll; u++) {
        st_global<BytePerPack>(dst, acc[u]);
@@ -173,11 +178,15 @@ __device__ __forceinline__ void reduceCopyPacks(

    nWarps = nThreads/WARP_SIZE;
    #pragma unroll
-    for (int s=0; s < MinSrcs; s++) minSrcs[s] += (nWarps-1)*BytePerHunk;
+    for (int s=0; s < MinSrcs; s++) {
+      minSrcs[s] += (nWarps-1)*BytePerHunk;
+    }
    #pragma unroll
    // Yes, for some template arguments this code will be unreachable.  That's fine.
    // coverity[dead_error_line]
-    for (int d=0; d < MinDsts; d++) minDsts[d] += (nWarps-1)*BytePerHunk;
+    for (int d=0; d < MinDsts; d++) {
+      minDsts[d] += (nWarps-1)*BytePerHunk;
+    }
    threadBytesBehind += nWarps*BytePerHunk;
    threadBytesAhead -= nWarps*BytePerHunk;
    nHunksAhead -= nWarps;
@@ -5,7 +5,7 @@ import sys
 # Order of redops, tys, protos, algos must match src/include/device.h
 all_colls =  ["Broadcast","Reduce","AllGather","ReduceScatter","AllReduce","SendRecv"]
 all_redops = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"]
-all_tys =    ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16"]
+all_tys =    ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16","f8e4m3","f8e5m2"]
 all_protos = ["LL","LL128","SIMPLE"]
 all_algos =  ["TREE","RING","COLLNET_DIRECT","COLLNET_CHAIN","NVLS","NVLS_TREE","PAT"]

@@ -107,6 +107,9 @@ def required_cuda(coll, redop, ty, algo, proto):
  if coll in ("AllReduce","Reduce","ReduceScatter"):
    if redop=="SumPostDiv" and ty[0] not in ("i","u"): return None
    if ty=="bf16": cudart = max(cudart, 11000)
+    if ty.startswith("f8"):
+      cudart = max(cudart, 11080)
+      arch = max(arch, 900)

  if "NVLS" in algo:
    if coll in ("AllReduce","Reduce","ReduceScatter"):
@@ -125,7 +128,7 @@ def required_cuda(coll, redop, ty, algo, proto):
 def equivalent_primary(coll, redop, ty, algo, proto):
  if coll in ("AllReduce", "Reduce", "ReduceScatter"):
    # map signed integer sum/prod to unsigned
-    if redop in ("Sum","Prod","PreMulSum") and ty[0]=="i":
+    if redop in ("Sum","Prod","PreMulSum","SumPostDiv") and ty[0]=="i":
      return (coll, redop, "u"+ty[1:], algo, proto)
    # map signed integer min/max to unsigned for non-NVLS
    if redop=="MinMax" and ty[0]=="i" and ("NVLS" not in algo):
@@ -365,7 +368,9 @@ ty_to_cxx = {
  "f16": "half",
  "f32": "float",
  "f64": "double",
-  "bf16": "__nv_bfloat16"
+  "bf16": "__nv_bfloat16",
+  "f8e4m3": "__nv_fp8_e4m3",
+  "f8e5m2": "__nv_fp8_e5m2"
 }

 # Generate each <gensrc>/<impl>.cu:
@@ -385,15 +390,23 @@ for name in name_to_funcs.keys():
      sym = paste("_", coll, redop, ty, algo, proto)
      fn_id = primary_to_index[kfn]
      cudart, arch = required_cuda(*kfn)
+      s = "DEFINE_ncclDevKernel({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto}, {fn_id})\n"
      if (cudart, arch) != (0, 0):
-        out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch))
-      out(
-        "DEFINE_ncclDevKernel({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto}, {fn_id})\n"
-        .format(sym=sym, coll=coll, redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty],
-                algo=(algo or "RING"), proto=(proto or "SIMPLE"), fn_id=fn_id)
-      )
-      if (cudart, arch) != (0, 0):
-        out("#endif\n")
+        # Add conditional compilation logic around s. If CUDART_VERSION is satisfactory
+        # we must compile a kernel regardless of __CUDA_ARCH__ since the host code has
+        # to link against some stub.
+        s = "#if CUDART_VERSION >= {cudart}\n" \
+            "  #if __CUDA_ARCH__ < {arch}\n" \
+            "    DEFINE_ncclDevKernel_nop({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto}, {fn_id})\n" \
+            "  #else\n" \
+            "    " + s + \
+            "  #endif\n" \
+            "#endif\n"
+      out(s.format(
+        cudart=cudart, arch=arch, sym=sym, coll=coll,
+        redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty],
+        algo=(algo or "RING"), proto=(proto or "SIMPLE"), fn_id=fn_id
+      ))

    for fn in fns:
      (coll, redop, ty, algo, proto) = fn
@@ -33,17 +33,21 @@ inline __device__ void load64gpu(const uint64_t* ptr, uint64_t &v) {
 // Map internal association of handle with group and peer index (called once at init time)
 inline __device__ void ncclNetDeviceUnpackSetup(void* ohandle, const int group, const int index) {
  struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle;
+  // coverity[index_parm:FALSE]
  ncclShmem.groups[group].devicePlugin.unpack.g_meta[index] = handle->meta;
  ncclShmem.devicePlugin.unpack.bounce_buf = handle->bounce_buf;
+  // coverity[index_parm:FALSE]
  ncclShmem.groups[group].devicePlugin.unpack.head[index] = handle->head;
 }

 inline __device__ void ncclNetDeviceIncrementHead(const int group, const int index) {
+  // coverity[index_parm:FALSE]
  ncclShmem.groups[group].devicePlugin.unpack.head[index]++;
 }

 inline __device__ void ncclNetDeviceSaveHead(void* ohandle, const int group, const int index) {
  struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle;
+  // coverity[index_parm:FALSE]
  handle->head = ncclShmem.groups[group].devicePlugin.unpack.head[index];
 }

@@ -62,6 +62,10 @@ ncclResult_t ncclLaunchOneRank(void* dst, void const* src, size_t nElts, struct
  case ncclUint32:   kernel = (void const*)&oneRankReduce<FuncPreMulSum<uint32_t>>; break;
  case ncclInt64:    kernel = (void const*)&oneRankReduce<FuncPreMulSum<int64_t>>; break;
  case ncclUint64:   kernel = (void const*)&oneRankReduce<FuncPreMulSum<uint64_t>>; break;
+  #if defined(__CUDA_FP8_TYPES_EXIST__) && __CUDA_ARCH__ >= 900
+  case ncclFloat8e4m3: kernel = (void const*)&oneRankReduce<FuncPreMulSum<__nv_fp8_e4m3>>; break;
+  case ncclFloat8e5m2: kernel = (void const*)&oneRankReduce<FuncPreMulSum<__nv_fp8_e5m2>>; break;
+  #endif
  case ncclFloat16:  kernel = (void const*)&oneRankReduce<FuncPreMulSum<half>>; break;
  #if defined(__CUDA_BF16_TYPES_EXIST__)
  case ncclBfloat16: kernel = (void const*)&oneRankReduce<FuncPreMulSum<__nv_bfloat16>>; break;
@@ -103,7 +103,7 @@ struct FanSymmetric {
 };

 // The primitives class. Specialized per protocol in the other headers.
-template<typename T, typename RedOp, typename Fan, int Direct, typename Proto, int P2p>
+template<typename T, typename RedOp, typename Fan, int Direct, typename Proto, int P2p, bool isNetOffload = false>
 class Primitives;

 // Used by LL & LL128 to implement direct members in the naive way.
@@ -121,9 +121,12 @@ struct PrimitivesWithoutDirect {
  __device__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
    static_cast<RealPrimitives*>(this)->copySend(inpIx, outIx, eltN, postOp);
  }
-  __device__ void directRecvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) {
+  __device__ void directRecvCopyDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
    static_cast<RealPrimitives*>(this)->recvCopySend(outIx, eltN, /*postOp=*/false);
  }
+  __device__ void directRecvDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
+    return;
+  }
  __device__ void recvReduceCopyDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
    // Direct is only for the send part
    static_cast<RealPrimitives*>(this)->recvReduceCopySend(inpIx, outIx, eltN, postOp);
@@ -4,9 +4,9 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-template<typename T, typename RedOp, typename Fan, int Direct, int P2p>
-class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
-  public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>> {
+template<typename T, typename RedOp, typename Fan, int Direct, int P2p, bool isNetOffload>
+class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p, isNetOffload>:
+  public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p, isNetOffload>> {

  // In the case of Fan::MaxRecv == 0, we need to force MaxRecv to 1 for this to compile
  // This is because of a recv buffer which is allocated to MaxRecv length in send-only cases
@@ -8,9 +8,9 @@

 #define NCCL_LL128_FLAGTHREAD (NCCL_LL128_LINEELEMS-1)

-template<typename T, typename RedOp, typename Fan, int Direct, int P2p>
-class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
-  public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>> {
+template<typename T, typename RedOp, typename Fan, int Direct, int P2p, bool isNetOffload>
+class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p, isNetOffload>:
+  public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p, isNetOffload>> {

  static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
  static constexpr int Input=0, Output=1;
@@ -14,9 +14,9 @@ enum primsMode {
 };

 template<typename T, typename RedOp, typename Fan, int Direct,
-         int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, int MultimemSrcs, int MultimemDsts>
+         int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, int MultimemSrcs, int MultimemDsts, bool isNetOffload>
 class Primitives<
-    T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll, MultimemSrcs, MultimemDsts>, P2p
+    T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll, MultimemSrcs, MultimemDsts>, P2p, isNetOffload
  > {
  static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
  static constexpr int Input=0, Output=1;
@@ -34,11 +34,7 @@ class Primitives<
                       PatMode = 0x800,
                       NvlsMinPolling = 0x1000,
                       NetDeviceUnpack = 0x2000,
-                       AnyNetDeviceUnpack = 0x4000,
-                       NvlsDirectRead = 0x8000,
-                       NvlsDirectWrite = 0x10000,
-                       IpcWrite = 0x20000,
-                       IpcRead = 0x40000;
+                       AnyNetDeviceUnpack = 0x4000;
  const int tid, tidInBlock;
  const int nthreads;
  int nworkers;
@@ -119,12 +115,9 @@ class Primitives<
  template <int DirectRecv, int DirectSend, int Recv, int Send, int Src, int Dst>
  __device__ __forceinline__ void waitPeer(intptr_t srcIx, intptr_t dstIx, int offset, int nelts) {
    const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;
-    const bool noRecvWait = DirectRecv && Src && (flags & (DirectRead | IpcRead));        // no wait when directly reading from remote input
-    const bool noSendWait = DirectSend && (flags & (DirectRead|DirectWrite)); // no wait in empty send (e.g. directScatter) or direct remote write
    // Yes, for some template arguments this code will be unreachable.  That's fine.
    // coverity[dead_error_line]
-    if (((flags & (Recv*RoleWaitRecv)) && !noRecvWait) ||
-        ((flags & (Send*RoleWaitSend)) && !noSendWait)) {
+    if ((flags & (Recv * RoleWaitRecv)) || (flags & (Send * RoleWaitSend))) {
      int spins = 0;
      while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) {
        connStepCache = loadStepValue(connStepPtr);
@@ -134,27 +127,38 @@ class Primitives<
    }

    if (flags & (Recv*RoleWaitRecv | Send*RoleWaitSend)) {
-      if (flags & ConnFifoEnabled)
+      if ((flags & ConnFifoEnabled) && (flags & (Send * RoleWaitSend)))
        connFifo[step%NCCL_STEPS].size = nelts*sizeof(T);

      void **ptrs = isSendNotRecv ? (ncclShmem.groups[group].dsts + Dst)
                                  : (ncclShmem.groups[group].srcs + Src);
      if (flags & NetRegMode) {
-         // Do nothing
+        if (P2p) {
+          ptrs[index] = NULL;
+        } else {
+          if (isSendNotRecv) {
+            if (!Recv)
+              ptrs[index] = NULL;
+            else
+              ptrs[index] = (T*)ncclShmem.groups[group].userOutput + dstIx + offset;
+          } else {
+            ptrs[index] = (T*)ncclShmem.groups[group].userOutput + srcIx + offset;
+          }
+        }
      } else if ((flags & ConnFifoEnabled) && connFifo[step%NCCL_STEPS].mode == NCCL_MODE_OFFSET) {
        ptrs[index] = connEltsFifo + loadInt(&connFifo[step%NCCL_STEPS].offset)/sizeof(T);
      } else if (isSendNotRecv && DirectSend) {
-        if (flags & (DirectWrite | NvlsDirectWrite | IpcWrite)) {
+        if (flags & DirectWrite) {
          ptrs[index] = directBuff + dstIx + offset;
-        } else if ((flags & DirectRead) || (flags & IpcRead)) {  // empty send
+        } else if (flags & DirectRead) {  // empty send
          ptrs[index] = nullptr;
        } else {
          ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
        }
      } else if (!isSendNotRecv && DirectRecv) {
-        if (flags & (DirectRead | NvlsDirectRead | IpcRead)) {
+        if (flags & DirectRead) {
          ptrs[index] = directBuff + srcIx + offset;
-        } else if ((flags & DirectWrite) || (flags & IpcWrite)) {
+        } else if (flags & DirectWrite) {
          ptrs[index] = directBuff + dstIx + offset;  // send to next from my output buffer
        } else {
          ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
@@ -198,7 +202,7 @@ class Primitives<
    int slice = 0;
    int offset = 0;

-    if (tid < nworkers && offset < nelem && ((flags & NetRegMode) == 0)) {
+    if (tid < nworkers && offset < nelem && !isNetOffload) {
      // Worker-only loop for non-empty slices. Non-workers and empty slices are
      // processed in the loop following this if block. The benefit of splitting
      // the loop like this is we pull two branches out of the critical path.
@@ -252,7 +256,7 @@ class Primitives<
             * so we need to check whether MultimemSrcs and MultimemDsts are 0. */
            && MultimemSrcs == 0 && MultimemDsts == 0 && !Src) {
          // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
-          if (Send) {
+          if (Send && Dst && ncclShmem.groups[group].srcs[0] != ncclShmem.groups[group].dsts[1]) {
            reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, MaxSend, /*PreOpSrcs*/0>
              (tid, nworkers, /*redArg*/0, /*preOpArgs*/nullptr, /*postOp*/false,
               1, ncclShmem.groups[group].srcs,
@@ -269,16 +273,32 @@ class Primitives<
        } else if (ncclShmem.groups[group].srcs[0] && ncclShmem.groups[group].dsts[0]) {
          constexpr int PreOpSrcs = SrcBuf != Input ? 0 :
                                    DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1;
-          reduceCopy<Unroll, RedOp, T,
-            MultimemSrcs, Recv+Src, Recv*MaxRecv+Src,
-            MultimemDsts, Send+Dst, Send*MaxSend+Dst, PreOpSrcs>
-            (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp,
-             Recv*fan.nrecv()+Src, ncclShmem.groups[group].srcs,
-             Send*fan.nsend()+Dst, ncclShmem.groups[group].dsts,
-             workSize);
+          if (Send && Dst && ncclShmem.groups[group].dsts[1] == nullptr) {
+            // this case should only be directCopySend() with registered buffers and send to net peer
+            reduceCopy<Unroll, RedOp, T,
+              0, Recv + Src, Recv * MaxRecv + Src,
+              0, 1, 1, PreOpSrcs>
+              (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp,
+                Recv * fan.nrecv() + Src, ncclShmem.groups[group].srcs,
+                1, ncclShmem.groups[group].dsts,
+                workSize);
+          } else {
+            reduceCopy<Unroll, RedOp, T,
+              MultimemSrcs, Recv + Src, Recv * MaxRecv + Src,
+              MultimemDsts, Send + Dst, Send * MaxSend + Dst, PreOpSrcs>
+              (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp,
+                Recv * fan.nrecv() + Src, ncclShmem.groups[group].srcs,
+                Send * fan.nsend() + Dst, ncclShmem.groups[group].dsts,
+                workSize);
+          }
+        } else {
+          // we will come here when calling prims.directSend with net peer,
+          // in this case, ncclShmem.groups[group].dsts[0] == NULL, so we
+          // skip data flush.
+          workSize = 0;
        }
        barrier(); // This barrier has a counterpart in following loop
-        postPeer<Recv, Send>(0 < sliceSize);
+        postPeer<Recv, Send>(0 < workSize);
        offset += sliceSize;
        slice += 1;
        // Yes, for some template arguments this code will be unreachable.  That's fine.
@@ -295,10 +315,11 @@ class Primitives<
      sliceSize = sliceSize < nelem-offset ? sliceSize : nelem-offset;
      { // Only workers could have Wait roles so we know the slice must be empty
        // since we've exited the loop above.
-        waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(0, 0, 0, 0);
+        waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(0, 0, 0, sliceSize);
      }
      barrier(); // Has couterpart in preceding worker-only loop.
-      postPeer<Recv, Send>(0 < sliceSize);
+      int workSize = ncclShmem.aborted ? 0 : sliceSize;
+      postPeer<Recv, Send>(0 < workSize);
      offset += sliceSize;
      slice += 1;
    }
@@ -347,17 +368,17 @@ public:
            ptrs[index] = connEltsFifo + offset/sizeof(T);
          } else if (Direct && fn.work->regUsed) {
            if (isSendNotRecv) {
-              if (flags & (DirectWrite | IpcWrite)) {
+              if (flags & DirectWrite) {
                ptrs[index] = directBuff;
-              } else if (flags & (DirectRead | IpcRead)) {  // empty send
+              } else if (flags & DirectRead) {  // empty send
                ptrs[index] = nullptr;
              } else {
                ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
              }
            } else {
-              if (flags & (DirectRead | IpcRead)) {
+              if (flags & DirectRead) {
                ptrs[index] = directBuff;
-              } else if (flags & (DirectWrite | IpcWrite)) {
+              } else if (flags & DirectWrite) {
                if (Send)
                  ptrs[index] = directBuff;  // send to next from my output buffer
                else
@@ -440,7 +461,7 @@ private:
            int i = (j+shift)%fan.nsend();
            ssize_t pOffset = i*peerOffset;
            // Skip the data I am responsible of reducing myself
-            if (skip >= 0 && i >= skip) pOffset += peerElem;
+            if (skip >= 0 && i >= skip) pOffset += peerOffset;
            void* src0 = (T*)ncclShmem.groups[group].srcs[0] + pOffset;
            ssize_t realPeerSize = min(realSize, totalElem-pOffset);
            if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) {
@@ -452,7 +473,7 @@ private:
        } else if (Recv) {
          if (tid==0) ncclShmem.groups[group].dsts[0] = (T*)ncclShmem.groups[group].userOutput + outIx + offset;
          ssize_t pOffset = index*peerOffset;
-          if (skip >= 0 && index >= skip) pOffset += peerElem;
+          if (skip >= 0 && index >= skip) pOffset += peerOffset;
          // Adjust remote index with peer offset in case we are directly pulling from peer's output buffer
          waitPeer<DirectRecv, 0, 1, 0, 0, 1>(outIx+pOffset, outIx+pOffset, offset, realSize);
          subBarrier();
@@ -460,7 +481,7 @@ private:
          for (int j=0; j<fan.nrecv(); j++) {
            int i = (j+shift)%fan.nrecv();
            pOffset = i*peerOffset;
-            if (skip >= 0 && i >= skip) pOffset += peerElem;
+            if (skip >= 0 && i >= skip) pOffset += peerOffset;
            void* dst0 = (T*)ncclShmem.groups[group].dsts[0] + pOffset;
            ssize_t realPeerSize = min(realSize, totalElem-pOffset);
            if (DirectRecv && ncclShmem.groups[group].srcs[i] == dst0) realPeerSize = 0;
@@ -474,7 +495,7 @@ private:
    }
  }

-  __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int regFlag) {
+  __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int ipcRegFlag, int netRegFlag) {
    conn = &peer->recv[connIndex];
    if (conn->netDeviceHandle.netDeviceType == NCCL_NET_DEVICE_UNPACK) {
      // handle must be a device ptr
@@ -499,33 +520,34 @@ private:
      if (conn->connFifo != nullptr) {
        flags |= ConnFifoEnabled;
        connFifo = conn->connFifo;
-      } else if (Direct && regFlag) {
-        // User buffers have been registered
-        if (conn->flags & (NCCL_IPC_READ | NCCL_IPC_WRITE)) {
-          if (P2p) {
-            flags |= conn->flags & NCCL_IPC_WRITE ? IpcWrite : IpcRead;
-          } else if (connIndex == 1 && direct) {
-            flags |= IpcRead;
-          } else {
-            flags |= direct & NCCL_DIRECT_READ ? IpcRead : IpcWrite;
+      }
+      if (Direct) {
+        if (ipcRegFlag) {
+          // User buffers have been registered
+          if (conn->flags & (NCCL_P2P_READ | NCCL_P2P_WRITE)) {
+            if (P2p) {
+              flags |= conn->flags & NCCL_P2P_WRITE ? DirectWrite : DirectRead;
+            } else if (connIndex == 1 && direct) {
+              flags |= DirectRead;
+            } else {
+              flags |= direct & NCCL_P2P_READ ? DirectRead : DirectWrite;
+            }
+          } else if ((conn->flags & NCCL_NVLS_MIN_POLL)) {
+            /* NVLS direct */
+            flags |= DirectRead;
          }
-        } else if (conn->flags & (NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) {
-          if (P2p) {
-            flags |= conn->flags & NCCL_DIRECT_WRITE ? DirectWrite : DirectRead;
-          } else if (connIndex == 1 && direct) {
-            flags |= DirectRead;  // scatter-reduce use direct pull
-          } else {
-            flags |= direct & NCCL_DIRECT_READ ? DirectRead : DirectWrite;
+        }
+        if (netRegFlag) {
+          if (conn->flags & NCCL_DIRECT_NIC) {
+            flags |= NetRegMode;
+            connFifo[step % NCCL_STEPS].size = 0;
          }
-        } else if ((conn->flags & NCCL_NVLS_MIN_POLL)) {
-          /* NVLS direct */
-          flags |= NvlsDirectRead;
        }
      }
    }
  }

-  __device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int regFlag) {
+  __device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int ipcRegFlag, int netRegFlag) {
    conn = &peer->send[connIndex];
    step = conn->step;
    step = roundUp(step, SlicePerChunk*StepPerSlice);
@@ -544,27 +566,26 @@ private:
      connStepCache = loadStepValue(connStepPtr);
      connStepSize = conn->stepSize/sizeof(T);
      connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
-      if (connFifo == nullptr && Direct && regFlag) {
-        // User buffers have been registered
-        if (conn->flags & (NCCL_IPC_READ | NCCL_IPC_WRITE)) {
-          if (P2p) {
-            flags |= conn->flags & NCCL_IPC_WRITE ? IpcWrite : IpcRead;
-          } else if (connIndex == 1 && direct) {
-            flags |= IpcRead;
-          } else {
-            flags |= direct & NCCL_DIRECT_READ ? IpcRead : IpcWrite;
+      if (Direct) {
+        if (ipcRegFlag) {
+          // User buffers have been registered
+          if (conn->flags & (NCCL_P2P_WRITE | NCCL_P2P_READ)) {
+            if (P2p) {
+              flags |= conn->flags & NCCL_P2P_WRITE ? DirectWrite : DirectRead;
+            } else if (connIndex == 1 && direct) {
+              flags |= DirectRead;  // scatter-reduce use direct pull
+            } else {
+              flags |= direct & NCCL_P2P_READ ? DirectRead : DirectWrite;
+            }
+          } else if ((conn->flags & NCCL_NVLS_MIN_POLL)) {
+            /* NVLS direct */
+            flags |= DirectWrite;
          }
-        } else if (conn->flags & (NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) {
-          if (P2p) {
-            flags |= conn->flags & NCCL_DIRECT_WRITE ? DirectWrite : DirectRead;
-          } else if (connIndex == 1 && direct) {
-            flags |= DirectRead;  // scatter-reduce use direct pull
-          } else {
-            flags |= direct & NCCL_DIRECT_READ ? DirectRead : DirectWrite;
+        }
+        if (netRegFlag) {
+          if (conn->flags & NCCL_DIRECT_NIC) {
+            flags |= NetRegMode;
          }
-        } else if ((conn->flags & NCCL_NVLS_MIN_POLL)) {
-          /* NVLS direct */
-          flags |= NvlsDirectWrite;
        }
      }
    }
@@ -574,8 +595,8 @@ private:
  __device__ Primitives(
      int tid, int nthreads, int const *recvPeers, int const *sendPeers,
      void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
-      uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* e = nullptr,
-      bool ipcReg = false, bool netReg = false, int stepSize_ = 0, int mode = primsModeDefault
+      uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* collWork = nullptr,
+      struct ncclDevWorkP2p* p2pWork = nullptr, int stepSize_ = 0, int mode = primsModeDefault
    ):
    tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group),
    stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) {
@@ -643,11 +664,23 @@ private:

    // Coverity thinks that index could be -1 here but that's not actually the case.
    // coverity[negative_returns:FALSE]
-    if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e ? e->direct : 0, e ? e->regUsed : ipcReg);
-    // coverity[negative_returns:FALSE]
-    if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e ? e->direct : 0, e ? e->regUsed : ipcReg);
-
-    if (netReg) flags |= NetRegMode;
+    int sendIpcReg;
+    int recvIpcReg;
+    int sendNetReg;
+    int recvNetReg;
+    if (P2p) {
+      sendIpcReg = p2pWork ? p2pWork->sendIpcReg : 0;
+      recvIpcReg = p2pWork ? p2pWork->recvIpcReg : 0;
+      sendNetReg = p2pWork ? p2pWork->sendNetReg : 0;
+      recvNetReg = p2pWork ? p2pWork->recvNetReg : 0;
+    } else {
+      recvIpcReg = sendIpcReg = collWork ? collWork->regUsed : 0;
+      recvNetReg = sendNetReg = collWork ? collWork->netRegUsed : 0;
+    }
+    // coverity[overrun-call] => Coverity thinks prims.index can be greater than 1
+    if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg);
+    // coverity[overrun-call] => Coverity thinks prims.index can be greater than 1
+    if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg);

    if (barrierAny(flags & NetDeviceUnpack)) {
      flags |= AnyNetDeviceUnpack;
@@ -659,8 +692,10 @@ private:
      }
    }

-    // coverity[negative_returns:FALSE]
-    setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)e, (uint8_t)(e ? e->regUsed : ipcReg), peer);
+    // coverity[negative_returns:FALSE] => coverity thinks that index could be -1 but that's not actually the case
+    // coverity[var_deref_model] => coverity thinks work can be dereferenced while NULL, but this is not the case
+    setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)collWork, sendIpcReg || recvIpcReg, peer);
+    // coverity[uninit_member] => coverity thinks fan.n is not initialized
  }

  __device__ ~Primitives() {
@@ -683,6 +718,16 @@ private:
    // Make sure all threads are done writing back conn->step and done using
    // ncclShmem.groups[group]
    barrier();
+
+    if ((flags & DirectRead) && (flags & RoleWaitSend) && P2p) {
+      // For sendrecv DirectRead, sender needs to wait for receiver reading data from src.
+      // This has to be done after barrier() since post thread might have contention with
+      // this check.
+      int spins = 0;
+      volatile uint64_t* tail = conn->tail;
+      volatile uint64_t* head = conn->head;
+      while (*tail > *head) if (checkAbort(spins)) break;
+    }
  }

  __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclDevWorkCollReg* work, uint8_t ipcReg, int peer) {
@@ -693,10 +738,10 @@ private:
    }

    if (Direct && ipcReg) {
-      bool recvProvider = (flags & RoleWaitRecv) && (flags & DirectWrite || flags & IpcWrite);
-      bool sendAcceptor = (flags & RoleWaitSend) && (flags & DirectWrite || flags & IpcWrite || flags & NvlsDirectWrite);
-      bool sendProvider = (flags & RoleWaitSend) && (flags & DirectRead || flags & IpcRead); // sender provides direct buffer (to be fetched)
-      bool recvAcceptor = (flags & RoleWaitRecv) && (flags & DirectRead || flags & IpcRead || flags & NvlsDirectRead); // receiver accepts direct buffer
+      bool recvProvider = (flags & RoleWaitRecv) && (flags & DirectWrite);
+      bool sendAcceptor = (flags & RoleWaitSend) && (flags & DirectWrite);
+      bool sendProvider = (flags & RoleWaitSend) && (flags & DirectRead); // sender provides direct buffer (to be fetched)
+      bool recvAcceptor = (flags & RoleWaitRecv) && (flags & DirectRead); // receiver accepts direct buffer
      if (recvProvider) {
        int spins = 0;
        void* volatile* slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
@@ -709,6 +754,7 @@ private:
            exchgPtr = (T*)outputBuf;
          } else {
            int localPeer = ncclShmem.comm.rankToLocalRank[peer];
+            // coverity[deref_parm:FALSE] => work cannot be NULL if ipcReg != NULL
            exchgPtr = (T*)(work->coll.recvbuffOffset + work->coll.recvbuffRmtAddrs[localPeer]);
          }
          *slot = reinterpret_cast<void*>(exchgPtr);
@@ -727,6 +773,7 @@ private:
          directBuff = reinterpret_cast<T*>(ptr);
          *slot = nullptr;
        } else {
+          // coverity[var_deref_op]
          directBuff = (T*)work->dnOutputs[index];
        }
      }
@@ -747,8 +794,10 @@ private:
          } else {
            int localPeer = ncclShmem.comm.rankToLocalRank[peer];
            if (MaxRecv == 0)
+              // coverity[var_deref_op]
              exchgPtr = (T*)(work->coll.sendbuffOffset + work->coll.sendbuffRmtAddrs[localPeer]);
            else
+              // coverity[var_deref_op]
              exchgPtr = (T*)(work->coll.recvbuffOffset + work->coll.recvbuffRmtAddrs[localPeer]);
          }

@@ -837,11 +886,11 @@ private:
  __device__ __forceinline__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) {
    genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
  }
-  __device__ __forceinline__ void directRecvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) {
-    genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
+  __device__ __forceinline__ void directRecvCopyDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
+    genericOp<1, 1, 1, 1, -1, Output>(inpIx, outIx, eltN, postOp);  // first two template args are 1 here vs. 0 in recvCopySend; presumably the DirectRecv/DirectSend switches (confirm against genericOp)
  }
-  __device__ __forceinline__ void directRecvDirectSend(intptr_t inpIx, intptr_t outIx, int eltN) {
-    genericOp<1, 1, 1, 1, -1, -1>(inpIx, outIx, eltN, false);
+  __device__ __forceinline__ void directRecvDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
+    genericOp<1, 1, 1, 1, -1, -1>(inpIx, outIx, eltN, postOp);  // last two template args are both -1 here, where the copy/reduce wrappers pass Input or Output
  }
  __device__ __forceinline__ void recvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) {
    genericOp<0, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
@@ -860,6 +909,9 @@ private:
  __device__ __forceinline__ void directRecvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) {
    genericOp<1, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp);
  }
+  __device__ __forceinline__ void recvReduceDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
+    genericOp<0, 1, 1, 1, Input, -1>(inpIx, outIx, eltN, postOp);  // differs from directRecvReduceDirectSend only in the first template argument (0 instead of 1); presumably a non-direct receive (confirm against genericOp)
+  }
  __device__ __forceinline__ void directRecvReduceDirectSend(intptr_t inpIx, intptr_t outIx, ssize_t eltN, bool postOp=false) {
    genericOp<1, 1, 1, 1, Input, -1>(inpIx, outIx, eltN, postOp);
  }
@@ -20,6 +20,12 @@ struct IsFloatingPoint<half>: std::true_type {};
 template<>
 struct IsFloatingPoint<__nv_bfloat16>: std::true_type {};
 #endif
+#if defined(__CUDA_FP8_TYPES_EXIST__)  // classify both fp8 encodings (e4m3, e5m2) as floating point
+template<>
+struct IsFloatingPoint<__nv_fp8_e4m3>: std::true_type {};
+template<>
+struct IsFloatingPoint<__nv_fp8_e5m2>: std::true_type {};
+#endif  // __CUDA_FP8_TYPES_EXIST__
 template<>
 struct IsFloatingPoint<float>: std::true_type {};
 template<>
@@ -298,6 +304,24 @@ SPECIALIZE_REDUCE(FuncMinMax, double, 1, double, fn.isMinNotMax ? fmin(x, y) : f
 #endif
 #endif

+#if defined(__CUDA_FP8_TYPES_EXIST__)  // fp8 reductions: Sum, Prod and MinMax for e4m3 and e5m2, in 1- and 2-element packs
+#if __CUDA_ARCH__ >= 900  // each expression below widens to __half/__half2, applies the half intrinsic, then narrows back to fp8
+  SPECIALIZE_REDUCE(FuncSum, __nv_fp8_e4m3, 1, __nv_fp8_e4m3, __nv_fp8_e4m3(__hadd(__half(x),__half(y))))
+  SPECIALIZE_REDUCE(FuncSum, __nv_fp8_e4m3, 2, __nv_fp8x2_e4m3, __nv_fp8x2_e4m3(__hadd2(__half2(x),__half2(y))))
+  SPECIALIZE_REDUCE(FuncProd, __nv_fp8_e4m3, 1, __nv_fp8_e4m3, __nv_fp8_e4m3(__hmul(__half(x),__half(y))))
+  SPECIALIZE_REDUCE(FuncProd, __nv_fp8_e4m3, 2, __nv_fp8x2_e4m3, __nv_fp8x2_e4m3(__hmul2(__half2(x),__half2(y))))
+  SPECIALIZE_REDUCE(FuncMinMax, __nv_fp8_e4m3, 1, __nv_fp8_e4m3, __nv_fp8_e4m3(fn.isMinNotMax ? __hmin(__half(x),__half(y)) : __hmax(__half(x),__half(y))))
+  SPECIALIZE_REDUCE(FuncMinMax, __nv_fp8_e4m3, 2, __nv_fp8x2_e4m3, __nv_fp8x2_e4m3(fn.isMinNotMax ? __hmin2(__half2(x),__half2(y)) : __hmax2(__half2(x),__half2(y))))
+
+  SPECIALIZE_REDUCE(FuncSum, __nv_fp8_e5m2, 1, __nv_fp8_e5m2, __nv_fp8_e5m2(__hadd(__half(x),__half(y))))
+  SPECIALIZE_REDUCE(FuncSum, __nv_fp8_e5m2, 2, __nv_fp8x2_e5m2, __nv_fp8x2_e5m2(__hadd2(__half2(x),__half2(y))))
+  SPECIALIZE_REDUCE(FuncProd, __nv_fp8_e5m2, 1, __nv_fp8_e5m2, __nv_fp8_e5m2(__hmul(__half(x),__half(y))))
+  SPECIALIZE_REDUCE(FuncProd, __nv_fp8_e5m2, 2, __nv_fp8x2_e5m2, __nv_fp8x2_e5m2(__hmul2(__half2(x),__half2(y))))
+  SPECIALIZE_REDUCE(FuncMinMax, __nv_fp8_e5m2, 1, __nv_fp8_e5m2, __nv_fp8_e5m2(fn.isMinNotMax ? __hmin(__half(x), __half(y)) : __hmax(__half(x), __half(y))))
+  SPECIALIZE_REDUCE(FuncMinMax, __nv_fp8_e5m2, 2, __nv_fp8x2_e5m2, __nv_fp8x2_e5m2(fn.isMinNotMax ? __hmin2(__half2(x), __half2(y)) : __hmax2(__half2(x), __half2(y))))
+#endif  // __CUDA_ARCH__ >= 900
+#endif  // __CUDA_FP8_TYPES_EXIST__
+
 #undef SPECIALIZE_REDUCE

 ////////////////////////////////////////////////////////////////////////////////
@@ -416,9 +440,9 @@ template<>
 struct FuncPreMulSum<half> {
  using EltType = half;
 #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
-  half2 scalar;
+  __half2 scalar;
  __device__ FuncPreMulSum(uint64_t opArg=0) {
-    union { uint64_t u64; half val; };
+    union { uint64_t u64; __half val; };
    u64 = opArg;
    scalar.x = val;
    scalar.y = val;
@@ -426,9 +450,9 @@ struct FuncPreMulSum<half> {
 #else
  float scalar;
  __device__ FuncPreMulSum(uint64_t opArg=0) {
-    union { uint64_t u64; half val; };
+    union { uint64_t u64; __half val; };
    u64 = opArg;
-    scalar = __half2float(val);
+    scalar = (float)val;
  }
 #endif
 };
@@ -459,11 +483,39 @@ struct FuncPreMulSum<half> {
  };
 #endif

-template<typename T>
-struct Apply_Reduce<FuncPreMulSum<T>, /*EltPerPack=*/1> {
-  __device__ static BytePack<sizeof(T)> reduce(FuncPreMulSum<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
+#if defined(__CUDA_FP8_TYPES_EXIST__)  // FuncPreMulSum for the two fp8 encodings; the scalar is kept as a __half2
+#if __CUDA_ARCH__ >= 900
+  template<>
+  struct FuncPreMulSum<__nv_fp8_e4m3> {
+    using EltType = __nv_fp8_e4m3;
+    __half2 scalar2;  // the pre-multiply scalar, held in half precision in both lanes
+    __device__ FuncPreMulSum(uint64_t opArg) {
+      union { uint64_t u64; __nv_fp8_storage_t val; };  // val overlays the start of u64: the fp8 scalar is read out of opArg by type-punning
+      u64 = opArg;
+      scalar2.x = __half(__nv_cvt_fp8_to_halfraw(val, __NV_E4M3));  // decode the e4m3 bits to half
+      scalar2.y = scalar2.x;  // duplicate so the 2-element preOp can use __hmul2 with scalar2 directly
+    }
+  };
+
+  template<>
+  struct FuncPreMulSum<__nv_fp8_e5m2> {
+    using EltType = __nv_fp8_e5m2;
+    __half2 scalar2;  // the pre-multiply scalar, held in half precision in both lanes
+    __device__ FuncPreMulSum(uint64_t opArg) {
+      union { uint64_t u64; __nv_fp8_storage_t val; };  // val overlays the start of u64: the fp8 scalar is read out of opArg by type-punning
+      u64 = opArg;
+      scalar2.x = __half(__nv_cvt_fp8_to_halfraw(val, __NV_E5M2));  // decode the e5m2 bits to half
+      scalar2.y = scalar2.x;  // duplicate so the 2-element preOp can use __hmul2 with scalar2 directly
+    }
+  };
+#endif  // __CUDA_ARCH__ >= 900
+#endif  // __CUDA_FP8_TYPES_EXIST__
+
+template<typename T, int EltPerPack>
+struct Apply_Reduce<FuncPreMulSum<T>, EltPerPack> {
+  __device__ static BytePack<EltPerPack*sizeof(T)> reduce(FuncPreMulSum<T> fn, BytePack<EltPerPack*sizeof(T)> a, BytePack<EltPerPack*sizeof(T)> b) {
    // FuncPreMulSum reduce dispatches to FuncSum.
-    return Apply_Reduce<FuncSum<T>, 1>::reduce(FuncSum<T>(), a, b);
+    return Apply_Reduce<FuncSum<T>, EltPerPack>::reduce(FuncSum<T>(), a, b);  // fn is not consulted: a default FuncSum<T> performs the reduction at the same pack width
  }
 };

@@ -530,6 +582,51 @@ struct Apply_PreOp<FuncPreMulSum<half>, /*EltPerPack=*/1> {
  #endif
 #endif

+////////////////////////////////////////////////////////////////////////////////
+// Apply_PreOp of FuncPreMulSum for fp8.
+
+#if defined(__CUDA_FP8_TYPES_EXIST__)
+#if __CUDA_ARCH__ >= 900
+  template<>
+  struct Apply_PreOp<FuncPreMulSum<__nv_fp8_e4m3>, /*EltPerPack=*/1> {
+    static constexpr bool IsIdentity = false;
+    __device__ static BytePack<sizeof(__nv_fp8_e4m3)> preOp(
+        FuncPreMulSum<__nv_fp8_e4m3> fn, BytePack<sizeof(__nv_fp8_e4m3)> a
+      ) {
+      return toPack<__nv_fp8_e4m3>(__nv_fp8_e4m3(__hmul(__half(fromPack<__nv_fp8_e4m3>(a)), fn.scalar2.x)));  // unpack -> half, multiply by the scalar, convert back to e4m3 and repack
+    }
+  };
+  template<>
+  struct Apply_PreOp<FuncPreMulSum<__nv_fp8_e4m3>, /*EltPerPack=*/2> {
+    static constexpr bool IsIdentity = false;
+    __device__ static BytePack<sizeof(__nv_fp8x2_e4m3)> preOp(
+        FuncPreMulSum<__nv_fp8_e4m3> fn, BytePack<sizeof(__nv_fp8x2_e4m3)> a
+      ) {
+      return toPack<__nv_fp8x2_e4m3>(__nv_fp8x2_e4m3(__hmul2(__half2(fromPack<__nv_fp8x2_e4m3>(a)), fn.scalar2)));  // same steps on a pair of elements, using both lanes of scalar2
+    }
+  };
+
+  template<>
+  struct Apply_PreOp<FuncPreMulSum<__nv_fp8_e5m2>, /*EltPerPack=*/1> {
+    static constexpr bool IsIdentity = false;
+    __device__ static BytePack<sizeof(__nv_fp8_e5m2)> preOp(
+        FuncPreMulSum<__nv_fp8_e5m2> fn, BytePack<sizeof(__nv_fp8_e5m2)> a
+      ) {
+      return toPack<__nv_fp8_e5m2>(__nv_fp8_e5m2(__hmul(__half(fromPack<__nv_fp8_e5m2>(a)), fn.scalar2.x)));  // unpack -> half, multiply by the scalar, convert back to e5m2 and repack
+    }
+  };
+  template<>
+  struct Apply_PreOp<FuncPreMulSum<__nv_fp8_e5m2>, /*EltPerPack=*/2> {
+    static constexpr bool IsIdentity = false;
+    __device__ static BytePack<sizeof(__nv_fp8x2_e5m2)> preOp(
+        FuncPreMulSum<__nv_fp8_e5m2> fn, BytePack<sizeof(__nv_fp8x2_e5m2)> a
+      ) {
+      return toPack<__nv_fp8x2_e5m2>(__nv_fp8x2_e5m2(__hmul2(__half2(fromPack<__nv_fp8x2_e5m2>(a)), fn.scalar2)));  // same steps on a pair of elements, using both lanes of scalar2
+    }
+  };
+#endif  // __CUDA_ARCH__ >= 900
+#endif  // __CUDA_FP8_TYPES_EXIST__
+
 ////////////////////////////////////////////////////////////////////////////////
 // FuncSumPostDiv

@@ -541,34 +638,44 @@ struct RedOpArg<FuncSumPostDiv<T>> {
  }
 };

-template<typename T, bool IsFloating=IsFloatingPoint<T>::value>
-struct FuncSumPostDiv_IntOnly;
-
 template<typename T>
-struct FuncSumPostDiv: FuncSumPostDiv_IntOnly<T> {
-  __device__ FuncSumPostDiv(uint64_t opArg=0):
-    FuncSumPostDiv_IntOnly<T>(opArg) {
+struct FuncSumPostDiv {
+  static_assert(T(0) < T(-1), "FuncSumPostDiv is only for implementing ncclAvg on uint types.");  // T(-1) compares greater than T(0) only when T is unsigned
+  using EltType = T;
+  using UintType = typename std::conditional<sizeof(T)==8, uint64_t, uint32_t>::type;  // working width for divide(): 64-bit for 8-byte T, otherwise 32-bit
+  uint32_t divisor:31, isSigned:1;  // both unpacked from opArg by the constructor
+  UintType recip;
+  
+  __device__ FuncSumPostDiv(uint64_t opArg=0) {
+    isSigned = opArg & 1;  // bit 0 of opArg: treat values as signed data carried in the unsigned T
+    divisor = opArg >> 1;  // remaining bits of opArg, truncated to the 31-bit field
+    recip =  UintType(-1)/divisor;  // floor(max UintType / divisor). NOTE(review): with opArg == 0 (the default) divisor is 0 here; confirm that path never relies on recip
+  }
+  __device__ T divide(T x) {
+    // x is negative iff we are in signed mode and the top bit is set
+    bool xneg = isSigned && (x & ~(T(-1)>>1));
+    // Compute abs(x):
+    // T(-x) vs -T(x) is critical. We have to negate then truncate the bits. Consider
+    // if we are doing signed 8-bit types, thus T=uint8_t. The value -1 is encoded
+    // as 0xff. -T(0xff) when promoted to 32-bit (which is implicit by compiler)
+    // gives 0xffffff01, but T(-0xff) is 0x1, and that is the abs value we want.
+    UintType xabs = xneg ? T(-x) : x;
+    // Compute quotient by multiplying by reciprocal.
+    UintType q = sizeof(T)==8 ? __umul64hi(xabs, recip) : __umulhi(xabs, recip);
+    // Quotient may be off by one so do a fixup.
+    if (xabs - q*divisor >= divisor) q += 1;
+    // If original x was negative then we have to negate it back since we were
+    // working with its abs val.
+    return xneg ? -T(q) : T(q);  // the negated value is converted back to T by the return type
   }
 };

-template<typename T>
-struct FuncSumPostDiv_IntOnly<T, /*IsFloating=*/false>: FuncSum<T> {
-  using EltType = T;
-  int divisor;
-  __device__ FuncSumPostDiv_IntOnly(uint64_t opArg=0): divisor(opArg) {}
-};
-
-template<typename T>
-struct FuncSumPostDiv_IntOnly<T, /*IsFloating=*/true> {
-  static_assert(sizeof(T)!=sizeof(T), "FuncSumPostDiv is only for implementing ncclAvg on integral types.");
-};
-
-template<typename T>
-struct Apply_Reduce<FuncSumPostDiv<T>, /*EltPerPack=*/1>:
-    Apply_Reduce<FuncSum<T>, 1> {
-  __device__ static BytePack<sizeof(T)> reduce(FuncSumPostDiv<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
+template<typename T, int EltPerPack>
+struct Apply_Reduce<FuncSumPostDiv<T>, EltPerPack>:
+    Apply_Reduce<FuncSum<T>, EltPerPack> {
+  __device__ static BytePack<EltPerPack*sizeof(T)> reduce(FuncSumPostDiv<T> fn, BytePack<EltPerPack*sizeof(T)> a, BytePack<EltPerPack*sizeof(T)> b) {
    // FuncSumPostDiv reduce dispatches to FuncSum.
-    return Apply_Reduce<FuncSum<T>, 1>::reduce(FuncSum<T>(), a, b);
+    return Apply_Reduce<FuncSum<T>, EltPerPack>::reduce(FuncSum<T>(), a, b);  // fn is not consulted here; the division is applied separately by Apply_PostOp via fn.divide()
  }
 };

@@ -576,7 +683,7 @@ template<typename T>
 struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
  static constexpr bool IsIdentity = false;
  __device__ static BytePack<sizeof(T)> postOp(FuncSumPostDiv<T> fn, BytePack<sizeof(T)> a) {
-    return toPack<T>(fromPack<T>(a) / fn.divisor);
+    return toPack<T>(fn.divide(fromPack<T>(a)));
  }
 };

@@ -89,7 +89,7 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SI
    T *inputBuf = (T*)work->sendbuff;
    T *outputBuf = (T*)work->recvbuff;
    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
-      (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, false, false, 0, primsModePatRs);
+      (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, nullptr, 0, primsModePatRs);

    PatRSAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
    int last = 0;
@@ -137,6 +137,7 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_S
          nelem = min(chunkCount, channelCount - elemOffset);
          prims.scatter(offset, nvls->nHeads * count, nelem, count, -1, 0);
        }
+        // coverity[overrun-call] => Coverity thinks prims.index can be greater than 1
      } else if (tid < tidEndReduce) {
        // Reduce through NVLS
        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
@@ -206,10 +207,10 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
      int nRails = direct->nHeads;
      int part = ncclShmem.channelId - work->channelLo;
      void* inbuf = (void*)work->sendbuff;
-      ssize_t sizePerRank = work->collnet.count;
+      ssize_t countPerRank = work->collnet.count;

-      ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*sizePerRank);
-      ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank);
+      ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*countPerRank);
+      ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*countPerRank);
      int railAllSize = railAllEnd - railAllBeg;
      if (tid < nDsts) dstSizes[tid] = railAllSize;

@@ -222,15 +223,15 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
        if (rail == nRails) rail = 0;
      }
      do {
-        int node = railAllBeg/sizePerRank;
+        int node = railAllBeg/countPerRank;
        int railAllOffset = 0;
        while (railAllOffset < railAllSize) {
-          ssize_t railOneBeg = node*sizePerRank;
-          ssize_t railOneEnd = railOneBeg + sizePerRank;
+          ssize_t railOneBeg = node*countPerRank;
+          ssize_t railOneEnd = railOneBeg + countPerRank;
          ssize_t railOneOffset = (railAllBeg+railAllOffset) - railOneBeg;
          int delta = min(railAllEnd, railOneEnd) - (railAllBeg+railAllOffset);
          int rank = ncclShmem.comm.collNetDenseToUserRank[node*nRails + rail];
-          ssize_t userOneBeg = rank*sizePerRank + railOneOffset;
+          ssize_t userOneBeg = rank*countPerRank + railOneOffset;
          if (nDsts != 0) {
            reduceCopy<ncclCollUnroll(), RedOp, T,
                     /*MultimemSrcs=*/0, 1+MinSrcs, 1+MaxSrcs,
@@ -239,7 +240,7 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
            (tid, tn, work->redOpArg, &work->redOpArg, false,
             /*nSrcs=*/1+nSrcs, [=]__device__(int s) {
               return s==0 ? (T*)inbuf + userOneBeg
-                           : work->regUsed && (recvDirectFlag & NCCL_DIRECT_READ)
+                           : work->regUsed && (recvDirectFlag & NCCL_P2P_READ)
                           ? (T*)srcPtrs[s-1] + userOneBeg
                           : (T*)srcPtrs[s-1] + railAllOffset;
             },
@@ -264,7 +265,8 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
    struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
    int const &nNodes = ncclShmem.comm.nNodes;
    ssize_t chunkSize = int(work->collnet.chunkCount);
-    ssize_t sizePerRank = work->collnet.count;
+    ssize_t countPerRank = work->collnet.count;
+    const int hasDn = (direct->down[0] >= 0) ? 1 : 0;

    if (direct->out == -1) __trap();
    bool isMultiRail = (direct->nHeads > 1);
@@ -281,15 +283,15 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
    int tn = nWarps1*WARP_SIZE;
    if (tid < tn) {
      // Phase 1: Scatter inputs to peers
-      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
+      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
        prims(tid, tn, nullptr, direct->heads+1, work->sendbuff, nullptr,
-              work->redOpArg, 0*Proto::MaxGroupWidth, 1, 1, work);
-      for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
+              work->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
+      for (ssize_t railGridOffset=0; railGridOffset < nNodes*countPerRank; railGridOffset += nChannels*chunkSize) {
        Scatterer</*ReduceSendNotRecv=*/true> scat;
        scat.work = work;
        scat.chunkSize = chunkSize;
        scat.railGridOffset = railGridOffset;
-        prims.template process</*Recv=*/0, /*Send=*/1>(scat, NCCL_DIRECT_READ, 0);
+        prims.template process</*Recv=*/0, /*Send=*/1>(scat, 0, 0);
      }
      return;
    }
@@ -297,23 +299,22 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC

    tn = nWarps2*WARP_SIZE;
    if (tid < tn) {
-      if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
+      if (work->netRegUsed && !hasDn) {
        if (tid == 0) {
-          int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
-          Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps);
+          Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, 1);
        }
        __syncwarp();
      } else {
        // Phase 2: Reduce from peers + local input -> send to network
-        Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/1, Proto, 0>
+        Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
          prims(tid, tn, direct->heads + 1, &direct->out, nullptr, nullptr,
-            work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1, work);
-        for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
+            work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
+        for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkSize) {
          Scatterer</*ReduceSendNotRecv=*/false> scat;
          scat.work = work;
          scat.chunkSize = chunkSize;
          scat.railGridOffset = railGridOffset;
-          prims.template process</*Recv=*/1, /*Send=*/1>(scat, 0, NCCL_DIRECT_READ);
+          prims.template process</*Recv=*/1, /*Send=*/1>(scat, 0, 0);
        }
      }
      return;
@@ -322,9 +323,9 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC

    tn = nWarps3*WARP_SIZE;
    if (tid < tn) {
-      if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
+      if (work->netRegUsed) {
        if (tid == 0) {
-          int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
+          int steps = hasDn ? (int)divUp(nNodes * countPerRank, nChannels * chunkSize) : 1;
          Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps);
        }
        __syncwarp();
@@ -333,11 +334,11 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
        Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
          prims(tid, tn, &direct->out, nullptr, nullptr, work->recvbuff,
            work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0);
-        for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
+        for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkSize) {
          ssize_t railAllBeg = railGridOffset + part * chunkSize;
-          ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * sizePerRank);
-          ssize_t railOneBeg = ncclShmem.comm.node * sizePerRank;
-          ssize_t railOneEnd = railOneBeg + sizePerRank;
+          ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * countPerRank);
+          ssize_t railOneBeg = ncclShmem.comm.node * countPerRank;
+          ssize_t railOneEnd = railOneBeg + countPerRank;
          ssize_t beg = max(railAllBeg, railOneBeg);
          ssize_t end = min(railAllEnd, railOneEnd);
          prims.recv(beg - railOneBeg, max(ssize_t(0), end - beg), /*postOp=*/true);
@@ -15,33 +15,35 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
  template<typename Proto>
  __device__ void runSend(int tid, int tn, int group, struct ncclDevWorkP2p* work) {
    size_t bytes = work->sendBytes;
-    int chunkSize = work->sendIpcReg && ncclShmem.comm.isNvlink ? (1 << 30) : u32fp8Decode(work->sendChunkSize_u32fp8);
+    bool useLargeChunk = (work->sendIpcReg && ncclShmem.comm.isAllNvlink) || work->sendNetReg; // True when sendIpcReg is set together with comm.isAllNvlink, or when sendNetReg is set
+    int chunkSize = useLargeChunk ? NCCL_MAX_NET_SIZE : u32fp8Decode(work->sendChunkSize_u32fp8); // Bytes handed to directSend per loop iteration
+    int stepSize = useLargeChunk ? NCCL_MAX_NET_SIZE : ncclShmem.comm.p2pChunkSize; // Passed to Primitives in place of the former fixed p2pChunkSize
    Primitives<T, RedOp, FanAsymmetric<0, 1>, 1, Proto, 1>
      prims(tid, tn, nullptr, &work->sendRank, work->sendAddr, nullptr,
-            /*redOpArg(ignored)=*/0, group, 1, 1, nullptr,
-            /*ipcReg=*/work->sendIpcReg, /*netReg=*/work->sendRegistered, ncclShmem.comm.p2pChunkSize);
+            /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, work, stepSize);
    size_t cursor = 0;
    do {
      int n = min(size_t(chunkSize), bytes-cursor);
      prims.directSend(cursor, cursor, n);
      cursor += n;
-    } while (cursor < bytes && work->sendRegistered == 0);
+    } while (cursor < bytes); // Loop until cursor reaches bytes; the body still runs once when bytes is 0
  }

  template<typename Proto>
  __device__ void runRecv(int tid, int tn, int group, struct ncclDevWorkP2p* work) {
    size_t bytes = work->recvBytes;
-    int chunkSize = work->recvIpcReg && ncclShmem.comm.isNvlink ? (1 << 30) : u32fp8Decode(work->recvChunkSize_u32fp8);
+    bool useLargeChunk = (work->recvIpcReg && ncclShmem.comm.isAllNvlink) || work->recvNetReg; // True when recvIpcReg is set together with comm.isAllNvlink, or when recvNetReg is set
+    int chunkSize = useLargeChunk ? NCCL_MAX_NET_SIZE : u32fp8Decode(work->recvChunkSize_u32fp8); // Bytes handed to directRecv per loop iteration
+    int stepSize = useLargeChunk ? NCCL_MAX_NET_SIZE : ncclShmem.comm.p2pChunkSize; // Passed to Primitives in place of the former fixed p2pChunkSize
    Primitives<T, RedOp, FanAsymmetric<1, 0>, 1, Proto, 1>
      prims(tid, tn, &work->recvRank, nullptr, nullptr, work->recvAddr,
-            /*redOpArg(ignored)=*/0, group, 1, 1, nullptr,
-            /*ipcReg=*/work->recvIpcReg, /*netReg=*/work->recvRegistered, ncclShmem.comm.p2pChunkSize);
+            /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, work, stepSize);
    size_t cursor = 0;
    do {
      int n = min(size_t(chunkSize), bytes-cursor);
      prims.directRecv(cursor, cursor, n);
      cursor += n;
-    } while (cursor < bytes && work->recvRegistered == 0);
+    } while (cursor < bytes); // Loop until cursor reaches bytes; the body still runs once when bytes is 0
  }

  __device__ __forceinline__ void run() {
@@ -248,11 +248,31 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
 NCCL_PARAM(IgnoreDisabledP2p, "IGNORE_DISABLED_P2P", 0);

 int ncclTopoUserP2pLevel = -1;
-ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank) {
+ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2,
+                              int* p2p, int *read, int* intermediateRank) {
+  int mnnvl = 0;
+  struct ncclPeerInfo* info1 = NULL;
+  struct ncclPeerInfo* info2 = NULL;
  *p2p = 0;
  if (read) *read = 0;
  if (intermediateRank) *intermediateRank = -1;

+  // Rule out different nodes / isolated containers
+  if (comm) {
+    info1 = comm->peerInfo+rank1;
+    info2 = comm->peerInfo+rank2;
+    if (info1->hostHash != info2->hostHash) {
+      if (comm->MNNVL) {
+        NCCLCHECK(ncclTopoCheckMNNVL(comm->topo, info1, info2, &mnnvl));
+        if (!mnnvl) return ncclSuccess;
+      } else {
+        return ncclSuccess;
+      }
+    } else if (info1->shmDev != info2->shmDev) {
+      return ncclSuccess;
+    }
+  }
+
  // Get GPUs from topology
  int g1, g2;
  NCCLCHECK(ncclTopoRankToIndex(system, rank1, &g1));
@@ -297,7 +317,8 @@ compare:
  if (*p2p == 1) {
    // NCCL_IGNORE_DISABLED_P2P=2 is used by unit tests that don't want to
    // validate against NVML at all since they are pretending to be on other hw.
-    if (g1 != g2 && ncclParamIgnoreDisabledP2p() != 2) {
+    if (g1 != g2 && (comm == NULL || (info1->hostHash == comm->peerInfo[comm->rank].hostHash &&
+                                      info1->hostHash == info2->hostHash)) && ncclParamIgnoreDisabledP2p() != 2) {
      int indexes[3] = {-1,-1,-1};
      int verticeN = 0;
      NCCLCHECK(ncclNvmlEnsureInitialized());
@@ -356,14 +377,14 @@ ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerIn
 NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
 int ncclTopoUserGdrLevel = -1;

-ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int64_t netId, int read, int* useGdr) {
+ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t netId, int read, int* useGdr) {
  *useGdr = 0;

  // Get GPU and NET
  int n, g;
  NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &n));
  struct ncclTopoNode* net = system->nodes[NET].nodes+n;
-  NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g));
+  NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
  struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;

  // Check that both the NIC and GPUs support it
@@ -404,12 +425,32 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int6
    distance = proxyGpu->paths[NET][n].type;
  }
  if (distance > netGdrLevel) {
-    INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %lx / HCA %lx (distance %d > %d)", busId, netId, distance, netGdrLevel);
+    INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %d / HCA %lx (distance %d > %d)", rank, netId, distance, netGdrLevel);
    return ncclSuccess;
  }

  *useGdr = 1;
-  INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %lx / HCA %lx (distance %d <= %d), read %d", busId, netId, distance, netGdrLevel, read);
+  INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %d / HCA %lx (distance %d <= %d), read %d", rank, netId, distance, netGdrLevel, read);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail) {
+  // Sets *avail to true as soon as ncclTopoCheckGdr reports useGdr for the GPU of
+  // `rank` and any NET node of `system`; otherwise *avail stays false.
+  const int nNets = system->nodes[NET].count;
+  *avail = false;
+  for (int n = 0; n < nNets; n++) {
+    int64_t netId = system->nodes[NET].nodes[n].id;
+    // Probe with read=1 first, then with read=0.
+    for (int read = 1; read >= 0; read--) {
+      int useGdr = 0;
+      NCCLCHECK(ncclTopoCheckGdr(system, rank, netId, read, &useGdr));
+      if (useGdr) {
+        *avail = true;
+        return ncclSuccess;
+      }
+    }
+  }
  return ncclSuccess;
 }

@@ -417,12 +458,17 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int6
 NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 0);

 // Determine whether we need to flush the GDR recv buffers
-ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush) {
+ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush) {
+  *flush = 1; // Default to flushing; only cleared at the end when cudaCompCap >= 90
+  ncclNetProperties_t props;
+  NCCLCHECK(comm->ncclNet->getProperties(netDev, &props));
+  if (props.forceFlush == 1 || ncclParamNetForceFlush()) return ncclSuccess; // Flush forced by the net device properties or by the NET_FORCE_FLUSH param
  int g;
-  NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g));
+  struct ncclTopoSystem* system = comm->topo;
+  NCCLCHECK(ncclTopoRankToIndex(system, rank, &g)); // The GPU node is looked up by rank
  struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
  // Flush is required on Ampere and earlier
-  *flush = gpu->gpu.cudaCompCap < 90 ? 1 : ncclParamNetForceFlush();
+  if (gpu->gpu.cudaCompCap >= 90) *flush = 0;
  return ncclSuccess;
 }

@@ -516,7 +562,7 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks,
    NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netId, NULL, &proxyRank));
    if (proxyRank == comm->rank) continue;
    int useGdr;
-    NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->busId, netId, 1, &useGdr));
+    NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->rank, netId, 1, &useGdr));
    if (useGdr == 0) continue;
    int found = 0;
    for (int r=0; r<nr; r++) {
@@ -562,7 +608,8 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
  for (int g=0; g<system->nodes[GPU].count; g++) {
    for (int p=0; p<system->nodes[GPU].count; p++) {
      int p2p;
-      NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].gpu.rank, system->nodes[GPU].nodes[g].gpu.rank, &p2p, NULL, NULL));
+      NCCLCHECK(ncclTopoCheckP2p(comm, system, system->nodes[GPU].nodes[p].gpu.rank,
+                                 system->nodes[GPU].nodes[g].gpu.rank, &p2p, NULL, NULL));
      if (p2p == 0) {
        // Divert all traffic through the CPU
        int cpu;
@@ -618,7 +665,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
      if (gpu->paths[NET][n].type < PATH_PHB) {
        // Update path when we dont want to / can't use GPU Direct RDMA.
        int gdr;
-        NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr));
+        NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].gpu.rank, netNode->id, 0, &gdr));
        if (gdr == 0) {
          // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
          int localCpu;
@@ -1142,7 +1142,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
      offset = strlen(line);
    }
    if (system->nodes[NET].count > 0) {
-      sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c+1]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c]));
+      sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c+1]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c+1]));
      offset = strlen(line);
    }
    INFO(NCCL_GRAPH, "%s", line);
@@ -296,7 +296,7 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
        NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset));
      } else {
        if (link->remNode->type == NET) {
-          sprintf(line+nextOffset, "%s/%lx-%lx (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id), link->remNode->net.asic, link->remNode->net.port, link->remNode->net.bw);
+          sprintf(line+nextOffset, "%s/%lx-%lx (%d/%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id), link->remNode->net.collSupport, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.bw);
        } else {
          sprintf(line+nextOffset, "%s/%lx-%lx", topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id));
        }
@@ -383,6 +383,7 @@ ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* s
    if (strcmp(xmlNet->name, "net") != 0) continue;
    int index;
    NCCLCHECK(xmlGetAttrIndex(xmlNet, "dev", &index));
+    // An index of -1 means the "dev" attribute wasn't set on this net xml node, so it should not be added to the system topology graph.
    if (index == -1) continue;
    NCCLCHECK(ncclTopoAddNet(xmlNet, system, nic, systemId));
  }
@@ -403,7 +404,7 @@ struct kvDict kvDictPciGen[] = {
  { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */
  { "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 },
  { NULL, 60 /* Default fallback */ } }; // x100 Mbps per lane
-ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent, int systemId) {
+ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent, int systemId, int numaId) {
  const char* str;

  int type;
@@ -430,9 +431,9 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
  if (xmlNic != NULL) {
    type = NIC;
    // Ignore sub device ID and merge multi-port NICs into one PCI device.
-    busId &= 0xfffffffffffffff0;
    struct ncclTopoNode* nicNode = NULL;
-    int64_t id = NCCL_TOPO_ID(systemId, busId);
+    int64_t localNicId = NCCL_TOPO_LOCAL_NIC_ID(numaId, busId);
+    int64_t id = NCCL_TOPO_ID(systemId, localNicId);
    NCCLCHECK(ncclTopoGetNode(system, &nicNode, type, id));
    if (nicNode == NULL) {
      NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, id));
@@ -453,7 +454,7 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
    for (int s=0; s<xmlPci->nSubs; s++) {
      struct ncclXmlNode* xmlSubPci = xmlPci->subs[s];
      if (strcmp(xmlSubPci->name, "pcilink") != 0) { // PCI links will be added later
-        NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node, systemId));
+        NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node, systemId, numaId));
      }
    }
  }
@@ -520,12 +521,14 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s
  }
  for (int s=0; s<xmlCpu->nSubs; s++) {
    struct ncclXmlNode* node = xmlCpu->subs[s];
-    if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu, systemId));
+    if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu, systemId, numaId));
    if (strcmp(node->name, "nic") == 0) {
      struct ncclTopoNode* nic = NULL;
-      NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, 0));
+      int64_t localNicId = NCCL_TOPO_LOCAL_NIC_ID(numaId, 0);
+      int64_t id = NCCL_TOPO_ID(systemId, localNicId);
+      NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, id));
      if (nic == NULL) {
-        NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, NCCL_TOPO_ID(systemId, 0)));
+        NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, id));
        NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_BW));
        NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_BW));
      }
@@ -725,14 +728,528 @@ ncclResult_t ncclTopoRefreshBcmP2pLinks(void) {
  return ncclSuccess;
 }

-ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
+// Returns 0 when `common` has no "busid" attribute or when a net->nic->pci chain found in
+// nodes[] does not hang directly off `common`; returns 1 otherwise (direct descent only).
+int ncclTopoCheckPix(ncclXmlNode* common, ncclXmlNode** nodes, int nNodes) {
+  const char* tempBusId = NULL;
+  // If the common parent isn't a pci switch, then this isn't PIX.
+  // No NCCLCHECK here: this function returns int, so an ncclResult_t error would read as "is PIX".
+  if (xmlGetAttr(common, "busid", &tempBusId) != ncclSuccess || tempBusId == NULL) return 0;
+  TRACE(NCCL_GRAPH, "Checking pix for busid=%s", tempBusId);
+
+  // All the nodes must have a "nic" which is a parent, and then a pci node (busid) which must be a child of the "common"
+  for (int i = 0; i < nNodes; i++) {
+    ncclXmlNode* node = nodes[i];
+    if (strcmp(node->name, "net") == 0) {
+      node = node->parent;
+      if (node == NULL) return 0;
+      if (strcmp(node->name, "nic") == 0) {
+        node = node->parent;
+        if (node == NULL) return 0;
+        // All nodes must descend from the same first level pci switch
+        if (strcmp(node->name, "pci") == 0) {
+          TRACE(NCCL_GRAPH, "Comparing parent of node=%p to common=%p", node->parent, common);
+          if (node->parent != common) return 0;
+        }
+      }
+    }
+  }
+  return 1;
+}
+
+#define NCCL_TOPO_XML_DEPTH_MAX 256
+// Fixed-capacity LIFO of xml node pointers. It has no constructor, so `tail`
+// must be zeroed by whoever allocates it before first use.
+typedef struct xmlNodeStack {
+  ncclXmlNode* elems[NCCL_TOPO_XML_DEPTH_MAX];
+  int tail;  // Number of stored entries; elems[tail-1] is the most recent one
+
+  // True when no entries are stored.
+  bool empty() {
+    return tail == 0;
+  }
+
+  // Most recently pushed node, or NULL when the stack is empty.
+  ncclXmlNode* top() {
+    return empty() ? NULL : elems[tail - 1];
+  }
+
+  // Removes and returns the top node. Nothing is removed when top() is NULL.
+  ncclXmlNode* pop() {
+    ncclXmlNode* head = top();
+    if (head != NULL) tail--;
+    return head;
+  }
+
+  // Appends node; the push is silently dropped once the stack is full.
+  void push(ncclXmlNode* node) {
+    if (tail >= NCCL_TOPO_XML_DEPTH_MAX) return;
+    elems[tail] = node;
+    tail++;
+  }
+
+} xmlNodeStack;
+
+// Finds the deepest xml ancestor shared by all nodes[] and classifies how they relate.
+// Outputs *path (a PATH_* value) and *parent (that ancestor, or NULL); they are only
+// written when ncclSuccess is returned.
+ncclResult_t ncclTopoGetPath(ncclXmlNode** nodes, int nNodes, int* path, ncclXmlNode** parent) {
+  // Track a stack of parents per-net node being merged
+  xmlNodeStack* parents;
+  NCCLCHECK(ncclCalloc(&parents, nNodes));
+  ncclResult_t res = ncclSuccess;  // Declared ahead of the first "goto out" so every exit path can return it
+  ncclXmlNode* common = NULL;
+  if (nNodes == 1) {
+    common = nodes[0];
+    *path = PATH_LOC;
+    goto out;
+  }
+
+  // Record each node's ancestry, leaf first, stopping at "system" or at a parentless node
+  for (int i = 0; i < nNodes; i++) {
+    ncclXmlNode* temp = nodes[i];
+    while (temp) {
+      parents[i].push(temp);
+      temp = strcmp(temp->name, "system") == 0 ? NULL : temp->parent;
+    }
+  }
+
+  // Pop from all stacks in lockstep for as long as they agree on the top node
+  int c;  // No initializer here because "goto out" above jumps over this declaration
+  c = 1;
+  while (c && !parents[0].empty()) {
+    ncclXmlNode* temp = parents[0].top();
+    for (int i = 1; i < nNodes; i++) {
+      if (!parents[i].empty()) {
+        c &= (temp == parents[i].top());
+      } else {
+        c = 0;
+        break;
+      }
+    }
+    if (c) {
+      common = temp;
+      if (common == NULL) TRACE(NCCL_GRAPH, "COMMON IS NULL");
+      for (int i = 0; i < nNodes; i++) {
+        parents[i].pop();
+      }
+    // Check multi-port while we still have the mismatched parents
+    // For multi-port to be true, all parents (peers) must have the busId attribute with all but the last character matching
+    } else {
+      int multiPort = 1;
+      const char* tempBusId;
+      // NCCLCHECKGOTO rather than NCCLCHECK: an early return here would leak "parents"
+      NCCLCHECKGOTO(xmlGetAttr(temp, "busid", &tempBusId), res, out);
+      if (tempBusId) {
+        for (int i = 1; i < nNodes; i++) {
+          if (!parents[i].empty()) {
+            const char* busId;
+            NCCLCHECKGOTO(xmlGetAttr(parents[i].top(), "busid", &busId), res, out);
+            if (busId) {
+              if (strlen(busId) != strlen(tempBusId)) {
+                multiPort = 0;
+                break;
+              }
+              if (strncmp(busId, tempBusId, strlen(busId)-1) != 0) {
+                multiPort = 0;
+                break;
+              }
+            } else {
+              multiPort = 0;
+              break;
+            }
+          }
+        }
+      } else {
+        multiPort = 0;
+      }
+
+      if (multiPort) {
+        *path = PATH_PORT;
+        goto out;
+      }
+    }
+  }
+
+  if (common == NULL) {
+    *path = PATH_DIS;
+  } else if (strcmp(common->name,"system") == 0) {
+    *path = PATH_SYS;
+  } else if (strcmp(common->name, "cpu") == 0) {
+    *path = PATH_PHB;
+  } else if (strcmp(common->name, "nic") == 0) {
+    *path = PATH_PORT;
+  } else if (strcmp(common->name, "net") == 0) {
+    *path = PATH_PORT;
+  } else if (ncclTopoCheckPix(common, nodes, nNodes)) {
+    *path = PATH_PIX;
+  } else {
+    *path = PATH_PXB;
+  }
+
+out:
+  if (res == ncclSuccess) *parent = common;
+  free(parents);
+  return res;
+}
+
+// Adds a "pci" child to `parent` with a "busid" no other pci node in `xml` uses (NCCL expects
+// leaf busid to be unique). Tries busId+1, busId+2, ... up to 100 times, rewriting `busId` in place.
+ncclResult_t ncclTopoMakeUniqueBusId(struct ncclXml* xml, char* busId, struct ncclXmlNode** pciNode, struct ncclXmlNode* parent) {
+  int64_t candidate;
+  NCCLCHECK(busIdToInt64(busId, &candidate));
+  int attempts;
+  for (attempts = 0; attempts < 100; attempts++) {
+    candidate++;
+    TRACE(NCCL_GRAPH, "Trying to make new busId %lx", candidate);
+    int64ToBusId(candidate, busId);
+    struct ncclXmlNode* existing = NULL;
+    NCCLCHECK(xmlFindTagKv(xml, "pci", &existing, "busid", busId));
+    if (existing != NULL) {
+      TRACE(NCCL_GRAPH, "Conflicting busId %lx", candidate);
+      continue;
+    }
+    NCCLCHECK(xmlAddNode(xml, parent, "pci", pciNode));
+    NCCLCHECK(xmlSetAttr(*pciNode, "busid", busId));
+    TRACE(NCCL_GRAPH, "Made new busId %lx", candidate);
+    return ncclSuccess;
+  }
+  WARN("TOPO/NET : Couldn't generate unique busId after %d tries", attempts);
+  return ncclInternalError;
+}
+
+// Creates, under *parent, a copy of the "pci" node that is the grandparent of physNetNode,
+// gives it a fresh unique "busid", and repoints *parent to it. Returns ncclInternalError
+// when physNetNode has no such "pci" grandparent.
+ncclResult_t ncclTopoMakePciParent(struct ncclXml* xml, struct ncclXmlNode** parent, struct ncclXmlNode* physNetNode) {
+  struct ncclXmlNode* created = NULL;
+  struct ncclXmlNode* netParent = physNetNode->parent;
+  struct ncclXmlNode* srcPci = netParent ? netParent->parent : NULL;
+
+  if (srcPci != NULL && strcmp(srcPci->name, "pci") == 0) {
+    const char* originalBusId;
+    char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+    memset(busId, 0, sizeof(busId));
+    // Seed busId with the current NIC 0's busId to make discovering a unique hash quicker
+    NCCLCHECK(xmlGetAttrStr(srcPci, "busid", &originalBusId));
+    snprintf(busId, sizeof(busId), "%s", originalBusId);
+    NCCLCHECK(ncclTopoMakeUniqueBusId(xml, busId, &created, *parent));
+    // Copy every attribute of the source pci node, then put back the unique busid
+    for (int a = 0; a < srcPci->nAttrs; a++) {
+      NCCLCHECK(xmlSetAttr(created, srcPci->attrs[a].key, srcPci->attrs[a].value));
+    }
+    NCCLCHECK(xmlSetAttr(created, "busid", busId));
+    *parent = created;
+  }
+
+  if (created != NULL) return ncclSuccess;
+
+  // Nothing was created: report the net node by name and fail
+  const char* name;
+  NCCLCHECK(xmlGetAttr(physNetNode, "name", &name));
+  WARN("TOPO/NET : Can't find busId of child 0 %s", name);
+  return ncclInternalError;
+}
+
+// Asks makeVDevice to merge the devices described by vProps into one virtual NIC.
+// comm, xml, physNetNodes and netNode are not used here; *netNode is left untouched.
+ncclResult_t ncclTopoMakeVnic(ncclComm_t comm, struct ncclXml* xml, ncclNetVDeviceProps_t* vProps,
+struct ncclXmlNode** physNetNodes, struct ncclXmlNode** netNode, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
+  if (vProps->ndevs > NCCL_NET_MAX_DEVS_PER_NIC) {
+    WARN("TOPO/NET : Tried to merge too many NICs. %d > %d", vProps->ndevs, NCCL_NET_MAX_DEVS_PER_NIC);
+    return ncclInternalError;
+  }
+  int vDevIndex = 0;
+  ncclResult_t ret = makeVDevice(&vDevIndex, vProps);  // Trigger the merge
+  if (ret == ncclInvalidUsage) {
+    WARN("TOPO/NET : Tried merging multiple devices together and failed. Try setting NCCL_NET_MERGE_LEVEL=LOC");
+  }
+  // Propagate every failure, not only ncclInvalidUsage, instead of reporting a vNic that was never made
+  NCCLCHECK(ret);
+  INFO(NCCL_GRAPH, "TOPO/NET : Made vNic %d", vDevIndex);
+  return ncclSuccess;
+}
+
+// Fuses NICs as requested through NCCL_NET_FORCE_MERGE. `str` is a semicolon-delimited list of
+// comma-delimited NIC groups and is tokenized in place by strtok_r (so it gets modified).
+// Every device that ends up in a successfully created vNic gets placedDevs[dev] = 1.
+ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
+  INFO(NCCL_ENV|NCCL_NET, "TOPO/NET : Force-fusing NICs using NCCL_NET_FORCE_MERGE=%s", str);
+  char* semi_token;
+  // The for-increment advances to the next group on every path, so "continue" below cannot spin forever
+  for (char* semi = strtok_r(str, ";", &semi_token); semi; semi = strtok_r(NULL, ";", &semi_token)) {
+    TRACE(NCCL_NET, "Fusing %s", semi);
+    struct netIf userIfs[NCCL_NET_MAX_DEVS_PER_NIC];
+    int nUserIfs = parseStringList(semi, userIfs, NCCL_NET_MAX_DEVS_PER_NIC);
+    if (nUserIfs == 0) {
+      INFO(NCCL_NET, "NET/IB : Invalid NCCL_NET_FORCE_MERGE specified %s. Couldn't parse substring %s. Please provide a semicolon-delimited list of comma-delimited NIC groups.",
+        str, semi);
+      continue;
+    }
+
+    ncclNetVDeviceProps_t vProps = {0};
+    for (int d = 0; d < nPhysDevs; d++) {
+      if (!matchIfList(propsList[d].name, propsList[d].port, userIfs, nUserIfs, 1)) continue;
+      // Check the bound before storing: devs is assumed to hold NCCL_NET_MAX_DEVS_PER_NIC entries
+      if (vProps.ndevs >= NCCL_NET_MAX_DEVS_PER_NIC) {
+        WARN("Specified fused NIC %s which has too many devices (%d). Max %d", semi, vProps.ndevs+1, NCCL_NET_MAX_DEVS_PER_NIC);
+        return ncclInvalidUsage;
+      }
+      vProps.devs[vProps.ndevs++] = d;
+    }
+
+    if (vProps.ndevs != nUserIfs) {
+      WARN("TOPO/NET : Only matched %d devices, %d requested from %s",
+        vProps.ndevs, nUserIfs, semi);
+      return ncclInvalidUsage;
+    }
+
+    struct ncclXmlNode* netNode = NULL;
+    NCCLCHECK(ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice));
+
+    // Only set that a device is "placed" after successfully making a vNic (it's possible to exit before this)
+    for (int i = 0; i < vProps.ndevs; i++) {
+      placedDevs[vProps.devs[i]] = 1;
+    }
+  }
+
+  return ncclSuccess;
+}
+
+// Group every physical device not yet marked in placedDevs into virtual NICs: each unplaced device
+// roots a new vNIC and absorbs the unplaced devices whose path type to it is at most mergeLevel.
+ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLevel, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
+  // Compute the path type between each device
+  int* paths = NULL;
+  ncclResult_t res = ncclSuccess;
+  NCCLCHECK(ncclCalloc(&paths, nPhysDevs*nPhysDevs));  // nothing to release yet, a plain return is safe
+  TRACE(NCCL_GRAPH, "Allocated %d paths", nPhysDevs*nPhysDevs);
+  for (int i = 0; i < nPhysDevs; i++) {
+    for (int j = 0; j < nPhysDevs; j++) {
+      struct ncclXmlNode* nodes[2];
+      nodes[0] = physNetNodes[i];
+      nodes[1] = physNetNodes[j];
+      struct ncclXmlNode* parent;
+      NCCLCHECKGOTO(ncclTopoGetPath(nodes, 2, &paths[i*nPhysDevs + j], &parent), res, out);
+    }
+  }
+
+  // Place all remaining physical devices into a virtual device given the mergeLevel criteria
+  for (int i = 0; i < nPhysDevs; i++) {
+    // Select the first unplaced device "i" as the root
+    if (placedDevs[i] != 0) continue;
+    // Init a new vDevice
+    ncclNetVDeviceProps_t vProps;
+    vProps = {0};
+    vProps.devs[vProps.ndevs++] = i;
+    placedDevs[i] = 1;
+    TRACE(NCCL_GRAPH, "Placed dev %d", i);
+
+    // Select each unplaced device "j" which is at most "mergeLevel" distance from "i", but not equal to "i"
+    // (Don't merge the same device with itself)
+    for (int j = 0; j < nPhysDevs; j++) {
+      if (paths[i*nPhysDevs + j] <= mergeLevel && placedDevs[j] == 0 && j != i) {
+        vProps.devs[vProps.ndevs++] = j;
+        placedDevs[j] = 1;
+        TRACE(NCCL_GRAPH, "Placed dev %d path=%d", j, paths[i*nPhysDevs + j] );
+      }
+      if (vProps.ndevs == NCCL_NET_MAX_DEVS_PER_NIC) break;
+    }
+    if (vProps.ndevs > NCCL_NET_MAX_DEVS_PER_NIC) {
+      WARN("TOPO/NET : Tried to merge too many NICs. %d > %d", vProps.ndevs, NCCL_NET_MAX_DEVS_PER_NIC);
+      res = ncclInternalError;
+      goto out;  // leave through the cleanup label so that paths is freed
+    }
+
+    struct ncclXmlNode* netNode;
+    NCCLCHECKGOTO(ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice), res, out);
+  }
+
+out:
+  free(paths);
+  return res;
+}
+
+struct kvDict nicPathKvList[] = {  // Names accepted by NCCL_NET_MERGE_LEVEL and the PATH_* level each one selects
+  { "LOC",  PATH_LOC },
+  { "PORT", PATH_PORT },
+  { "PIX",  PATH_PIX },
+  { "PXB",  PATH_PXB },
+  { "PXN",  PATH_PXN },
+  { "PHB",  PATH_PHB },
+  { "SYS",  PATH_SYS },
+  { NULL, 0 }  // NULL name closes the table (presumably the terminator kvConvertToInt looks for - confirm)
+};
+
+// Find the XML "net" node of each physical device listed in vProps and return their common parent in
+// *parent: NULL when ncclTopoGetPath reports PATH_LOC; a parent named "pci" is reparented first.
+ncclResult_t ncclTopoGetVNicParent(struct ncclXml* xml, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclNetVDeviceProps_t* vProps, ncclXmlNode** parent) {
+  ncclNetProperties_t props[NCCL_NET_MAX_DEVS_PER_NIC];  // NOTE(review): assumes vProps->ndevs fits; not checked here
+  ncclXmlNode* physNetNodes[NCCL_NET_MAX_DEVS_PER_NIC];
+  for (int i = 0; i < vProps->ndevs; i++) {
+    NCCLCHECK(getProperties(vProps->devs[i], props + i));
+    NCCLCHECK(xmlFindTagKv(xml, "net", physNetNodes + i, "name", props[i].name));
+    TRACE(NCCL_GRAPH, "Re-found physical ncclNet node %d %s", i,  props[i].name);
+  }
+  int path = PATH_LOC;
+  NCCLCHECK(ncclTopoGetPath(physNetNodes, vProps->ndevs, &path, parent));
+  if (path == PATH_LOC) {
+    *parent = NULL;
+  } else if (*parent && strcmp((*parent)->name, "pci") == 0) {
+    // If the common parent is PCI, we must reparent the new NIC under a made up busId
+    NCCLCHECK(ncclTopoMakePciParent(xml, parent, physNetNodes[0]));
+  }
+  // *parent is NULL on the PATH_LOC branch above, so it cannot be dereferenced unconditionally
+  TRACE(NCCL_GRAPH, "Selected parent %s with path %d", *parent ? (*parent)->name : "(none)", path);
+  return ncclSuccess;
+}
+
+// Create the virtual NICs of one plugin: the groups listed in NCCL_NET_FORCE_MERGE first, then an
+// automatic merge of the remaining devices at NCCL_NET_MERGE_LEVEL (PATH_PORT when unset).
+ncclResult_t ncclTopoMakeVNics(ncclComm_t comm, struct ncclXml* xml, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*getProperties)(int, ncclNetProperties_t*), int physicalDevs) {
+  if (physicalDevs == 0) return ncclSuccess;
+  // Every local is initialized here so that no "goto out" below jumps over an initialization.
+  ncclResult_t res = ncclSuccess;
+  int* placedDevs = NULL;
+  struct ncclXmlNode** physNetNodes = NULL;
+  ncclNetProperties_t* props = NULL;
+  int mergeLevel = PATH_PORT;  // By default, don't merge any devices
+  char* mergeLevelEnv = getenv("NCCL_NET_MERGE_LEVEL");
+  char* forceMergeEnv = getenv("NCCL_NET_FORCE_MERGE");
+  char* forceMerge = NULL;
+
+  NCCLCHECKGOTO(ncclCalloc(&physNetNodes, physicalDevs), res, out);
+  NCCLCHECKGOTO(ncclCalloc(&props, physicalDevs), res, out);
+  NCCLCHECKGOTO(ncclCalloc(&placedDevs, physicalDevs), res, out);
+  memset(placedDevs, 0, sizeof(int)*physicalDevs);
+  for (int i = 0; i < physicalDevs; i++) {
+    NCCLCHECKGOTO(getProperties(i, props + i), res, out);
+    NCCLCHECKGOTO(xmlFindTagKv(xml, "net", physNetNodes + i, "name", props[i].name), res, out);
+    TRACE(NCCL_GRAPH, "Found physical ncclNet node %d %s", i,  props[i].name);
+  }
+
+  if (mergeLevelEnv) kvConvertToInt(mergeLevelEnv, &mergeLevel, nicPathKvList);
+  if (forceMergeEnv) {
+    // ncclTopoForceMerge runs strtok_r over its input: hand it a private copy, never the environment string.
+    forceMerge = strdup(forceMergeEnv);
+    if (forceMerge == NULL) { res = ncclSystemError; goto out; }
+    NCCLCHECKGOTO(ncclTopoForceMerge(comm, xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out);
+  }
+  NCCLCHECKGOTO(ncclTopoAutoMerge(comm, xml, mergeLevel, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out);
+
+out:
+  free(forceMerge);
+  free(physNetNodes);
+  free(props);
+  free(placedDevs);
+  return res;
+}
+
+// Create or refresh the XML "net" node of each device in [startIndex, endIndex) from the properties
+// returned by getProperties. keep and coll are written as attributes; virtualNics enables the vNIC parent lookup.
+static ncclResult_t ncclTopoPopulateNics(ncclComm_t comm, ncclXml* xml, int startIndex, int endIndex, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), const char* netName, int coll, int keep, int virtualNics) {
+  for (int n = startIndex; n < endIndex; n++) {
+    ncclNetProperties_t props;
+    NCCLCHECK(getProperties(n, &props));
+    struct ncclXmlNode* netNode = NULL;
+    struct ncclXmlNode* parent = NULL;
+    if (virtualNics) {
+      struct ncclXmlNode* net = NULL;
+      NCCLCHECK(xmlFindTagKv(xml, "net", &net, "name", props.name));
+      // In the event of multithreaded use case, we need to re-discover the shared parent of the given devices for this vNIC
+      // Only run this if the net doesn't exist locally - this may alter the XML state
+      if (net == NULL) NCCLCHECK(ncclTopoGetVNicParent(xml, getProperties, &props.vProps, &parent));
+    }
+    NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode, parent));
+
+    const char* colAttr;
+    NCCLCHECK(xmlGetAttr(netNode, "coll", &colAttr));
+    // If coll == 0 but the netNode is tagged as coll, don't update the keep value
+    if (colAttr == NULL || coll != 0 || strcmp(colAttr,"1") != 0) NCCLCHECK(xmlSetAttrInt(netNode, "keep", keep));
+    NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
+    // latency goes through the float helper: the Int helper would drop its fractional part
+    NCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency));
+    NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
+    NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
+    NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
+    NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
+    bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
+    INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", netName, gdrSupport ? "Enabled" : "Disabled", n, props.name);
+    NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
+    // Only set coll if it's not 0
+    if (coll) NCCLCHECK(xmlInitAttrInt(netNode, "coll", coll));
+    const char* keepAttr;
+    NCCLCHECK(xmlGetAttr(netNode, "coll", &colAttr));
+    NCCLCHECK(xmlGetAttr(netNode, "keep", &keepAttr));
+    INFO(NCCL_GRAPH, "ncclTopoPopulateNics : Filled %s in topo with pciPath=%s keep=%s coll=%s",
+      props.name, props.pciPath, keepAttr ? keepAttr : "(null)", colAttr ? colAttr : "(null)");
+  }
+
+  return ncclSuccess;
+}
+
+struct ncclTopoNetState {  // Per-plugin NIC counts, kept in the global state tables and reused by later ncclTopoProcessNet calls
+  int nVirtualNics;   // -1 until computed; then the plugin's device count after ncclTopoMakeVNics minus nPhysicalNics
+  int nPhysicalNics;  // -1 until first read from the plugin's devices() callback
+  const char* name;   // strdup'ed plugin name; NULL marks an unused table entry
+};
+
+// Import one plugin's NICs into xml: physical devices always, virtual ones unless a dump file was
+// requested or the plugin has no makeVDevice. Calls plugin APIs: run inside the per-process lock.
+static ncclResult_t ncclTopoProcessNet(ncclComm_t comm, ncclXml* xml, int coll, const char* dumpXmlFile, ncclTopoNetState* state, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*devices)(int*), const char* netName) {
+  const bool physicalOnly = (dumpXmlFile != NULL) || (makeVDevice == NULL);
+  if (state->nPhysicalNics == -1) NCCLCHECK(devices(&state->nPhysicalNics));
+  // Physical devices come first, populated with keep=1
+  NCCLCHECK(ncclTopoPopulateNics(comm, xml, 0, state->nPhysicalNics, getProperties, netName, coll, 1, 0));
+  if (physicalOnly) return ncclSuccess;
+
+  if (state->nVirtualNics == -1) {
+    // Not computed yet: build the vNICs, then take the growth of the device count as their number
+    NCCLCHECK(ncclTopoMakeVNics(comm, xml, makeVDevice, getProperties, state->nPhysicalNics));
+    int total;
+    NCCLCHECK(devices(&total));
+    state->nVirtualNics = total - state->nPhysicalNics;
+  }
+  if (state->nVirtualNics <= 0) return ncclSuccess;
+  const int firstVirtual = state->nPhysicalNics;
+  // Populate the physical range again with keep=0, then the new virtual range with keep=1
+  NCCLCHECK(ncclTopoPopulateNics(comm, xml, 0, firstVirtual, getProperties, netName, coll, 0, 0));
+  NCCLCHECK(ncclTopoPopulateNics(comm, xml, firstVirtual, firstVirtual+state->nVirtualNics, getProperties, netName, coll, 1, 1));
+  return ncclSuccess;
+}
+
+static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER;  // taken by ncclTopoGetSystem around the state lookups and plugin imports
+ncclTopoNetState netStates[NCCL_NET_MAX_PLUGINS] = {};  // state table searched with comm->ncclNet->name
+ncclTopoNetState collNetStates[NCCL_NET_MAX_PLUGINS] = {};  // state table searched with comm->ncclCollNet->name
+// Return in *state the entry of states[] named "name". The scan stops at the first unused entry (NULL
+// name) and claims it, with both NIC counts set to -1. Fails when every entry belongs to another name.
+ncclResult_t ncclTopoGetSharedState(ncclTopoNetState** state, const char* name, ncclTopoNetState* states) {
+  INFO(NCCL_GRAPH, "Retrieving state for %s", name);
+  int slot = 0;
+  while (slot < NCCL_NET_MAX_PLUGINS && states[slot].name != NULL && strcmp(states[slot].name, name) != 0) slot++;
+  if (slot == NCCL_NET_MAX_PLUGINS) {
+    WARN("NET/TOPO : Couldn't find net with name %s", name);
+    return ncclInternalError;
+  }
+  ncclTopoNetState* entry = states + slot;
+  if (entry->name == NULL) {
+    entry->nVirtualNics = -1;
+    entry->nPhysicalNics = -1;
+    entry->name = strdup(name);
+    INFO(NCCL_GRAPH, "Initialized state %d for %s", slot, name);
+  }
+  *state = entry;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system, const char* dumpXmlFile) {
  ncclResult_t ret = ncclSuccess;
  struct ncclXml* xml;
  char* mem = NULL;
  int* localRanks = NULL;
-  int netDevCount = 0;
  struct ncclXml* rankXml;
  int localRank = -1, nLocalRanks = 0;
+  int netLockHeld = 0;
  NCCLCHECK(xmlAlloc(&xml, NCCL_TOPO_XML_MAX_NODES));
  const char* xmlTopoFile = ncclGetEnv("NCCL_TOPO_FILE");
  if (xmlTopoFile) {
@@ -761,47 +1278,24 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
    NCCLCHECKGOTO(xmlSetAttrInt(node, "rank", comm->rank), ret, fail);
    NCCLCHECKGOTO(xmlInitAttrInt(node, "gdr", comm->peerInfo[comm->rank].gdrSupport), ret, fail);
  }
+
  // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
  // so we start with collnet so that it has precedence.
+  pthread_mutex_lock(&netLock);
+  netLockHeld = 1;
+  INFO(NCCL_GRAPH, "TOPO/NET : Importing network plugins to topology");
+  ncclTopoNetState* state;
+  state = NULL;
  if (collNetSupport(comm)) {
-    NCCLCHECKGOTO(collNetDevices(comm, &netDevCount), ret, fail);
-    for (int n=0; n<netDevCount; n++) {
-      ncclNetProperties_t props;
-      NCCLCHECKGOTO(collNetGetProperties(comm, n, &props), ret, fail);
-      struct ncclXmlNode* netNode;
-      NCCLCHECKGOTO(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode), ret, fail);
-      NCCLCHECKGOTO(xmlSetAttrInt(netNode, "keep", 1), ret, fail);
-      NCCLCHECKGOTO(xmlSetAttrInt(netNode, "dev", n), ret, fail);
-      NCCLCHECKGOTO(xmlInitAttrInt(netNode, "speed", props.speed), ret, fail);
-      NCCLCHECKGOTO(xmlInitAttrInt(netNode, "port", props.port), ret, fail);
-      NCCLCHECKGOTO(xmlInitAttrUint64(netNode, "guid", props.guid), ret, fail);
-      NCCLCHECKGOTO(xmlInitAttrInt(netNode, "maxconn", props.maxComms), ret, fail);
-      bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
-      INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
-      NCCLCHECKGOTO(xmlInitAttrInt(netNode, "gdr", gdrSupport), ret, fail);
-      NCCLCHECKGOTO(xmlInitAttrInt(netNode, "coll", 1), ret, fail);
-    }
-  }
-  if (netDevCount == 0) {
-    NCCLCHECKGOTO(comm->ncclNet->devices(&netDevCount), ret, fail);
-  }
-  for (int n=0; n<netDevCount; n++) {
-    ncclNetProperties_t props;
-    NCCLCHECKGOTO(comm->ncclNet->getProperties(n, &props), ret, fail);
-    comm->netDeviceType = props.netDeviceType;
-    struct ncclXmlNode* netNode;
-    NCCLCHECKGOTO(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode), ret, fail);
-    NCCLCHECKGOTO(xmlSetAttrInt(netNode, "keep", 1), ret, fail);
-    NCCLCHECKGOTO(xmlSetAttrInt(netNode, "dev", n), ret, fail);
-    NCCLCHECKGOTO(xmlInitAttrInt(netNode, "speed", props.speed), ret, fail);
-    NCCLCHECKGOTO(xmlInitAttrInt(netNode, "port", props.port), ret, fail);
-    NCCLCHECKGOTO(xmlInitAttrFloat(netNode, "latency", props.latency), ret, fail);
-    NCCLCHECKGOTO(xmlInitAttrUint64(netNode, "guid", props.guid), ret, fail);
-    NCCLCHECKGOTO(xmlInitAttrInt(netNode, "maxconn", props.maxComms), ret, fail);
-    bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
-    INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
-    NCCLCHECKGOTO(xmlInitAttrInt(netNode, "gdr", gdrSupport), ret, fail);
+    NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclCollNet->name, collNetStates), ret, fail);
+    NCCLCHECKGOTO(ncclTopoProcessNet(comm, xml, 1, dumpXmlFile, state,
+      comm->ncclCollNet->getProperties, comm->ncclCollNet->makeVDevice, comm->ncclCollNet->devices, comm->ncclCollNet->name), ret, fail);
  }
+  NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclNet->name, netStates), ret, fail);
+  NCCLCHECKGOTO(ncclTopoProcessNet(comm, xml, 0, dumpXmlFile, state,
+    comm->ncclNet->getProperties, comm->ncclNet->makeVDevice, comm->ncclNet->devices, comm->ncclNet->name), ret, fail);
+  pthread_mutex_unlock(&netLock);
+  netLockHeld = 0;

  // Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
  NCCLCHECKGOTO(ncclTopoTrimXml(xml), ret, fail);
@@ -845,19 +1339,21 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
    NCCLCHECKGOTO(ncclTopoFuseXml(xml, peerXml), ret, fail);
  }

-  xmlTopoFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
-  if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
-    INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
-    NCCLCHECKGOTO(ncclTopoDumpXmlToFile(xmlTopoFile, xml), ret, fail);
+  if (dumpXmlFile && comm->rank == ncclParamTopoDumpFileRank()) {
+    INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", dumpXmlFile);
+    NCCLCHECKGOTO(ncclTopoDumpXmlToFile(dumpXmlFile, xml), ret, fail);
  }

-  NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash), ret, fail);
+  // Only update our topo tracking structure if we aren't dumping (separate steps)
+  if (dumpXmlFile == NULL) NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash), ret, fail);
+
 exit:
  if (!comm->MNNVL && localRanks) free(localRanks);
  if (mem) free(mem);
  free(xml);
  return ret;
 fail:
+  if (netLockHeld) pthread_mutex_unlock(&netLock);
  goto exit;
 }

@@ -78,6 +78,9 @@ extern const char* topoLinkTypeStr[];
 // Connection through the network
 #define PATH_NET 8

+// New type of path which should precede PATH_PIX
+#define PATH_PORT PATH_NVL
+
 // Disconnected
 #define PATH_DIS 9
 extern const char* topoPathTypeStr[];
@@ -106,6 +109,7 @@ struct ncclTopoLinkList {
 #define NCCL_TOPO_ID_LOCAL_ID_MASK 0x00ffffffffffffff
 #define NCCL_TOPO_ID_SYSTEM_ID(id) (id >> 56)
 #define NCCL_TOPO_ID_LOCAL_ID(id) (id & NCCL_TOPO_ID_LOCAL_ID_MASK)
+#define NCCL_TOPO_LOCAL_NIC_ID(numaid, busid) (((int64_t)numaid << 56) + busid)
 #define NCCL_TOPO_ID(systemid, localid) (((int64_t)systemid << 56) + (localid & NCCL_TOPO_ID_LOCAL_ID_MASK))

 struct ncclTopoNode {
@@ -31,23 +31,87 @@ static int getNthreads(const char* name, int env, int min, int max, int def) {
  return nt;
 }

-ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* list) {
-  int def, set;
-  if (str[0] == '^') {
-    def = 1; set = 0; str++;
-  } else {
-    def = 0; set = 1;
+// Parse a map of prefixes to a list of elements. The first prefix is
+// optional and, if not present, the list of elements will be applied
+// to all prefixes. Only the first list of elements can lack a
+// prefix. Prefixes (if present) are followed by a colon. Lists of
+// elements are comma delimited. Mappings of prefix to the lists of
+// elements are semi-colon delimited. A list starting with '^' enables
+// every element except the listed ones.
+//
+// list is a flat [nprefixes][nelems] array: each selected row is reset, then
+// filled in. Returns ncclInvalidUsage on an unknown prefix/element or a
+// malformed entry, ncclSystemError if a working copy cannot be allocated.
+//
+//     NCCL_ALGO="ring,collnetdirect;allreduce:tree,collnetdirect;broadcast:ring"
+// Enable ring and collnetdirect for all functions, then select tree
+// and collnetdirect for allreduce and ring for broadcast.
+//     NCCL_PROTO="LL,Simple;allreduce:^LL"
+// Enable LL and Simple for all functions, but everything except LL for allreduce.
+//     NCCL_PROTO="^LL128;allreduce:LL128"
+// Enable everything but LL128, but only LL128 for allreduce.
+ncclResult_t parseList(const char* str, const char* prefixElems[], int nprefixes, const char* elems[], int nelems, int* list) {
+  ncclResult_t ret = ncclSuccess;
+  char* fullStr = strdup(str);
+  if (fullStr == NULL) return ncclSystemError;
+  char* tmpFullStr;
+  char* fullToken = strtok_r(fullStr, ";", &tmpFullStr);
+  while (fullToken) {
+    char* subToken = strdup(fullToken);
+    if (subToken == NULL) { free(fullStr); return ncclSystemError; }
+    char* tmpSubStr;
+    char* prefix = strtok_r(subToken, ":", &tmpSubStr);
+    char* elemList = strtok_r(NULL, ":", &tmpSubStr);
+    if (prefix == NULL) {
+      // Entry made only of ':' characters: there is no prefix and no list to read.
+      WARN("Empty entry \"%s\" when parsing \"%s\"", fullToken, str);
+      ret = ncclInvalidUsage;
+    } else if (elemList == NULL && fullToken != fullStr) {
+      // Only the first entry may omit its prefix; a later one would overwrite earlier per-prefix settings.
+      WARN("All entries except the first must have a prefix: \"%s\"", str);
+      ret = ncclInvalidUsage;
+    } else if (elemList == NULL) {
+      elemList = prefix;
+      prefix = NULL;
+    }
+
+    // A leading '^' inverts the selection: rows start fully enabled and listed elements are cleared.
+    int unset = 0, set = 1;
+    if (ret == ncclSuccess && elemList[0] == '^') {
+      unset = 1; set = 0; elemList++;
+    }
+
+    bool foundPrefix = false;
+    for (int p=0; ret == ncclSuccess && p<nprefixes; p++) {
+      if (prefix && strcasecmp(prefix, prefixElems[p]) != 0) continue;
+      foundPrefix = true;
+      for (int e=0; e<nelems; e++) list[p*nelems+e] = unset;
+
+      char* tokStr = strdup(elemList);
+      if (tokStr == NULL) { ret = ncclSystemError; break; }
+      char* tmpStr;
+      for (char* elem = strtok_r(tokStr, ",", &tmpStr); elem; elem = strtok_r(NULL, ",", &tmpStr)) {
+        int e = 0;
+        while (e<nelems && strcasecmp(elem, elems[e]) != 0) e++;
+        if (e == nelems) {
+          WARN("Unrecognized element token \"%s\" when parsing \"%s\"", elem, str);
+          ret = ncclInvalidUsage;
+          break;
+        }
+        list[p*nelems+e] = set;
+      }
+      free(tokStr);
+    }
+    if (ret == ncclSuccess && !foundPrefix) {
+      WARN("Unrecognized prefix token \"%s\" when parsing \"%s\"", prefix ? prefix : "(null)", str);
+      ret = ncclInvalidUsage;
+    }
+    free(subToken);
+    if (ret != ncclSuccess) { free(fullStr); return ret; }  // every error path releases both copies
+
+    fullToken = strtok_r(NULL, ";", &tmpFullStr);
  }
-  for (int i=0; i<nelems; i++) list[i] = def;
-  char* tokStr = strdup(str);
-  char* tmpStr;
-  char* token = strtok_r(tokStr, ",", &tmpStr);
-  while (token) {
-    for (int i=0; i<nelems; i++)
-      if (strcasecmp(token, elems[i]) == 0) list[i] = set;
-    token = strtok_r(NULL, ",", &tmpStr);
-  }
-  free(tokStr);
+  free(fullStr);
  return ncclSuccess;
 }

@@ -144,17 +208,16 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
  if (nRanks <= 1) return ncclSuccess;

  int compCapIndex = minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX;
-  int cpuArch, cpuVendor, cpuModel;
-  NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
  int index2 = nNodes <= 2 ? nNodes-1 : 2;
  // LL: for single node, we look at GPU type; for multi-node, we look at CPU type
-  int index1 = nNodes == 1 ? compCapIndex : cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD ? 1 : 0;
+  int index1 = nNodes == 1 ? compCapIndex :
+               (comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD || comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_MIXED) ? 1 : 0;
  double llMaxBw = llMaxBws[index1][index2];
  double perChMaxTreeBw = perChMaxTreeBws[compCapIndex][index2];
  double perChMaxRingLL128Bw = perChMaxRingLL128Bws[compCapIndex][index2];
  double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2];
  // De-penalize Tree/Simple latency on Power systems to favor Tree than Ring
-  if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
+  if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
  float ppn = (float)nRanks / nNodes;

  int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
@@ -190,7 +253,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), graphs[a]->nChannels*perChMaxTreeLL128Bw);
        if (a == NCCL_ALGO_TREE && graphs[a]->pattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85;
-        if (a == NCCL_ALGO_PAT) busBw *= .85;
+        if (a == NCCL_ALGO_PAT) busBw *= .75;
        if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0;  // Not used
        if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0;  // Not used
        if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) {
@@ -226,10 +289,6 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
          busBw *= ratio;
        }
        comm->bandwidths[coll][a][p] = busBw;
-        /* Ring bandwidth backup */
-        if (a == NCCL_ALGO_RING)
-          comm->ringbdw[coll][p] = comm->bandwidths[coll][NCCL_ALGO_RING][p];
-
        comm->latencies[coll][a][p] = baseLat[a][p];
        float intraLat = hwLat[intraHw[a]][a][p];
        // With ppn=1 latencies are fully exposed, use the Tree network latency
@@ -286,41 +345,78 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom

  // Protocols/Algorithms enable/disable, and user overrides.
  // All are enabled except ll128 which is enabled by default only in certain cases.
-  int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
-  int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1, 1 };
+  int protoEnable[NCCL_NUM_FUNCTIONS*NCCL_NUM_PROTOCOLS];
+  int algoEnable[NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS];
+  for (int f=0; f<NCCL_NUM_FUNCTIONS; f++) {
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      protoEnable[f*NCCL_NUM_PROTOCOLS+p] = p == NCCL_PROTO_LL128 ? 2 : 1;
+    }
+    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+      algoEnable[f*NCCL_NUM_ALGORITHMS+a] = 1;
+    }
+  }

  const char *protoStr = ncclGetEnv("NCCL_PROTO");
  if (protoStr) {
    INFO(NCCL_ENV, "NCCL_PROTO set by environment to %s", protoStr);
-    NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable));
+    NCCLCHECK(parseList(protoStr, ncclFuncStr, NCCL_NUM_FUNCTIONS, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable));
  }
  const char *algoStr = ncclGetEnv("NCCL_ALGO");
  if (algoStr) {
    INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr);
-    NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
+    NCCLCHECK(parseList(algoStr, ncclFuncStr, NCCL_NUM_FUNCTIONS, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
  }

-  if (comm->nNodes == 1) algoEnable[NCCL_ALGO_NVLS_TREE] = 0;
-
-  // Disable CollNet if it is not supported
-  if (comm->collNetSupport == 0) {
-    algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0;
-    algoEnable[NCCL_ALGO_COLLNET_CHAIN] = 0;
-    if (nNodes > 1) algoEnable[NCCL_ALGO_NVLS] = 0;
-    // If user has hard set NCCL_ALGO=COLLNET, ignore it
-    if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0 &&
-        algoEnable[NCCL_ALGO_NVLS] == 0 && algoEnable[NCCL_ALGO_NVLS_TREE] == 0) {
-      algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = 1;
+  if (comm->rank == 0 && (algoStr||protoStr)) {
+    constexpr int strLength = 1024;
+    char funcAlgoProtoTuningStr[strLength];
+    int offset = 0;
+    offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "\n     Function | ");
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "%8s  ", ncclProtoStr[p]);
+    }
+    offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), " | ");
+    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+      offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "%13s  ", ncclAlgoStr[a]);
+    }
+    offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "\n");
+
+    for (int f=0; f<NCCL_NUM_FUNCTIONS; f++) {
+      offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "%13s | ", ncclFuncStr[f]);
+      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+        offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "%8d  ", protoEnable[f*NCCL_NUM_PROTOCOLS+p]);
+      }
+      offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), " | ");
+      for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+        offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "%13d  ", algoEnable[f*NCCL_NUM_ALGORITHMS+a]);
+      }
+      offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "\n");
+    }
+
+    INFO(NCCL_ENV, "Enabled NCCL Func/Proto/Algo Matrix:%s", funcAlgoProtoTuningStr);
+  }
+
+  int nvsCount = 0;
+  NCCLCHECK(ncclTopoGetNvsCount(comm->topo, &nvsCount));
+
+  for (int f=0; f<NCCL_NUM_FUNCTIONS; f++) {
+    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+      int disable = 0;
+      // Disable NVLS Tree on a single node
+      if (comm->nNodes == 1 && a == NCCL_ALGO_NVLS_TREE) disable = 1;
+      // Disable Collnet+Direct, Collnet+Chain or Collnet+NVLS if collnet is not supported.
+      if (comm->collNetSupport == 0 &&
+          (a == NCCL_ALGO_COLLNET_DIRECT ||
+           a == NCCL_ALGO_COLLNET_CHAIN ||
+           (a == NCCL_ALGO_NVLS && comm->nNodes > 1))) disable = 1;
+      // Disable CollNet+Direct if not on an NVSwitch system
+      if (nvsCount == 0 && a == NCCL_ALGO_COLLNET_DIRECT) disable = 1;
+      if (disable) algoEnable[f*NCCL_NUM_ALGORITHMS+a] = 0;
    }
-  } else {
-    // Disable CollNet+Direct if not on an NVSwitch system
-    int nvsCount = 0;
-    NCCLCHECK(ncclTopoGetNvsCount(comm->topo, &nvsCount));
-    if (nvsCount == 0) algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0;
  }

  for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    int pEnable = protoEnable[p];
+    int pEnable = protoEnable[c*NCCL_NUM_PROTOCOLS+p];
    if (pEnable == 2 && p == NCCL_PROTO_LL128) {
      // Enable LL128 by default only on Volta/Ampere/Hopper+NVLink. Other cases are not tested and may cause silent data corruption.
      pEnable = 1;
@@ -335,66 +431,51 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
      }
    }
    if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
-    if (algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
-    if (a == NCCL_ALGO_RING && pEnable == 0) comm->ringbdw[c][p] = 0;
-  }
-
-  for (int c = 0; c < NCCL_NUM_FUNCTIONS; c++) {
-    bool available = false;
-    for (int a = 0; a < NCCL_NUM_ALGORITHMS; a++)
-      for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++)
-        if (comm->bandwidths[c][a][p] != 0) {
-          available = true;
-          goto check_avail;
-        }
-  check_avail:
-    if (available == false) {
-      /* at least set ring algo available */
-      for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++)
-        comm->bandwidths[c][NCCL_ALGO_RING][p] = comm->ringbdw[c][p];
-    }
+    if (algoEnable[c*NCCL_NUM_ALGORITHMS+a] == 0) comm->bandwidths[c][a][p] = 0;
  }

  if (comm->rank == 0) {
-    char line[1024];
+    constexpr int lineLen = 1024;
+    char line[lineLen];
+    int offset = 0;
    for (int block=0; block<DIVUP(NCCL_NUM_ALGORITHMS, 3); block++) {
-      sprintf(line, "  Algorithm   |");
+      offset = snprintf(line, lineLen, "  Algorithm   |");
      for (int ba=0; ba<3; ba++) {
-	int a = block*3+ba;
+        int a = block*3+ba;
        if (a >= NCCL_NUM_ALGORITHMS) continue;
-        sprintf(line+strlen(line), " %14s   %14s   %14s |", "", ncclAlgoStr[a], "");
+        offset += snprintf(line+offset, std::max(0, lineLen-offset), " %14s   %14s   %14s |", "", ncclAlgoStr[a], "");
      }
      INFO(NCCL_TUNING, "%s", line);
-      sprintf(line, "  Protocol    |");
+      offset = snprintf(line, lineLen, "  Protocol    |");
      for (int ba=0; ba<3; ba++) {
        for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-          sprintf(line+strlen(line), " %14s |", ncclProtoStr[p]);
+          offset += snprintf(line+offset, std::max(0, lineLen-offset), " %14s |", ncclProtoStr[p]);
        }
      }
      INFO(NCCL_TUNING, "%s", line);
-      sprintf(line, " Max NThreads |");
+      offset = snprintf(line, lineLen, " Max NThreads |");
      for (int ba=0; ba<3; ba++) {
-	int a = block*3+ba;
+        int a = block*3+ba;
        if (a >= NCCL_NUM_ALGORITHMS) continue;
        for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-          sprintf(line+strlen(line), " %14d |", comm->maxThreads[a][p]);
+          offset += snprintf(line+offset, std::max(0, lineLen-offset), " %14d |", comm->maxThreads[a][p]);
        }
      }
      INFO(NCCL_TUNING, "%s", line);
      for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) {
-        sprintf(line, "%13s |", ncclFuncStr[c]);
+        offset = snprintf(line, lineLen, "%13s |", ncclFuncStr[c]);
        for (int ba=0; ba<3; ba++) {
-	  int a = block*3+ba;
+          int a = block*3+ba;
          if (a >= NCCL_NUM_ALGORITHMS) continue;
          for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-            sprintf(line+strlen(line), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
+            offset += snprintf(line+offset, std::max(0, lineLen-offset), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
          }
        }
        INFO(NCCL_TUNING, "%s", line);
      }
    }
  }
-
+ 
  // Set per-thread amount of work before we increase nThreads and nChannels
  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
    comm->threadThresholds[a][NCCL_PROTO_LL] = NCCL_LL_THREAD_THRESHOLD;
@@ -438,19 +519,10 @@ static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = {
  {  .9,  .9,  .9,  .9,  .9,  .9,  .9,  .8,  .7,  .6,  .6,  .5,  .5,  .5,  .5,  .6,  .7,  .8,  .7,  .7,  .8,  .9,  .9 }
 };

-ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time, bool* backup) {
+ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time) {
  float bw = comm->bandwidths[coll][algorithm][protocol];
  float lat = comm->latencies[coll][algorithm][protocol];

-  if (backup) {
-    *backup = false;
-    if (algorithm == NCCL_ALGO_RING && bw == 0.0f) {
-      /* try back up RING algorithm */
-      bw = comm->ringbdw[coll][protocol];
-      *backup = true;
-    }
-  }
-
  if (bw == 0) {
    *time = -1.0; return ncclSuccess;
  }
@@ -17,6 +17,9 @@
 #include <cpuid.h>
 #endif

+// Arbitrarily large number for constructing virtual topology string
+#define NCCL_MAX_XML_DEPTH 1024
+
 /*******************/
 /* XML File Parser */
 /*******************/
@@ -430,7 +433,7 @@ static ncclResult_t getBcmLinks(const char* busId, int* nlinks, char** peers) {

 ncclResult_t ncclTopoGetStrFromSys(const char* path, const char* fileName, char* strValue) {
  char filePath[PATH_MAX];
-  sprintf(filePath, "%s/%s", path, fileName);
+  snprintf(filePath, sizeof(filePath), "%s/%s", path, fileName);
  int offset = 0;
  FILE* file;
  if ((file = fopen(filePath, "r")) != NULL) {
@@ -883,7 +886,7 @@ ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct nccl
 // where sysPath/subsystem points to.
 ncclResult_t ncclTopoGetSubsystem(const char* sysPath, char* subSys) {
  char subSysPath[PATH_MAX];
-  sprintf(subSysPath, "%s/subsystem", sysPath);
+  snprintf(subSysPath, sizeof(subSysPath), "%s/subsystem", sysPath);
  char* path = realpath(subSysPath, NULL);
  if (path == NULL) {
    subSys[0] = '\0';
@@ -896,8 +899,9 @@ ncclResult_t ncclTopoGetSubsystem(const char* sysPath, char* subSys) {
  return ncclSuccess;
 }

-ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode) {
+ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode, struct ncclXmlNode* forceParent) {
  NCCLCHECK(xmlFindTagKv(xml, "net", netNode, "name", netName));
+
  if (*netNode != NULL) return ncclSuccess;

  const char* pciSysPath = pciPath;
@@ -906,13 +910,15 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha
    NCCLCHECK(ncclTopoGetSubsystem(pciSysPath, subSystem));
    // This is not a PCI device (virtual, usb, ...).
    if (strcmp(subSystem, "pci") != 0) {
-      INFO(NCCL_GRAPH, "Topology detection: network path %s is not a PCI device (%s). Attaching to first CPU", pciSysPath, subSystem);
+      INFO(NCCL_NET|NCCL_GRAPH, "Topology detection: network path %s is not a PCI device (%s). Attaching to first CPU", pciSysPath, subSystem);
      pciSysPath = NULL;
    }
  }

  struct ncclXmlNode* parent = NULL;
-  if (pciSysPath) {
+  if (forceParent) {
+    parent = forceParent;
+  } else if (pciSysPath) {
    int offset;
    for (offset=strlen(pciSysPath)-1; pciSysPath[offset] != '/'; offset--);
    char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
@@ -50,7 +50,7 @@ ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXm

 /* Auto-detect functions */
 ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode);
-ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode);
+ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode, struct ncclXmlNode* forceParent=NULL);

 /* Remove unneeded parts */
 ncclResult_t ncclTopoTrimXml(struct ncclXml* xml);
@@ -132,6 +132,13 @@ static ncclResult_t xmlGetAttrFloat(struct ncclXmlNode* node, const char* attrNa
  return ncclSuccess;
 }

+static ncclResult_t xmlGetAttrFloatDefault(struct ncclXmlNode* node, const char* attrName, float* value, float defaultValue) {
+  const char* str;
+  NCCLCHECK(xmlGetAttr(node, attrName, &str));
+  *value = str ? strtof(str, NULL) : defaultValue;
+  return ncclSuccess;
+}
+
 static ncclResult_t xmlFindTag(struct ncclXml* xml, const char* tagName, struct ncclXmlNode** node) {
  *node = NULL;
  for (int i=0; i<xml->maxIndex; i++) {
@@ -208,6 +215,24 @@ static ncclResult_t xmlSetAttr(struct ncclXmlNode* node, const char* attrName, c
  return ncclSuccess;
 }

+static ncclResult_t xmlPrintNodeRecursive(struct ncclXmlNode* node, const char* name) {
+  while (node) {
+    char line[1024*8];
+    int cursor = 0;
+    snprintf(line, sizeof(line), "<name=%s", node->name);
+    for (int i = 0; i < node->nAttrs; i++) {
+      cursor = strlen(line);
+      snprintf(line + cursor, sizeof(line) - cursor, " %s=%s", node->attrs[i].key, node->attrs[i].value);
+    }
+    cursor = strlen(line);
+    snprintf(line + cursor, sizeof(line) - cursor, ">");
+    INFO(NCCL_GRAPH, "%s", line);
+    node = node->parent;
+  }
+  return ncclSuccess;
+}
+
+
 static ncclResult_t xmlSetAttrIfUnset(struct ncclXmlNode* node, const char* attrName, const char* value) {
  int index;
  NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
@@ -323,7 +323,7 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
  /* reset everything */
  while (!ncclIntruQueueEmpty(asyncJobsPtr)) {
    struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsPtr);
-    if (job->comm && !job->comm->config.blocking)
+    if (!job->destroyFlag && job->comm && !job->comm->config.blocking)
      (void) ncclCommSetAsyncError(job->comm, error);
    if (job->undo) job->undo(job);
    if (job->destructor) job->destructor((void*)job);
@@ -392,7 +392,6 @@ fail:
 }

 static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInfo = NULL) {
-  int savedDev;
  ncclResult_t ret = ncclSuccess;
  struct ncclGroupJob *gjob = (struct ncclGroupJob*) job_;
  struct ncclComm *groupCommHeadMain = *gjob->groupCommHeadPtr;
@@ -401,8 +400,6 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf

  bool *groupAbortFlag = gjob->abortFlagPtr;

-  CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail);
-
  if (!simInfo && groupCommPreconnectHeadMain != nullptr) {
    struct ncclComm* comm = groupCommPreconnectHeadMain;
    do {
@@ -454,12 +451,19 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
      }
      comm = comm->groupNext;
    } while (comm);
-
    NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail);
    while (!ncclIntruQueueEmpty(&asyncCollJobs)) {
      struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncCollJobs);
      if (job->destructor) job->destructor((void*)job);
    }
+
+    // done with all buffer allocation, start registration and enqueue
+    comm = groupCommHeadMain;
+    do {
+      CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
+      NCCLCHECKGOTO(ncclTasksRegAndEnqueue(comm), ret, fail);
+      comm = comm->groupNext;
+    } while (comm);
  }

  if ((!simInfo) && (groupCommHeadMain != nullptr)) {
@@ -476,6 +480,9 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
  while (groupCommHeadMain != nullptr) {
    struct ncclComm* comm = groupCommHeadMain;
    struct ncclComm* next = comm->groupNext;
+    // Poll for callbacks sent to us from other threads. Typically these return
+    // freed resources to our memory pools and UB
+    NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/false), ret, fail);
    (void) ncclGroupCommLeave(comm);
    if (!comm->config.blocking) {
      (void) ncclCommSetAsyncError(comm, ret);
@@ -483,8 +490,6 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
    groupCommHeadMain = next;
  }

-  CUDACHECK(cudaSetDevice(savedDev));
-
 exit:
  return ret;
 fail:
@@ -563,7 +568,10 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) {
      ret = ncclInProgress;
    } else {
      /* blocking group */
+      int savedDev;
+      CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail);
      NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base, internalSimInfoPtr), ret, fail);
+      CUDACHECKGOTO(cudaSetDevice(savedDev), ret, fail);
      if (simInfo) memcpy((void*)simInfo, (void*)internalSimInfoPtr, realSize);
      groupResetJobState(ncclGroupJobMainPtr);
    }
@@ -10,6 +10,7 @@
 #include "nccl.h"
 #include "nccl_common.h"
 #include "device.h"
+#define NCCL_MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two.

 // CHUNKSIZE must be a multiple of SLICESIZE
 #define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
@@ -23,6 +24,7 @@
 #define REDUCE_SLICESTEPS 1
 #define REDUCE_CHUNKSTEPS 1
 #define NCCL_MAX_SLICE_PER_CHUNK 2  // max value for CHUNKSTEPS/SLICESTEPS, must accord with above
+#define NCCL_MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two.

 const char* ncclFuncToString(ncclFunc_t op);
 const char* ncclDevRedOpToString(ncclDevRedOp_t op);
@@ -34,11 +36,11 @@ inline int ncclTypeSize(ncclDataType_t type) {
  switch (type) {
  case ncclInt8:
  case ncclUint8:
+  case ncclFloat8e4m3:
+  case ncclFloat8e5m2:
    return 1;
  case ncclFloat16:
-  #if defined(__CUDA_BF16_TYPES_EXIST__)
  case ncclBfloat16:
-  #endif
    return 2;
  case ncclInt32:
  case ncclUint32:
@@ -67,6 +69,319 @@ struct ncclConnFifo {

 #include <stdio.h>

+class RingAlgorithm {
+protected:
+  int refCount;
+  int nRanks;
+  int nStepsPerLoop;
+  int chunkSteps;
+  int sliceSteps;
+  ssize_t sliceSize;
+  ssize_t loopSize;
+  ssize_t channelSize;
+  uint8_t *sendbuff;
+  uint8_t *recvbuff;
+  void *sendMhandle;
+  void *recvMhandle;
+  void *srecvMhandle;
+public:
+  // This ring class is used by the proxy thread to retrieve the send and recv buffers, their sizes, and the
+  // corresponding mem handles based on the current step of the proxy args. The derived ring algo classes (AR, AG
+  // and BC) are allocated during the enqueue stage and copied to the proxy side through shared memory. For each
+  // copy, we increase the refCount with incRefCount(), since the same ring algo object can be referenced multiple
+  // times for send and recv progress. After all steps are done, we decrease the refCount and only delete the ring
+  // object when refCount == 0.
+  virtual void getNextSendAddr(int curStep, uint8_t **sendbuffOut, size_t *sizeOut, void **mhandleOut) = 0;
+  virtual void getNextRecvAddr(int curStep, uint8_t **recvbuffOut, size_t *sizeOut, void **mhandleOut) = 0;
+  int incRefCount() {
+    return __atomic_add_fetch(&refCount, 1, __ATOMIC_RELAXED);
+  }
+  int decRefCount() {
+    return __atomic_sub_fetch(&refCount, 1, __ATOMIC_RELEASE);
+  }
+  RingAlgorithm() { refCount = 0; }
+  virtual ~RingAlgorithm() {};
+};
+
+class RingARAlgorithm : public RingAlgorithm {
+private:
+  int ringIndex;
+  int elemSize;
+  ssize_t chunkSize;
+  int slicePerChunk;
+public:
+  void getNextSendAddr(int curStep, uint8_t **sendbuffOut, size_t *sizeOut, void **mhandleOut) {
+    int curLoop = curStep / nStepsPerLoop;
+    int curLoopStage = (curStep % nStepsPerLoop) / chunkSteps;
+    int chunkStage = curLoopStage % nRanks;
+    int sliceStage = (curStep % chunkSteps) / sliceSteps;
+    ssize_t elemOffset = curLoop * loopSize;
+    ssize_t remSize = channelSize - elemOffset;
+    ssize_t chunkOffset;
+    ssize_t sliceOffset;
+    ssize_t curSliceSize;
+    ssize_t curChunkSize;
+    ssize_t size;
+    ssize_t nelem;
+    int chunkId;
+
+    if (remSize < loopSize) {
+      curChunkSize = alignUp(divUp(remSize / elemSize, nRanks), 16 / elemSize) * elemSize;
+    } else {
+      curChunkSize = chunkSize;
+    }
+    chunkId = (ringIndex + nRanks - 1 - chunkStage) % nRanks;
+    chunkOffset = chunkId * curChunkSize;
+    nelem = std::min(remSize - chunkOffset, curChunkSize);
+    curSliceSize = std::max(divUp(nelem / elemSize, 16 * slicePerChunk) * 16, sliceSize / elemSize / 32) * elemSize;
+    sliceOffset = sliceStage * curSliceSize;
+
+    if (nelem <= sliceOffset) {
+      *sendbuffOut = sendbuff;
+      *mhandleOut = sendMhandle;
+    } else {
+      if (curLoopStage == 0) {
+        *sendbuffOut = sendbuff + elemOffset + chunkOffset + sliceOffset;
+        *mhandleOut = sendMhandle;
+      } else {
+        *sendbuffOut = recvbuff + elemOffset + chunkOffset + sliceOffset;
+        *mhandleOut = srecvMhandle;
+      }
+    }
+    size = std::min(curSliceSize, nelem - sliceOffset);
+    *sizeOut = size < 0 ? 0 : size;
+    return;
+  }
+
+  void getNextRecvAddr(int curStep, uint8_t **recvbuffOut, size_t *sizeOut, void **mhandleOut) {
+    int curLoop = curStep / nStepsPerLoop;
+    int curLoopStage = ((curStep + chunkSteps) % nStepsPerLoop) / chunkSteps;
+    int chunkStage = curLoopStage % nRanks;
+    int sliceStage = (curStep % chunkSteps) / sliceSteps;
+    ssize_t elemOffset = curLoop * loopSize;
+    ssize_t remSize = channelSize - elemOffset;
+    ssize_t chunkOffset;
+    ssize_t sliceOffset;
+    ssize_t curSliceSize;
+    ssize_t curChunkSize;
+    ssize_t size;
+    ssize_t nelem;
+    int chunkId;
+
+    if (remSize < loopSize) {
+      curChunkSize = alignUp(divUp(remSize / elemSize, nRanks), 16 / elemSize) * elemSize;
+    } else {
+      curChunkSize = chunkSize;
+    }
+
+    if (curLoopStage == 0) {
+      chunkId = (ringIndex + 1) % nRanks;
+    } else {
+      chunkId = (ringIndex + nRanks - 1 - chunkStage) % nRanks;
+    }
+
+    chunkOffset = chunkId * curChunkSize;
+    nelem = std::min(remSize - chunkOffset, curChunkSize);
+    curSliceSize = std::max(divUp(nelem / elemSize, 16 * slicePerChunk) * 16, sliceSize / elemSize / 32) * elemSize;
+    sliceOffset = sliceStage * curSliceSize;
+    if (nelem <= sliceOffset) {
+      *recvbuffOut = recvbuff;
+    } else {
+      *recvbuffOut = recvbuff + elemOffset + chunkOffset + sliceOffset;
+    }
+    if (sizeOut) {
+      size = std::min(curSliceSize, nelem - sliceOffset);
+      *sizeOut = size < 0 ? 0 : size;
+    }
+    *mhandleOut = recvMhandle;
+    return;
+  }
+
+  RingARAlgorithm(const void *sendbuff, void *recvbuff, int nRanks, int ringIndex, int chunkSteps, int sliceSteps, size_t chunkSize, size_t sliceSize, size_t gridOffset, size_t channelSize, int elemSize, void *sendMhandle, void *recvMhandle, void *srecvMhandle) {
+    this->ringIndex = ringIndex;
+    this->nRanks = nRanks;
+    this->nStepsPerLoop = 2 * (nRanks - 1) * chunkSteps;
+    this->chunkSteps = chunkSteps;
+    this->sliceSteps = sliceSteps;
+    this->chunkSize = chunkSize;
+    this->sliceSize = sliceSize;
+    this->loopSize = nRanks * chunkSize;
+    this->sendbuff = (uint8_t*)sendbuff + gridOffset;
+    this->recvbuff = (uint8_t*)recvbuff + gridOffset;
+    this->channelSize = channelSize;
+    this->elemSize = elemSize;
+    this->sendMhandle = sendMhandle;
+    this->recvMhandle = recvMhandle;
+    this->srecvMhandle = srecvMhandle;
+    this->slicePerChunk = chunkSteps / sliceSteps;
+  }
+  ~RingARAlgorithm() {}
+};
+
+class RingAGAlgorithm : public RingAlgorithm {
+private:
+  int *ringRanks;
+  int elemSize;
+  ssize_t sendSize;
+  int slicePerChunk;
+public:
+  void getNextSendAddr(int curStep, uint8_t **sendbuffOut, size_t *sizeOut, void **mhandleOut) {
+    int curLoop = curStep / nStepsPerLoop;
+    int chunkStage = (curStep % nStepsPerLoop) / chunkSteps;
+    int sliceStage = (curStep % chunkSteps) / sliceSteps;
+    ssize_t sliceOffset;
+    ssize_t curSliceSize;
+    ssize_t offset;
+    ssize_t elemOffset = curLoop * loopSize;
+    ssize_t chunkSize = std::min(loopSize, channelSize - elemOffset);
+    ssize_t size;
+    int rankDest;
+    uint8_t *buff;
+    void *mhandle;
+
+    curSliceSize = std::max(divUp(chunkSize / elemSize, 16 * slicePerChunk) * 16, sliceSize / elemSize / 32) * elemSize;
+    sliceOffset = sliceStage * curSliceSize;
+    if (chunkStage == 0) {
+      rankDest = ringRanks[0];
+      offset = elemOffset + sliceOffset;
+      buff = sendbuff + offset;
+      mhandle = sendMhandle;
+    } else {
+      rankDest = ringRanks[nRanks - chunkStage];
+      offset = elemOffset + rankDest * sendSize + sliceOffset;
+      buff = recvbuff + offset;
+      mhandle = srecvMhandle;
+    }
+    *sendbuffOut = buff;
+    size = std::min(curSliceSize, channelSize - elemOffset - sliceOffset);
+    *sizeOut = size < 0 ? 0 : size;
+    *mhandleOut = mhandle;
+    return;
+  }
+
+  void getNextRecvAddr(int curStep, uint8_t **recvbuffOut, size_t *sizeOut, void **mhandleOut) {
+    int curLoop = curStep / nStepsPerLoop;
+    int chunkStage = ((curStep + chunkSteps) % nStepsPerLoop) / chunkSteps;
+    int sliceStage = (curStep % chunkSteps) / sliceSteps;
+    ssize_t sliceOffset;
+    ssize_t curSliceSize;
+    ssize_t offset;
+    ssize_t elemOffset = curLoop * loopSize;
+    ssize_t chunkSize = std::min(loopSize, channelSize - elemOffset);
+    ssize_t size;
+    int rankDest;
+
+    curSliceSize = std::max(divUp(chunkSize / elemSize, 16 * slicePerChunk) * 16, sliceSize / elemSize / 32) * elemSize;
+    sliceOffset = sliceStage * curSliceSize;
+    if (chunkStage == 0) {
+      rankDest = ringRanks[1];
+    } else {
+      rankDest = ringRanks[nRanks - chunkStage];
+    }
+    offset = elemOffset + rankDest * sendSize + sliceOffset;
+    *recvbuffOut = recvbuff + offset;
+    if (sizeOut) {
+      size = std::min(sliceSize, channelSize - elemOffset - sliceOffset);
+      *sizeOut = size < 0 ? 0 : size;
+    }
+    *mhandleOut = recvMhandle;
+  }
+
+  RingAGAlgorithm(const void *sendbuff, void *recvbuff, int nRanks, int *ringRanks, int chunkSteps, int sliceSteps, size_t chunkSize, size_t sliceSize, size_t gridOffset, size_t channelSize, int elemSize, size_t sendSize, void *sendMhandle, void *recvMhandle, void *srecvMhandle) {
+    this->ringRanks = ringRanks;
+    this->nRanks = nRanks;
+    this->nStepsPerLoop = (nRanks - 1) * chunkSteps;
+    this->chunkSteps = chunkSteps;
+    this->sliceSteps = sliceSteps;
+    this->elemSize = elemSize;
+    this->sliceSize = sliceSize;
+    this->loopSize = chunkSize;
+    this->sendSize = sendSize;
+    this->channelSize = channelSize;
+    this->sendbuff = (uint8_t*)sendbuff + gridOffset;
+    this->recvbuff = (uint8_t*)recvbuff + gridOffset;
+    this->sendMhandle = sendMhandle;
+    this->recvMhandle = recvMhandle;
+    this->srecvMhandle = srecvMhandle;
+    this->slicePerChunk = chunkSteps / sliceSteps;
+  }
+  ~RingAGAlgorithm() {}
+};
+
+class RingBCAlgorithm : public RingAlgorithm {
+private:
+  int root;
+  int rank;
+  int nextRank;
+public:
+  void getNextSendAddr(int curStep, uint8_t **sendbuffOut, size_t *sizeOut, void **mhandleOut) {
+    int curLoop = curStep / nStepsPerLoop;
+    int sliceStage = (curStep % chunkSteps) / sliceSteps;
+    ssize_t sliceOffset = sliceStage * sliceSize;
+    ssize_t offset;
+    ssize_t elemOffset = curLoop * loopSize;
+    ssize_t size;
+    uint8_t *buff;
+    void *mhandle;
+
+    offset = elemOffset + sliceOffset;
+    if (offset >= channelSize) {
+      buff = sendbuff;
+      mhandle = sendMhandle;
+    } else if (rank == root) {
+      buff = sendbuff + offset;
+      mhandle = sendMhandle;
+    } else {
+      buff = recvbuff + offset;
+      mhandle = srecvMhandle;
+    }
+    *sendbuffOut = buff;
+    size = std::min(sliceSize, channelSize - offset);
+    *sizeOut = size < 0 ? 0 : size;
+    *mhandleOut = mhandle;
+    return;
+  }
+
+  void getNextRecvAddr(int curStep, uint8_t **recvbuffOut, size_t *sizeOut, void **mhandleOut) {
+    int curLoop = curStep / nStepsPerLoop;
+    int sliceStage = (curStep % chunkSteps) / sliceSteps;
+    ssize_t sliceOffset = sliceStage * sliceSize;
+    ssize_t offset;
+    ssize_t elemOffset = curLoop * loopSize;
+    ssize_t size;
+    offset = elemOffset + sliceOffset;
+    if (offset >= channelSize) {
+      *recvbuffOut = recvbuff;
+    } else {
+      *recvbuffOut = recvbuff + offset;
+    }
+    if (sizeOut) {
+      size = std::min(sliceSize, channelSize - offset);
+      *sizeOut = size < 0 ? 0 : size;
+    }
+    *mhandleOut = recvMhandle;
+    return;
+  }
+
+  RingBCAlgorithm(const void* sendbuff, void* recvbuff, int rank, int root, int nRanks, int *ringRanks, int chunkSteps, int sliceSteps, size_t chunkSize, size_t sliceSize, size_t gridOffset, size_t channelSize, void *sendMhandle, void *recvMhandle, void *srecvMhandle) {
+    this->root = root;
+    this->rank = rank;
+    this->nextRank = ringRanks[1];
+    this->nStepsPerLoop = chunkSteps;
+    this->chunkSteps = chunkSteps;
+    this->sliceSteps = sliceSteps;
+    this->sliceSize = sliceSize;
+    this->loopSize = chunkSize;
+    this->channelSize = channelSize;
+    this->sendbuff = (uint8_t*)sendbuff + gridOffset;
+    this->recvbuff = (uint8_t*)recvbuff + gridOffset;
+    this->sendMhandle = sendMhandle;
+    this->recvMhandle = recvMhandle;
+    this->srecvMhandle = srecvMhandle;
+  }
+  ~RingBCAlgorithm() {}
+};
+
 template<typename T>
 class PatRSAlgorithm{
  size_t offset;
@@ -532,10 +847,10 @@ restart:
      int sendDataRank = (rank + nranks + s) % nranks;
      outIx = sendDataRank * count + offset;
      recvDim = s ? firstBitSet(s, nrPow2) : -1;
-      s -= (1<<recvDim);
      if (recvDim == -1) {
        recvOffset = -1;
      } else {
+        s -= (1<<recvDim);
        int foffset = (a*2*scale*aggDelta) >> (recvDim+1);
        recvOffset = (foffset%postFreq)*nelem;
        recvStepOffset = foffset / postFreq;
@@ -197,12 +197,15 @@ struct ncclTaskColl {
  int32_t algorithm:8, protocol:8;
  uint32_t isCollnet:1, isNvls:1;
  uint32_t devFuncId:30;
-  enum ncclRegBufferType regBufType;
+  int regBufType;
  // number of elements in planner->ipcMemQueue associated with this collective
  int nCleanupQueueElts;

  void* sendMhandle;
  void* recvMhandle;
+  void** sendNetHandles;
+  void** recvNetHandles;
+  void** srecvNetHandles;
  // index for IPC record lookup
  uintptr_t sendbuffOffset;
  uintptr_t recvbuffOffset;
@@ -236,6 +239,7 @@ struct ncclKernelPlan {
  struct ncclKernelPlan* next;

  bool persistent; // aka captured in a graph
+  bool isHostCbEnq;
  enum ncclDevWorkStorageType workStorageType;
  bool kernelSpecialized;
  void *kernelFn;
@@ -365,6 +369,7 @@ struct ncclKernelPlanner {

  struct ncclIntruQueue<struct ncclTaskColl, &ncclTaskColl::next> collTaskQueue;
  struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> collWorkQueue;
+  struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> tmpCollWorkQueue;
  struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next> collCleanupQueue;

  //////////////////////////////////////////////////////////////////////////////
@@ -463,6 +468,8 @@ struct ncclComm {

  // Counter for tracking CUDA launches (P2P and collectives included)
  uint64_t opCount;
+  // Collective operation counter
+  uint64_t collOpCount;

  // Channels for collectives
  int nChannels; // connection nChannels
@@ -486,7 +493,6 @@ struct ncclComm {
  ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
-  float ringbdw[NCCL_NUM_FUNCTIONS][NCCL_NUM_PROTOCOLS];
  int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];

  /* This attribute can indicate the states of communicators and return code of
@@ -532,7 +538,7 @@ struct ncclComm {
  int proxyRefCountOld; /* store proxy post-atomic-sub refcount */
  // Whether this communicator uses collNet
  int collNetSupport;
-  bool collNetRegSupport;
+  bool isOneRPN;
  uint8_t collNetSupportMatrix[4/*sum,prod,max,min*/][ncclNumTypes];
  bool intraNodeP2pSupport;
  int* collNetHeads;
@@ -560,6 +566,7 @@ struct ncclComm {
  // Subset of those in groupNext list. Holds 0x1 if not needing preconnect.
  struct ncclComm* preconnectNext;
  int persistentRefs; // number of persistent plan-lists capturing this comm
+  int noncapturedRefs; // number of non-captured hostStreamPlanCallback on the stream
  struct P2pSchedulePair { int sendRank; int recvRank; } *p2pSchedule;

  struct ncclKernelPlanner planner;
@@ -599,9 +606,16 @@ struct ncclComm {

  // buffer registration cache
  struct ncclRegCache regCache;
+  int isAllNvlink;
+  bool useNetPXN;
+  bool useGdr;
+  int splitCount;
  uint64_t endMagic;
 };

+static_assert(offsetof(struct ncclComm, startMagic) == 0, "startMagic must be the first field of ncclComm");
+static_assert(offsetof(struct ncclComm, endMagic) == sizeof(struct ncclComm) - sizeof(uint64_t), "endMagic must be the last field of ncclComm");
+
 enum ncclLaunchMode {
  ncclLaunchModeInvalid=0,
  ncclLaunchModeParallel,
@@ -644,7 +658,7 @@ inline ncclResult_t ncclCommPollEventCallbacks(struct ncclComm *comm) {
    }
  }
 finish:
-  cudaThreadExchangeStreamCaptureMode(&mode);
+  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
  return ncclSuccess;
 }

@@ -38,4 +38,6 @@ extern char ncclLastError[];

 void ncclSetThreadName(pthread_t thread, const char *fmt, ...);

+void ncclResetDebugInit();
+
 #endif
@@ -88,24 +88,18 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
 #define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8
 #define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)

-#define NCCL_DIRECT_WRITE 0x01
-#define NCCL_DIRECT_READ  0x02
+#define NCCL_P2P_WRITE 0x01
+#define NCCL_P2P_READ  0x02
 #define NCCL_DIRECT_NIC   0x04
-#define NCCL_IPC_WRITE    0x08
-#define NCCL_IPC_READ     0x10
-#define NCCL_NVLS_MIN_POLL 0x20
+#define NCCL_NVLS_MIN_POLL 0x80

 // Number of named barriers supported by CUDA
 #define NCCL_MAX_GROUPS 16

-#define NCCL_MAX_COLLNET_SIZE (1L << 29)
-
-enum ncclRegBufferType {
-  NCCL_REGULAR_BUFFER = 0,
-  NCCL_IPC_REG_BUFFER = 1,
-  NCCL_NVLS_REG_BUFFER = 2,
-  NCCL_COLLNET_REG_BUFFER = 3
-};
+#define NCCL_REGULAR_BUFFER 0x00
+#define NCCL_IPC_REG_BUFFER 0x01
+#define NCCL_NVLS_REG_BUFFER 0x02
+#define NCCL_NET_REG_BUFFER 0x04

 struct ncclConnInfo {
  // Regular comm mechanism
@@ -143,8 +137,6 @@ struct ncclConnector {
  struct ncclTransportComm* transportComm;
  void* transportResources;
  struct ncclConnInfo conn;
-  int sendMemSameProcess;
-  int recvMemSameProcess;
 };

 struct ncclRing {
@@ -228,7 +220,7 @@ struct alignas(16) ncclDevWorkP2p {
  uint8_t sendChunkSize_u32fp8, recvChunkSize_u32fp8;

  uint8_t sendProtoLL:1, recvProtoLL:1;
-  uint8_t sendRegistered:1, recvRegistered:1;
+  uint8_t sendNetReg:1, recvNetReg:1;
  uint8_t sendIpcReg:1, recvIpcReg:1;
 };

@@ -267,7 +259,7 @@ struct alignas(16) ncclDevWorkColl {
  //   nChannels == (channelHi - channelLo) + 1
  uint32_t channelLo:8, channelHi:8;
  uint32_t nWarps:8;
-  uint32_t redOpArgIsPtr:1, regUsed:2, oneNode:1, direct:4;
+  uint32_t redOpArgIsPtr:1, regUsed:1, netRegUsed:1, oneNode:1, direct:2, isOneRPN:1;
  uint32_t root;
  void* recvbuff;
  void* sendbuff;
@@ -393,7 +385,7 @@ struct ncclDevComm {
  int nNodes;
  int buffSizes[NCCL_NUM_PROTOCOLS];
  int p2pChunkSize;
-  int isNvlink;
+  int isAllNvlink;

  // Work fifo return credits
  uint32_t* workConsumed/*[MAXCHANNELS]*/;
@@ -525,9 +517,7 @@ inline bool ncclNvlsSupported(int devRedOp, int type) {
  case ncclInt64:
  case ncclUint64:
  case ncclFloat16:
-  #if defined(__CUDA_BF16_TYPES_EXIST__)
  case ncclBfloat16:
-  #endif
    return devRedOp == ncclDevSum || devRedOp == ncclDevMinMax;
  case ncclFloat:
  case ncclDouble:
@@ -25,5 +25,16 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
 ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
 ncclResult_t ncclLaunchFinish(struct ncclComm* comm);
 ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo);
+ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm);
+
+static inline size_t ncclFuncSendCount(ncclFunc_t func, int nRanks, size_t count) {
+  return func == ncclFuncReduceScatter ? nRanks*count : count;
+}
+static inline size_t ncclFuncRecvCount(ncclFunc_t func, int nRanks, size_t count) {
+  return func == ncclFuncAllGather ? nRanks*count : count;
+}
+static inline size_t ncclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count) {
+  return func == ncclFuncAllGather || func == ncclFuncReduceScatter ? nRanks*count : count;
+}

 #endif // End include guard
@@ -19,7 +19,7 @@ ncclResult_t ncclTopoCudaPath(int cudaDev, char** path);

 struct ncclTopoSystem;
 // Build the topology
-ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system);
+ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system, const char* dumpXmlFile=NULL);
 ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system);
 ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system);

@@ -33,10 +33,11 @@ ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm);

 // Query topology
 ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank);
-ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank);
+ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank);
 ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret);
-ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int64_t netId, int read, int* useGdr);
-ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush);
+ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int rank, int64_t netId, int read, int* useGdr);
+ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush);
+ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail);
 ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net);
 int ncclPxnDisable(struct ncclComm* comm);
 ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
@@ -118,6 +119,6 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
    struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent);

 ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs);
-ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time, bool* backup=nullptr);
+ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time);

 #endif
@@ -12,6 +12,8 @@
 #ifndef NCCL_IBVWRAP_H_
 #define NCCL_IBVWRAP_H_

+#include <arpa/inet.h>
+#include <netinet/in.h>
 #ifdef NCCL_BUILD_RDMA_CORE
 #include <infiniband/verbs.h>
 #else
@@ -89,4 +91,14 @@ static inline ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv

 ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event);

+// Converts a GID into a readable string. On success, returns a non-null pointer to gidStr.
+// NULL is returned if there was an error, with errno set to indicate the error.
+// errno = ENOSPC if the converted string would exceed strLen.
+static inline const char* ibvGetGidStr(union ibv_gid* gid, char* gidStr, size_t strLen) {
+  // GID is a 16B handle; to convert it to a readable form, we use inet_ntop.
+  // sizeof(ibv_gid) == sizeof(struct in6_addr), so we use AF_INET6.
+  static_assert(sizeof(union ibv_gid) == sizeof(struct in6_addr), "the sizeof struct ibv_gid must be the size of struct in6_addr");
+  return inet_ntop(AF_INET6, gid->raw, gidStr, strLen);
+}
+
 #endif //End include guard
@@ -32,6 +32,7 @@ typedef enum {
  NCCL_BOOTSTRAP = 0x1000,
  NCCL_REG = 0x2000,
  NCCL_PROFILE = 0x4000,
+  NCCL_RAS = 0x8000,
  NCCL_ALL = ~0
 } ncclDebugLogSubSys;

@@ -13,6 +13,9 @@
 #include <stdint.h>

 #define NCCL_NET_HANDLE_MAXSIZE 128
+// Maximum value (1 TiB) NCCL can accept for the maxP2pBytes and maxCollBytes net properties
+#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L)
+#define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1

 #define NCCL_PTR_HOST 0x1
 #define NCCL_PTR_CUDA 0x2
@@ -21,6 +24,161 @@
 // Maximum number of requests per comm object
 #define NCCL_NET_MAX_REQUESTS 32

+// Max number of ncclNet objects which can live in the same process
+#define NCCL_NET_MAX_PLUGINS 3
+
+#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4
+#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V9
+
+typedef struct {
+  int ndevs;                               // NOTE(review): presumably the number of entries of devs in use -- confirm
+  int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9];  // NOTE(review): presumably indices of the devices backing a virtual NIC -- confirm
+} ncclNetVDeviceProps_v9_t;
+typedef ncclNetVDeviceProps_v9_t ncclNetVDeviceProps_t;
+
+typedef struct {
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int regIsGlobal;                 // regMr is not tied to a particular comm
+  int forceFlush;                  // Force a flush on receives
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+  ncclNetVDeviceProps_v9_t vProps;
+  size_t maxP2pBytes;              // Max transfer size for point-to-point operations
+  size_t maxCollBytes;             // Max transfer size for collective operations
+} ncclNetProperties_v9_t;
+typedef ncclNetProperties_v9_t ncclNetProperties_t;
+
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has been completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+
+  // Create a virtual NIC given the specified properties, which can be accessed at device index d
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props);
+} ncclNet_v9_t;
+
+typedef ncclNet_v9_t ncclNet_t;
+
+#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v9
+
+typedef struct {
+  void* mhandle;
+  void* address;
+  size_t size;
+} ncclNetSGE_v9_t;
+
+typedef struct {
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. handles have been created
+  // using listen() above. rank indicates caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Returns whether a reduction operation on a data type is supported.
+  // 1 for supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Performs an asynchronous allreduce operation on the collective group.
+  // May return request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+  ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts,
+                             size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+                             void* sendMhandle, void** request);
+  ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData,
+                                 size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+                                 ncclDataType_t dataType, ncclRedOp_t redOp,
+                                 void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free collective comm objects
+  ncclResult_t (*closeColl)(void* collComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Create a virtual NIC given the specified properties, which can be accessed at device index d
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props);
+} ncclCollNet_v9_t;
+
+typedef ncclCollNet_v9_t ncclCollNet_t;
+
+#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v9
+
 typedef struct {
  char* name;                      // Used mostly for logging.
  char* pciPath;                   // Path to the PCI device in /sys.
@@ -37,8 +195,6 @@ typedef struct {
  int netDeviceVersion;            // Version number for network offload
 } ncclNetProperties_v8_t;

-typedef ncclNetProperties_v8_t ncclNetProperties_t;
-
 typedef struct {
  // Name of the network (mainly for logs)
  const char* name;
@@ -94,10 +250,6 @@ typedef struct {
  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
 } ncclNet_v8_t;

-typedef ncclNet_v8_t ncclNet_t;
-
-#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v8
-
 typedef struct {
  void* mhandle;
  void* address;
@@ -151,10 +303,6 @@ typedef struct {
  ncclResult_t (*closeListen)(void* listenComm);
 } ncclCollNet_v8_t;

-typedef ncclCollNet_v8_t ncclCollNet_t;
-
-#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v8
-
 typedef struct {
  char* name;                      // Used mostly for logging.
  char* pciPath;                   // Path to the PCI device in /sys.
@@ -16,9 +16,133 @@ enum {
  ncclProfileProxyOp   = (1 << 3),  // proxy operation event type
  ncclProfileProxyStep = (1 << 4),  // proxy step event type
  ncclProfileProxyCtrl = (1 << 5),  // proxy control event type
-  ncclProfileNumEvents = (     6),
 };

+typedef struct {
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
+  int rank;                     // originating rank
+  union {
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      size_t trafficBytes;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+    } coll;
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+    } p2p;
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+  };
+} ncclProfilerEventDescr_v2_t;
+
+typedef enum {
+  ncclProfilerProxyOpSendPosted,
+  ncclProfilerProxyOpSendRemFifoWait,
+  ncclProfilerProxyOpSendTransmitted,
+  ncclProfilerProxyOpSendDone,
+  ncclProfilerProxyOpRecvPosted,
+  ncclProfilerProxyOpRecvReceived,
+  ncclProfilerProxyOpRecvTransmitted,
+  ncclProfilerProxyOpRecvDone,
+
+  /* Legacy proxy profiler states */
+  ncclProfilerProxyStepSendGPUWait,
+  ncclProfilerProxyStepSendWait,
+  ncclProfilerProxyStepRecvWait,
+  ncclProfilerProxyStepRecvFlushWait,
+  ncclProfilerProxyStepRecvGPUWait,
+
+  /* Legacy proxy control states */
+  ncclProfilerProxyCtrlIdle,
+  ncclProfilerProxyCtrlActive,
+  ncclProfilerProxyCtrlSleep,
+  ncclProfilerProxyCtrlWakeup,
+  ncclProfilerProxyCtrlAppend,
+  ncclProfilerProxyCtrlAppendEnd,
+} ncclProfilerEventState_v2_t;
+
+typedef union {
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v2_t;
+
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v2_t;
+
+typedef ncclProfilerEventDescr_v2_t ncclProfilerEventDescr_t;
+typedef ncclProfilerEventState_v2_t ncclProfilerEventState_t;
+typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_t;
+typedef ncclProfiler_v2_t ncclProfiler_t;
+
 typedef struct {
  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
@@ -69,42 +193,8 @@ typedef struct {
  };
 } ncclProfilerEventDescr_v1_t;

-typedef enum {
-  ncclProfilerProxyOpSendPosted,
-  ncclProfilerProxyOpSendRemFifoWait,
-  ncclProfilerProxyOpSendTransmitted,
-  ncclProfilerProxyOpSendDone,
-  ncclProfilerProxyOpRecvPosted,
-  ncclProfilerProxyOpRecvReceived,
-  ncclProfilerProxyOpRecvTransmitted,
-  ncclProfilerProxyOpRecvDone,
-
-  /* Legacy proxy profiler states */
-  ncclProfilerProxyStepSendGPUWait,
-  ncclProfilerProxyStepSendWait,
-  ncclProfilerProxyStepRecvWait,
-  ncclProfilerProxyStepRecvFlushWait,
-  ncclProfilerProxyStepRecvGPUWait,
-
-  /* Legacy proxy control states */
-  ncclProfilerProxyCtrlIdle,
-  ncclProfilerProxyCtrlActive,
-  ncclProfilerProxyCtrlSleep,
-  ncclProfilerProxyCtrlWakeup,
-  ncclProfilerProxyCtrlAppend,
-  ncclProfilerProxyCtrlAppendEnd,
-} ncclProfilerEventState_v1_t;
-
-typedef union {
-  struct {
-    size_t transSize;
-    int steps;
-  } proxyOp;
-
-  struct {
-    int appendedProxyOps;
-  } proxyCtrl;
-} ncclProfilerEventStateArgs_v1_t;
+typedef ncclProfilerEventState_v2_t ncclProfilerEventState_v1_t;
+typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_v1_t;

 typedef struct {
  const char* name;
@@ -142,9 +232,4 @@ typedef struct {
  ncclResult_t (*finalize)(void* context);
 } ncclProfiler_v1_t;

-typedef ncclProfilerEventDescr_v1_t ncclProfilerEventDescr_t;
-typedef ncclProfilerEventState_v1_t ncclProfilerEventState_t;
-typedef ncclProfilerEventStateArgs_v1_t ncclProfilerEventStateArgs_t;
-typedef ncclProfiler_v1_t ncclProfiler_t;
-
 #endif
@@ -11,6 +11,55 @@
 #include "nccl.h"
 #include "nccl_common.h"

+// API to be implemented by external tuner
+typedef struct {
+  // Name of the tuner
+  const char* name;
+
+  // Initializes tuner states.
+  // Inputs:
+  //   - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
+  //   - nNodes: number of nodes in current communicator.
+  //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+  // Outputs:
+  //   - context: tuner context object
+  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
+
+  // Gets info (algo, protocol, number of ctas and threads) for a given collective.
+  // Inputs:
+  //   - context: tuner context object
+  //   - collType: collective type, e.g., allreduce, allgather, ...
+  //   - nBytes: collective size in bytes
+  //   - numPipeOps: number of operations in the group
+  //   - numAlgo: number of algorithms in collCostTable
+  //   - numProto: number of protocols in collCostTable
+  //   - regBuff: can register user buffer
+  //
+  // Outputs:
+  //   - nChannels: number of channels (hence SMs) to be used.
+  //
+  // InOut:
+  //   - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
+  //                    NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
+  //
+  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+  // default tuning for the given collective.
+  // Also, the plugin is allowed to not set any output, or set only the
+  // algorithm and protocol, but not only the algorithm or only the protocol.
+  // Unset fields will be set automatically by NCCL.
+  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
+                              int numPipeOps, float** collCostTable, int numAlgo, int numProto,
+                              int regBuff, int* nChannels);
+
+  // Terminates the plugin and cleans up any resources that the plugin allocated.
+  // context: tuner context object
+  ncclResult_t (*destroy)(void* context);
+} ncclTuner_v4_t;
+
+typedef ncclTuner_v4_t ncclTuner_t;
+
+#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4"
+
 // API to be implemented by external tuner
 typedef struct {
  // Name of the tuner
@@ -55,10 +104,6 @@ typedef struct {
  ncclResult_t (*destroy)(void* context);
 } ncclTuner_v3_t;

-typedef ncclTuner_v3_t ncclTuner_t;
-
-#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3"
-
 // API to be implemented by external tuner
 typedef struct {
  // Name of the tuner
@@ -25,6 +25,7 @@ typedef struct {
 } ncclNetDeviceHandle_v7_t;

 typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
-typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_t;
+typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
+typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t;

 #endif
@@ -302,7 +302,7 @@ extern ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMa

 struct ncclNvmlCCStatus {
    bool CCEnabled;
-    bool multiGpuCCEnabled;
+    bool multiGpuProtectedPCIE;
 };

 // All ncclNvmlFoo() functions call ncclNvmlEnsureInitialized() implicitly.
@@ -36,9 +36,9 @@ ncclResult_t ncclProfilerStartRecvProxyOpEvent(int sub, struct ncclProxyArgs* ar
 ncclResult_t ncclProfilerStopProxyOpEvent(int sub, struct ncclProxyArgs* args);

 // Proxy Step Start/Stop Event Wrappers
-ncclResult_t ncclProfilerStartSendProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi);
-ncclResult_t ncclProfilerStartRecvProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi);
-ncclResult_t ncclProfilerStopProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi);
+ncclResult_t ncclProfilerStartSendProxyStepEvent(int sub, struct ncclProxyArgs* args, int stepId);
+ncclResult_t ncclProfilerStartRecvProxyStepEvent(int sub, struct ncclProxyArgs* args, int stepId);
+ncclResult_t ncclProfilerStopProxyStepEvent(int sub, struct ncclProxyArgs* args, int stepId);

 // Proxy Control Start/Stop Events Wrappers
 ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle);
@@ -46,7 +46,7 @@ ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle);

 // Record Event Wrappers
 ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState);
-ncclResult_t ncclProfilerRecordProxyStepEventStates(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi, ncclProfilerEventState_t eState);
+ncclResult_t ncclProfilerRecordProxyStepEventState(int sub, struct ncclProxyArgs* args, int stepId, ncclProfilerEventState_t eState);
 ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, ncclProfilerEventState_t eState);

 // Profiler utility functions
@@ -15,6 +15,7 @@
 #include <pthread.h>
 #include "shmutils.h"
 #include "p2p.h"
+#include "collectives.h"

 typedef enum : uint8_t {
  ncclPatternRing,
@@ -56,7 +57,11 @@ struct ncclProxyOp {
  int root;
  int next;
  int nsteps;
-  int chunkSize;
+  size_t chunkSize;
+  size_t sliceSize;
+  size_t loopSize;
+  size_t loopOffset;
+  size_t channelSize;
  uint8_t sliceSteps;
  uint8_t chunkSteps;
  uint8_t channelId;
@@ -65,13 +70,15 @@ struct ncclProxyOp {
  uint8_t /*ncclFunc_t*/ coll;
  uint8_t /*ncclPattern_t*/ pattern;
  uint8_t protocol;
+  uint8_t algorithm;
  uint8_t reg;
-  // collnet buffer reg handles
+  // collnet/p2p/coll buffer reg handles
  void* sendMhandle;
  void* recvMhandle;
  uint8_t* sendbuff;
  uint8_t* recvbuff;
-
+  int isOneRPN;
+  RingAlgorithm *ringAlgo;
  union ncclProxyOpSpecifics specifics;

  // Profiler plugin
@@ -93,19 +100,21 @@ struct ncclProxyOp {
 struct ncclProxySubArgs {
  struct ncclProxyConnection* connection;
  int reg;
-  // p2p mhandle
-  void* mhandle;
  // collnet handles
  void* sendMhandle;
  void* recvMhandle;
  uint8_t* sendbuff;
  uint8_t* recvbuff;
  size_t offset;
+  ssize_t loopSize;
+  ssize_t loopOffset;
  int channelId;
  int nsteps;
  ssize_t nbytes;
+  ssize_t chunkSize;
  int peer;
-
+  int isOneRPN;
+  RingAlgorithm *ringAlgo;
  int groupSize; // Number of consecutive sub operations sharing the same recvComm
  uint64_t base;
  uint64_t posted;
@@ -114,11 +123,14 @@ struct ncclProxySubArgs {
  uint64_t transmitted;
  uint64_t done;
  uint64_t end;
+  int regBufferReady;
  void* requests[NCCL_STEPS];

  // Profiler plugin
  int eActivationMask;
  int rank;
+  pid_t pid;
+  void* profilerContext;
  void* taskEventHandle;
  void* opEventHandle;
  void* stepEventHandles[NCCL_STEPS];
@@ -133,10 +145,11 @@ struct ncclProxyArgs {
  proxyProgressFunc_t progress;
  int nsubs;
  int done;
+  int onePPN;
  uint64_t opCount;
  int sliceSteps;
  int chunkSteps;
-  int chunkSize;
+  size_t chunkSize;
  size_t totalSendSize;
  size_t totalRecvSize;
  size_t sendSizePerRound;
@@ -146,16 +159,13 @@ struct ncclProxyArgs {
  uint8_t /*ncclPattern_t*/ pattern;
  uint8_t /*ncclFunc_t*/ coll;
  uint8_t protocol;
+  uint8_t algorithm;
  int state;
  char* sharedBuff[NCCL_STEPS];
  int sharedSize[NCCL_STEPS];

  int idle;

-  // Profiler plugin
-  pid_t pid;
-  void* profilerContext;
-
  // Element linking
  struct ncclProxyArgs* next;
  struct ncclProxyArgs* nextPeer;
@@ -0,0 +1,24 @@
+/*************************************************************************
+ * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_RAS_H_
+#define NCCL_RAS_H_
+
+#include "socket.h"
+
+// Structure used to communicate data about NCCL ranks from NCCL threads to RAS.
+struct rasRankInit {
+  union ncclSocketAddress addr;
+  pid_t pid;
+  int cudaDev;
+  int nvmlDev;
+};
+
+ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank);
+ncclResult_t ncclRasCommFini(const struct ncclComm* comm);
+ncclResult_t ncclRasAddRanks(struct rasRankInit* ranks, int nranks);
+
+#endif // !NCCL_RAS_H_
@@ -6,6 +6,9 @@
 #include <cuda.h>
 #include <stdint.h>

+int64_t ncclParamLocalRegister();
+int64_t ncclParamGraphRegister();
+
 enum {
  NET_REG_COMPLETE = 0x01,
  NVLS_REG_COMPLETE = 0x02,
@@ -20,16 +23,21 @@ struct ncclPeerRegIpcAddr {
  uintptr_t* hostPeerRmtAddrs;
 };

+struct ncclRegNetHandles {               // Linked-list node; referenced by ncclReg::netHandleHead
+  void* handle;                          // NOTE(review): presumably the network registration handle -- confirm
+  struct ncclProxyConnector* proxyConn;  // NOTE(review): presumably the proxy connection the handle belongs to -- confirm
+  struct ncclRegNetHandles* next;        // Next node in the list
+};
+
 struct ncclReg {
  // common attributes
  size_t pages;
-  int refs;
+  int localRefs;
+  int graphRefs;
  uintptr_t addr;
  uint32_t state;
  // net reg
-  int nDevs;
-  int devs[MAXCHANNELS];
-  void** handles;
+  struct ncclRegNetHandles* netHandleHead;
  // nvls reg
  uintptr_t baseAddr;
  size_t baseSize;
@@ -50,11 +58,12 @@ struct ncclRegCache {
  struct ncclReg **slots;
  int capacity, population;
  uintptr_t pageSize;
-  void* sComms[MAXCHANNELS];
-  void* rComms[MAXCHANNELS];
 };

 ncclResult_t ncclRegCleanup(struct ncclComm* comm);
 ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg);
+ncclResult_t ncclCommGraphRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
+ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *handle);
+ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid);

 #endif
@@ -10,7 +10,7 @@
 #include "nccl.h"

 typedef void* ncclShmHandle_t;
-ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle);
+ncclResult_t ncclShmOpen(char* shmPath, size_t shmPathSize, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle);
 ncclResult_t ncclShmClose(ncclShmHandle_t handle);
 ncclResult_t ncclShmUnlink(ncclShmHandle_t handle);

@@ -17,9 +17,6 @@

 #define MAX_IFS 16
 #define MAX_IF_NAME_SIZE 16
-#define SLEEP_INT            1000 // connection retry sleep interval in usec
-#define RETRY_REFUSED_TIMES   2e4 // connection refused retry times before reporting a timeout (20 sec)
-#define RETRY_TIMEDOUT_TIMES    3 // connection timed out retry times (each one can take 20s)
 #define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV)
 #define NCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL

@@ -39,9 +36,10 @@ enum ncclSocketState {
  ncclSocketStateConnectPolling = 5,
  ncclSocketStateConnected = 6,
  ncclSocketStateReady = 7,
-  ncclSocketStateClosed = 8,
-  ncclSocketStateError = 9,
-  ncclSocketStateNum = 10
+  ncclSocketStateTerminating = 8,
+  ncclSocketStateClosed = 9,
+  ncclSocketStateError = 10,
+  ncclSocketStateNum = 11
 };

 enum ncclSocketType {
@@ -49,14 +47,14 @@ enum ncclSocketType {
  ncclSocketTypeBootstrap = 1,
  ncclSocketTypeProxy = 2,
  ncclSocketTypeNetSocket = 3,
-  ncclSocketTypeNetIb = 4
+  ncclSocketTypeNetIb = 4,
+  ncclSocketTypeRasNetwork = 5
 };

 struct ncclSocket {
  int fd;
  int acceptFd;
-  int timedOutRetries;
-  int refusedRetries;
+  int errorRetries;
  union ncclSocketAddress addr;
  volatile uint32_t* abortFlag;
  int asyncFlag;
@@ -64,15 +62,18 @@ struct ncclSocket {
  int salen;
  uint64_t magic;
  enum ncclSocketType type;
+  int customRetry;
+  int finalizeCounter; // Used to keep track of initial handshake for async sockets.
+  char finalizeBuffer[sizeof(uint64_t)]; // Used to keep track of initial handshake for async sockets.
 };

-const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1);
+const char *ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1);
 ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair);
 int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs);
 int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs);

 // Initialize a socket
-ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0);
+ncclResult_t ncclSocketInit(struct ncclSocket* sock, const union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0, int customRetry = 0);
 // Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call
 ncclResult_t ncclSocketListen(struct ncclSocket* sock);
 ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr);
@@ -88,11 +89,12 @@ ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock);
 #define NCCL_SOCKET_SEND 0
 #define NCCL_SOCKET_RECV 1

-ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset);
+ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int* closed = NULL);
 ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset);
 ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size);
 ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size);
 ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int sendSize, struct ncclSocket* recvSock, void* recvPtr, int recvSize);
 ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking);
+ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how);
 ncclResult_t ncclSocketClose(struct ncclSocket* sock);
 #endif
@@ -28,7 +28,6 @@ extern struct ncclTransport netTransport;
 extern struct ncclTransport collNetTransport;

 extern struct ncclTransport* ncclTransports[];
-
 // Forward declarations
 struct ncclRing;
 struct ncclConnector;
@@ -115,16 +114,16 @@ struct ncclTransport {
 };

 ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
-ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL);
+ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex);
 ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2pSupport, bool* directMode);

 ncclResult_t ncclNvlsInit(struct ncclComm* comm);
 ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent);
 ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm);
 ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm);
-ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, int* nCleanupQueueElts);
-ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv);
-ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size);
+ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, int* nCleanupQueueElts);
+ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv);
+ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size);
 ncclResult_t ncclNvlsFree(struct ncclComm* comm);

 enum { collNetRecv=0, collNetSend=1 };
@@ -143,4 +142,13 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop
 ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm);
 ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm);

+ncclResult_t ncclNetDeregBuffer(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* handle);
+ncclResult_t ncclNetLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, struct ncclConnector** peerConns, int nPeers, int* outRegBufFlag, void** outHandle);
+ncclResult_t ncclNetGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, struct ncclConnector** peerConns, int nPeers, int* outRegBufFlag, void** outHandle, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, int* nCleanupQueueElts);
+
+ncclResult_t ncclRegisterP2pIpcBuffer(struct ncclComm* comm, void* userbuff, size_t size, int peerRank, int* regFlag, void** regAddr, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue);
+ncclResult_t ncclRegisterP2pNetBuffer(struct ncclComm* comm, void* userbuff, size_t size, struct ncclConnector* conn, int* regFlag, void** handle, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue);
+ncclResult_t ncclRegisterCollBuffers(struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, bool* regNeedConnect);
+ncclResult_t ncclRegisterCollNvlsBuffers(struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, bool* regNeedConnect);
+
 #endif
@@ -49,8 +49,7 @@ inline uint64_t clockNano() {
  return uint64_t(ts.tv_sec)*1000*1000*1000 + ts.tv_nsec;
 }

-/* get any bytes of random data from /dev/urandom, return 0 if it succeeds; else
- * return -1 */
+/* get any bytes of random data from /dev/urandom, return ncclSuccess (0) if it succeeds. */
 inline ncclResult_t getRandomData(void* buffer, size_t bytes) {
  ncclResult_t ret = ncclSuccess;
  if (bytes > 0) {
@@ -17,6 +17,7 @@
 #include "graph.h"
 #include "argcheck.h"
 #include "tuner.h"
+#include "ras.h"
 #include <fcntl.h>
 #include <string.h>
 #include <errno.h>
@@ -182,6 +183,8 @@ static ncclResult_t commFree(ncclComm_t comm) {
  if (comm == NULL)
    return ncclSuccess;

+  NCCLCHECK(ncclRasCommFini(comm));
+
  /* in commReclaim, we have guaranteed only last rank which calls ncclCommDestroy() will
   * free all intra-process communicators; therefore, we only need to focus on local
   * resource cleanup in commFree(). */
@@ -193,7 +196,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
    }
  }

-  CUDACHECK(cudaMemPoolDestroy(comm->memPool));
+  if (comm->memPool) CUDACHECK(cudaMemPoolDestroy(comm->memPool));

  delete[] comm->userRedOps;

@@ -421,11 +424,6 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in

  ncclIntruQueueConstruct(&comm->eventCallbackQueue);

-  //  setup intraComm0 and intraRanks 0 to default values to ensure proper cleanup of the communicator
-  comm->intraComm0 = comm;
-  comm->intraRank = 0;
-  comm->intraRanks = 1;
-
  return ncclSuccess;
 }

@@ -435,6 +433,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
  struct ncclDevCommAndChannels tmpCommAndChans;
  struct ncclDevCommAndChannels *devCommAndChans = NULL;
  struct ncclNvmlCCStatus ccStatus;
+  bool ccEnable;

  NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), ret, fail);
  NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail);
@@ -448,7 +447,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
  tmpCommAndChans.comm.node = comm->node;
  tmpCommAndChans.comm.nNodes = comm->nNodes;
  tmpCommAndChans.comm.abortFlag = comm->abortFlagDev;
-  tmpCommAndChans.comm.isNvlink = ncclTopoPathAllNVLink(comm->topo);
+  tmpCommAndChans.comm.isAllNvlink = comm->isAllNvlink;
  for (int p=0; p < NCCL_NUM_PROTOCOLS; p++) {
    tmpCommAndChans.comm.buffSizes[p] = comm->buffSizes[p];
  }
@@ -458,11 +457,9 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
  comm->workArgsBytes = std::min<size_t>(ncclParamWorkArgsBytes(), ncclMaxKernelArgsSize(comm->cudaArch));

  memset(&ccStatus, 0, sizeof(ccStatus));
-  if (ncclNvmlGetCCStatus(&ccStatus) == ncclSuccess && ccStatus.CCEnabled) {
+  ccEnable = (ncclSuccess == ncclNvmlGetCCStatus(&ccStatus)) && (ccStatus.CCEnabled || ccStatus.multiGpuProtectedPCIE);
+  if (ccEnable) {
    comm->workFifoBytes = 0;
-    if (ccStatus.multiGpuCCEnabled == false && comm->rank == 0) {
-      WARN("CC On, Multi-GPU CC Off (No inter-GPU communication protection)");
-    }
  } else {
    comm->workFifoBytes = ncclParamWorkFifoBytes();
    if (0 != (comm->workFifoBytes & (comm->workFifoBytes-1))) {
@@ -473,7 +470,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
  }

  if (comm->rank == 0) {
-    INFO(NCCL_INIT, "CC %s, Multi-GPU CC %s, workFifoBytes %d", ccStatus.CCEnabled ? "On" : "Off", ccStatus.multiGpuCCEnabled ? "On" : "Off", comm->workFifoBytes);
+    INFO(NCCL_INIT, "CC %s, workFifoBytes %d", ccEnable ? "On" : "Off", comm->workFifoBytes);
  }

  if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) {
@@ -608,9 +605,6 @@ NCCL_PARAM(P2pPciChunkSize, "P2P_PCI_CHUNKSIZE", (1 << 17)); /* 128 kB */
 NCCL_PARAM(P2pNvlChunkSize, "P2P_NVL_CHUNKSIZE", (1 << 19)); /* 512 kB */

 static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
-  int cpuArch, cpuVendor, cpuModel;
-  NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
-
  int64_t envs[NCCL_NUM_PROTOCOLS] = { ncclParamLlBuffSize(), ncclParamLl128BuffSize(), ncclParamBuffSize() };
  int defaults[NCCL_NUM_PROTOCOLS] = { DEFAULT_LL_BUFFSIZE, DEFAULT_LL128_BUFFSIZE, DEFAULT_BUFFSIZE };

@@ -619,7 +613,7 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
  }

  if (comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize();
-  else if (ncclTopoPathAllNVLink(comm->topo)) comm->p2pChunkSize = ncclParamP2pNvlChunkSize();
+  else if (comm->isAllNvlink) comm->p2pChunkSize = ncclParamP2pNvlChunkSize();
  else comm->p2pChunkSize = ncclParamP2pPciChunkSize();

  // Make sure P2P chunksize is not larger than coll chunksize.
@@ -850,6 +844,14 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
  } while(0);

  timers[TIMER_INIT_TOPO] = clockNano();
+
+  // Dump XML if requested by user
+  const char* dumpXmlFile;
+  dumpXmlFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
+  if (dumpXmlFile) {
+    NCCLCHECKGOTO(ncclTopoGetSystem(comm, NULL, dumpXmlFile), ret, fail);
+  }
+
  // Topo detection / System graph creation
  NCCLCHECKGOTO(ncclTopoGetSystem(comm, &comm->topo), ret, fail);
  // Compute paths between GPUs and NICs
@@ -1076,9 +1078,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
      INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
      comm->collNetSupport = 0;
    }
-    // As long as there is more than 1 rank on any node, we need to disable collnet reg
-    comm->collNetRegSupport = (comm->maxLocalRanks == 1);
  }
+  comm->isAllNvlink = ncclTopoPathAllNVLink(comm->topo);
+  comm->isOneRPN = (comm->maxLocalRanks == 1);

  NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
  NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, parent), ret, fail);
@@ -1293,7 +1295,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
  // launch NCCL kernels before all cuda mem allocation is complete. That could cause a deadlock.
  NCCLCHECKGOTO(devCommSetup(comm), ret, fail);
  timers[TIMER_INIT_CONNECT] = clockNano() -  timers[TIMER_INIT_CONNECT];
-
  /* Local intra-node barrier */
  NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);

@@ -1338,6 +1339,7 @@ struct ncclCommInitRankAsyncJob {
  // for ncclCommSplit
  struct ncclComm* parent;
  int color, key;
+  int splitCount;
  // name of the function calling
  char funcName[NCCL_COMMINIT_FUNCNAME_LEN];
 };
@@ -1432,13 +1434,14 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
    timers[TIMER_INIT_ALLOC] = clockNano();
    NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail);
    timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC];
-    // obtain a unique hash for the comm, re-using part of the parent's hash, commHash is a 64bit struct (=16 hex), add the color
+    // obtain a unique hash for the comm, re-using part of the parent's hash, commHash is a 64bit struct (=16 hex),
+    // add unique split counter and the color
    ncclUniqueId tmpId;
    memset(&tmpId,0,sizeof(ncclUniqueId));// must set 0 here to avoid undefined bits
-    snprintf((char*)&tmpId, NCCL_UNIQUE_ID_BYTES, "%016lx-%d", job->parent->commHash, job->color);
+    snprintf((char*)&tmpId, NCCL_UNIQUE_ID_BYTES, "%016lx-%d-%d", job->parent->commHash, job->splitCount, job->color);
    comm->commHash = getHash(tmpId.internal, NCCL_UNIQUE_ID_BYTES);
-    INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d- Init START", job->funcName,
-         comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key);
+    INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p splitCount %d color %d key %d- Init START", job->funcName,
+         comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key);
    timers[TIMER_INIT_BOOTSTRAP] = clockNano();
    NCCLCHECKGOTO(bootstrapSplit(comm->commHash, comm, job->parent, job->color, job->key, parentRanks), res, fail);
    timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP];
@@ -1474,8 +1477,8 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
    /* unlink child abort flag. */
    __atomic_store_n(&job->parent->childAbortFlag, NULL, __ATOMIC_RELEASE);
    TRACE_CALL("ncclCommSplit(%p, %d, %d, %p, %d, %d)", job->parent, job->color, job->key, comm, comm->rank, comm->nRanks);
-    INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d - Init COMPLETE", job->funcName,
-         comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key);
+    INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p splitCount %d color %d key %d - Init COMPLETE", job->funcName,
+         comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key);
  } else {
    // the name for the replay tool is ncclCommInitRank for all the variations
    TRACE_CALL("ncclCommInitRank(%p, %d, 0x%llx, %d, %d)", comm, comm->nRanks, commIdHash, comm->rank, comm->cudaDev);
@@ -1716,8 +1719,8 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId
  comm->startMagic = comm->endMagic = NCCL_MAGIC; // Used to detect comm corruption.
  *comm->abortFlagRefCount = 1;
  NCCLCHECKGOTO(parseCommConfig(comm, config), res, fail);
-  /* start with ncclInternalError and will be changed to ncclSuccess if init succeeds. */
-  comm->initState = ncclInternalError;
+  /* start with ncclInProgress and will be changed to ncclSuccess if init succeeds. */
+  comm->initState = ncclInProgress;
  *newcomm = comm;

  NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
@@ -1749,6 +1752,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId
 exit:
  return ncclGroupErrCheck(res);
 fail:
+  if (job) ncclCommInitJobFree(job);
  if (comm) {
    free(comm->abortFlag);
    if (comm->abortFlagDev) (void)ncclCudaHostFree((void*)comm->abortFlagDev);
@@ -1846,7 +1850,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
  NCCLCHECKGOTO(ncclGroupEnd(), ret, fail);

 exit:
-  cudaSetDevice(oldDev);
+  (void)cudaSetDevice(oldDev);
  free(gpuFlags);
  return ret;
 fail:
@@ -1926,14 +1930,9 @@ fail:
 static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) {
  struct ncclCommFinalizeAsyncJob* job = (struct ncclCommFinalizeAsyncJob*) job_;
  ncclComm_t comm = job->comm;
-  int savedDevice;
-  int commDevice = comm->cudaDev;
  ncclResult_t ret = ncclSuccess;

-  CUDACHECKGOTO(cudaGetDevice(&savedDevice), ret, fail);
-  if (savedDevice != commDevice) {
-    CUDACHECKGOTO(cudaSetDevice(commDevice), ret, fail);
-  }
+  CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);

  TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d asyncResult %d", comm, comm->rank, *comm->abortFlag, comm->asyncResult);

@@ -1963,10 +1962,6 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) {
    WARN("ncclProxyStop: comm %p (rank = %d) destroys proxy resource error %d", comm, comm->rank, ret);
  }

-  if (savedDevice != commDevice) {
-    CUDACHECKGOTO(cudaSetDevice(savedDevice), ret, fail);
-  }
-
 exit:
  return ret;
 fail:
@@ -1974,25 +1969,12 @@ fail:
 }

 static ncclResult_t commCleanup(ncclComm_t comm) {
-  int savedDevice;
-  int commDevice = comm->cudaDev;
-
-  CUDACHECK(cudaGetDevice(&savedDevice));
-  if (savedDevice != commDevice) {
-    CUDACHECK(cudaSetDevice(commDevice));
-  }
-
+  CUDACHECK(cudaSetDevice(comm->cudaDev)); // make the communicator's device current for the teardown below; nothing in this function switches back afterwards
   if (comm->tuner != NULL) {
     NCCLCHECK(comm->tuner->destroy(comm->tunerContext));
     NCCLCHECK(ncclTunerPluginUnload(comm));
   }
-
   NCCLCHECK(commFree(comm));
-
-  if (savedDevice != commDevice) {
-    CUDACHECK(cudaSetDevice(savedDevice));
-  }
-
   return ncclSuccess;
 }

@@ -2099,6 +2081,7 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
  NVTX3_FUNC_WITH_PARAMS(CommDestroy, CommInitRankSchema, payload)

  TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId);
+  NCCLCHECK(ncclGroupStartInternal());
  // Try and prevent a double free of the comm struct (user error)
  if (comm->rank == -1 || comm->nRanks == -1 || comm->cudaDev == -1 || comm->busId == -1) {
    WARN("comm %p has already been destroyed", comm);
@@ -2113,6 +2096,8 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
  NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail);

 exit:
+  ncclGroupErrCheck(res);
+  NCCLCHECK(ncclGroupEndInternal());
  return res;
 fail:
  goto exit;
@@ -2124,7 +2109,7 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
    NVTX3_FUNC_RANGE_IN(nccl_domain);
    return ncclSuccess;
  }
-
+  NCCLCHECK(ncclGroupStartInternal());
  // Ask anything that might still be running on the device to quit
  if (comm->childAbortFlag != nullptr) {
    __atomic_store_n(comm->childAbortFlag, 1, __ATOMIC_RELEASE);
@@ -2152,6 +2137,8 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
  NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail);

 exit:
+  ncclGroupErrCheck(res);
+  NCCLCHECK(ncclGroupEndInternal());
  return ncclSuccess;
 fail:
  goto exit;
@@ -2218,14 +2205,15 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc
      NCCLCHECKGOTO(parseCommConfig(childComm, config), res, fail);
    }

-    /* start with ncclInternalError and will be changed to ncclSuccess if init succeeds. */
-    childComm->initState = ncclInternalError;
+    /* start with ncclInProgress and will be changed to ncclSuccess if init succeeds. */
+    childComm->initState = ncclInProgress;
  }

  NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
  job->comm = childComm;
  job->newcomm = newcomm;
  job->parent = comm;
+  job->splitCount = ++comm->splitCount;
  job->color = color;
  job->key = key;
  job->cudaDev = comm->cudaDev;
@@ -2233,13 +2221,13 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc
  NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, NULL, free, comm), res, fail);

 exit:
-  cudaSetDevice(oldDev);
+  (void)cudaSetDevice(oldDev);
  (void)ncclGroupErrCheck(res);
  NCCLCHECK(ncclGroupEndInternal());
  return res;
 fail:
  if (childComm) {
-    if (comm && !comm->config.splitShare) {
+    if (!comm->config.splitShare) {
      free(childComm->abortFlag);
      if (childComm->abortFlagDev) ncclCudaHostFree(childComm->abortFlagDev);
      free(childComm->abortFlagRefCount);
@@ -2347,14 +2335,12 @@ ncclResult_t  ncclMemAlloc(void **ptr, size_t size) {

  CUDACHECK(cudaGetDevice(&cudaDev));
  CUCHECK(cuDeviceGet(&currentDev, cudaDev));
-  if (CUPFN(cuMulticastCreate) != NULL)
-    CUCHECK(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, currentDev));

-  if (mcSupport) {
+  if (ncclCuMemEnable()) {
    int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
    // Query device to see if FABRIC handle support is available
    flag = 0;
-    (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));;
+    (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));
    if (flag) requestedHandleTypes |= CU_MEM_HANDLE_TYPE_FABRIC;
    memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
@@ -2365,18 +2351,24 @@ ncclResult_t  ncclMemAlloc(void **ptr, size_t size) {
    CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev));
    if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1;
    CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
-
-    /* mc property */
    CUDACHECK(cudaGetDeviceCount(&dcnt));
-    mcprop.size = size;
-    /* device cnt is a dummy value right now, it might affect mc granularity in the future. */
-    mcprop.numDevices = dcnt;
-    mcprop.handleTypes = requestedHandleTypes;
-    mcprop.flags = 0;
-    CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED));

-    /* only size needs to be aligned to mcGran */
-    ALIGN_SIZE(size, mcGran);
+    if (CUPFN(cuMulticastCreate) != NULL) CUCHECK(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, currentDev));
+    if (mcSupport) {
+      /* mc property */
+      mcprop.size = size;
+      /* device cnt is a dummy value right now, it might affect mc granularity in the future. */
+      mcprop.numDevices = dcnt;
+      mcprop.handleTypes = requestedHandleTypes;
+      mcprop.flags = 0;
+      CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
+
+      /* only size needs to be aligned to mcGran */
+      ALIGN_SIZE(size, mcGran);
+    } else {
+      ALIGN_SIZE(size, memGran);
+    }
+
    if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC) {
      /* First try cuMemCreate() with FABRIC handle support and then remove if it fails */
      CUresult err = CUPFN(cuMemCreate(&handle, size, &memprop, 0));
@@ -2403,6 +2395,7 @@ ncclResult_t  ncclMemAlloc(void **ptr, size_t size) {
        accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
        CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
      }
+      if (0 == p2p && i != cudaDev) INFO(NCCL_ALLOC, "P2P not supported between GPU%d and GPU%d", cudaDev, i);
    }
    goto exit;
  }
@@ -2429,18 +2422,13 @@ ncclResult_t  ncclMemFree(void *ptr) {
  CUDACHECK(cudaGetDevice(&saveDevice));
 #if CUDART_VERSION >= 12010
  CUdevice ptrDev = 0;
-  int mcSupport = 0;

  if (ptr == NULL) goto fallback;
-
  if (ncclCudaLibraryInit() != ncclSuccess) goto fallback;

  CUCHECKGOTO(cuPointerGetAttribute((void*)&ptrDev, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)ptr), ret, fail);
-  if (CUPFN(cuMulticastCreate) != NULL)
-    CUCHECKGOTO(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, ptrDev), ret, fail);
-
  CUDACHECKGOTO(cudaSetDevice((int)ptrDev), ret, fail);
-  if (mcSupport) {
+  if (ncclCuMemEnable()) {
    NCCLCHECKGOTO(ncclCuMemFree(ptr), ret, fail);
    goto exit;
  }
@@ -11,7 +11,7 @@

 // This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage
 NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", -2);
-NCCL_PARAM(CuMemHostEnable, "CUMEM_HOST_ENABLE", 0);
+NCCL_PARAM(CuMemHostEnable, "CUMEM_HOST_ENABLE", -1);
 // Handle type used for cuMemCreate()
 CUmemAllocationHandleType ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;

@@ -35,9 +35,6 @@ int ncclIsCuMemSupported() {
  // Query device to see if CUMEM VMM support is available
  CUCHECKGOTO(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, currentDev), ret, error);
  if (!flag) return 0;
-  // Query device to see if CUMEM RDMA support is available
-  CUCHECKGOTO(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev), ret, error);
-  if (!flag) return 0;
 error:
  return (ret == ncclSuccess);
 #endif
@@ -49,11 +46,31 @@ int ncclCuMemEnable() {
  return  param >= 0 ? param : (param == -2 && ncclCuMemSupported);
 }

+static int ncclCumemHostEnable = -1; // cached answer of ncclCuMemHostEnable(); -1 means not computed yet
 int ncclCuMemHostEnable() {
+  if (ncclCumemHostEnable != -1)
+    return ncclCumemHostEnable;
 #if CUDART_VERSION < 12020
-  return 0;
+  ncclCumemHostEnable = 0;
+  return ncclCumemHostEnable;
 #else
-  return ncclParamCuMemHostEnable();
+  ncclResult_t ret = ncclSuccess;
+  int cudaDriverVersion;
+  int paramValue = -1;
+  CUDACHECKGOTO(cudaDriverGetVersion(&cudaDriverVersion), ret, error); // on failure nothing is cached, so the next call queries the driver again
+  if (cudaDriverVersion < 12020) { // driver too old: disabled regardless of NCCL_CUMEM_HOST_ENABLE
+    ncclCumemHostEnable = 0;
+  }
+  else {
+    paramValue = ncclParamCuMemHostEnable();
+    if (paramValue != -1) // an explicit NCCL_CUMEM_HOST_ENABLE value is used as is
+      ncclCumemHostEnable = paramValue;
+    else
+      ncclCumemHostEnable = (cudaDriverVersion >= 12060) ? 1 : 0; // unset (-1): enabled by default from driver version 12060 on
+  }
+  return ncclCumemHostEnable;
+error:
+  return (ret == ncclSuccess); // NOTE(review): ret is presumably != ncclSuccess on this path, making this 0 - confirm against CUDACHECKGOTO
 #endif
 }

@@ -218,10 +235,9 @@ static void initOnceFunc() {
  // Determine whether we support the cuMem APIs or not
  ncclCuMemSupported = ncclIsCuMemSupported();

-#if 12020 <= CUDART_VERSION && CUDART_VERSION <= 12030
-  /* To use cuMem* for host memory allocation, we need to create context on each
-   * visible device. This is workaround needed in CUDA 12.3 which is fixed in 12.4. */
-  if (ncclCuMemSupported && ncclCuMemHostEnable()) {
+  /* To use cuMem* for host memory allocation, we need to create context on each visible device.
+   * This is a workaround needed in CUDA 12.2 and CUDA 12.3 which is fixed in 12.4. */
+  if (ncclCuMemSupported && ncclCuMemHostEnable() && 12020 <= driverVersion && driverVersion <= 12030) {
    int deviceCnt, saveDevice;
    cudaGetDevice(&saveDevice);
    cudaGetDeviceCount(&deviceCnt);
@@ -231,7 +247,6 @@ static void initOnceFunc() {
    }
    cudaSetDevice(saveDevice);
  }
-#endif
  initResult = ret;
  return;
 error:
@@ -8,6 +8,7 @@
 #include <sys/types.h>
 #include <unistd.h>

+#include "ibvcore.h"
 #include "ibvsymbols.h"

 static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
@@ -53,7 +54,7 @@ ncclResult_t wrap_ibv_symbols(void) {
  } \
  int ret = container.call; \
  if (ret == ENOTSUP || ret == EOPNOTSUPP) { \
-    INFO(NCCL_NET, "Call to " name " failed with error %s errno %d", strerror(ret), ret); \
+    INFO(NCCL_NET, "Call to " name " not supported"); \
    *supported = 0; \
    return ncclSuccess; \
  } else if (ret != success_retval) { \
@@ -87,6 +88,14 @@ ncclResult_t wrap_ibv_symbols(void) {
  container.call; \
  return ncclSuccess;

+NCCL_PARAM(IbMQpRetryAll, "IB_MQP_RETRY_ALL", 0); // non-zero: retry ibv_modify_qp on any error, not only ETIMEDOUT
+NCCL_PARAM(IbMQpRetryCnt, "IB_MQP_RETRY_CNT", 34); // number of retries after the first attempt
+NCCL_PARAM(IbMQpRetryTimeout, "IB_MQP_RETRY_SLEEP_MSEC", 100); // base sleep in milliseconds, multiplied by the retry number
+
+#define IBV_ERR_EQ(e, code)        ((e) == (code) || (e) == -(code)) // matches code reported with either sign
+#define IBV_MQP_RETRY_ERRNO(e)     (IBV_ERR_EQ(e, ETIMEDOUT))
+#define IBV_MQP_RETRY_ERRNO_ALL(e) (ncclParamIbMQpRetryAll() ? ((e) != 0) : IBV_MQP_RETRY_ERRNO(e))
+
 ncclResult_t wrap_ibv_fork_init() {
  IBV_INT_CHECK(ibvSymbols, ibv_internal_fork_init, ibv_internal_fork_init(), -1, "ibv_fork_init");
 }
@@ -202,8 +211,87 @@ ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct i
  IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_create_qp, ibv_internal_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp");
 }

-ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
-  IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_modify_qp, ibv_internal_modify_qp(qp, attr, attr_mask), 0, "ibv_modify_qp");
+// Write a printable name for QP state `state` into msg (at most len bytes, formatted with snprintf).
+static void ibvQpStateName(enum ibv_qp_state state, char* msg, const size_t len) {
+  const char* label = NULL;
+  if (state == IBV_QPS_RESET) label = "RESET";
+  else if (state == IBV_QPS_INIT) label = "INIT";
+  else if (state == IBV_QPS_RTR) label = "RTR";
+  else if (state == IBV_QPS_RTS) label = "RTS";
+  else if (state == IBV_QPS_SQD) label = "SQD";
+  else if (state == IBV_QPS_SQE) label = "SQE";
+  else if (state == IBV_QPS_ERR) label = "ERR";
+  else if (state == IBV_QPS_UNKNOWN) label = "UNKNOWN";
+  if (label != NULL) snprintf(msg, len, "%s", label); else snprintf(msg, len, "NOT RECOGNIZED (%d)", state);
+}
+
+#define QP_ATTR(attr, userAttr, userFlag, mask) (((userFlag) & (mask)) ? (userAttr) : (attr)) // userAttr when userFlag selects mask, else attr
+
+// Describe a QP transition (device:port, current/next state, GID index and GIDs) in msg, at most msgLen bytes.
+// Port and address info come from userAttr when userFlag selects them, otherwise from wrap_ibv_query_qp.
+static void ibvModifyQpLog(struct ibv_qp* qp, enum ibv_qp_state qpState, struct ibv_qp_attr* userAttr, int userFlag, char* msg, size_t msgLen) {
+  ncclResult_t res;
+  int portNum = -1, gidIndex = -1;
+  char localGidName[INET6_ADDRSTRLEN], remoteGidName[INET6_ADDRSTRLEN];
+  const char *localGidRes = NULL, *remoteGidRes = NULL;
+
+  char nextState[32], currState[32];
+  ibvQpStateName(qp->state, currState, sizeof(currState));
+  ibvQpStateName(qpState, nextState, sizeof(nextState));
+  char devName[IBV_SYSFS_NAME_MAX] = "";
+  snprintf(devName, sizeof(devName), "%s", (qp->pd->context) ? wrap_ibv_get_device_name(qp->pd->context->device) : "N/A");
+
+  struct ibv_qp_attr attr;
+  struct ibv_qp_init_attr init_attr;
+  int attr_mask = IBV_QP_PORT | IBV_QP_AV;
+  res = wrap_ibv_query_qp(qp, &attr, attr_mask, &init_attr);
+  struct ibv_qp_attr *qpAttr = (res == ncclSuccess) ? &attr : NULL;
+  // port info, portAttr can be NULL if not given by the user and query_qp failed
+  struct ibv_qp_attr *portAttr = QP_ATTR(qpAttr, userAttr, userFlag, IBV_QP_PORT);
+  portNum = portAttr ? portAttr->port_num : -1;
+  // address info, avAttr can be NULL if not given by the user and query_qp failed
+  struct ibv_qp_attr *avAttr = QP_ATTR(qpAttr, userAttr, userFlag, IBV_QP_AV);
+  if (avAttr && avAttr->ah_attr.is_global) {
+    union ibv_gid *remoteGid = &avAttr->ah_attr.grh.dgid;
+    remoteGidRes = ibvGetGidStr(remoteGid, remoteGidName, sizeof(remoteGidName));
+    // we need pd->context to retrieve local GID, skip if not there
+    if (!qp->pd->context) goto print;
+    gidIndex =  avAttr->ah_attr.grh.sgid_index;
+    // without a known port the GID table cannot be queried: -1 would be passed on as a bogus port number
+    if (portNum < 0) goto print;
+    union ibv_gid localGid;
+    NCCLCHECKGOTO(wrap_ibv_query_gid(qp->pd->context, portNum, gidIndex, &localGid), res, print);
+    localGidRes = ibvGetGidStr(&localGid, localGidName, sizeof(localGidName));
+  }
+print:
+  snprintf(msg, msgLen, "on dev %s:%d, curr state %s, next state %s, local GID index %d, local GID %s, remote GID %s",
+           devName, portNum, currState, nextState, gidIndex, localGidRes ? localGidName : "N/A", remoteGidRes ? remoteGidName : "N/A");
+}
+
+// Call ibv_modify_qp, retrying while IBV_MQP_RETRY_ERRNO_ALL(ret) holds, with a sleep that grows linearly per retry; returns ncclSystemError on final failure.
+ncclResult_t wrap_ibv_modify_qp(struct ibv_qp* qp, struct ibv_qp_attr* attr, int attr_mask) {
+  char qpMsg[1024];
+  int ret = 0, attempts = 0;
+  int maxCnt = (int)ncclParamIbMQpRetryCnt() + 1; // number of attempts = number of retries + 1
+  int timeOut = (int)ncclParamIbMQpRetryTimeout();
+  CHECK_NOT_NULL(ibvSymbols, ibv_internal_modify_qp);
+  do {
+    if (attempts > 0) {
+      unsigned int sleepTime = (timeOut > 0) ? (unsigned int)(timeOut * attempts) : 0; // a negative setting must not wrap into a huge delay
+      ibvModifyQpLog(qp, attr->qp_state, attr, attr_mask, qpMsg, sizeof(qpMsg));
+      INFO(NCCL_NET, "Call to ibv_modify_qp failed with %d %s, %s, retrying %d/%d after %u msec of sleep", ret, strerror(ret < 0 ? -ret : ret), qpMsg, attempts, maxCnt, sleepTime);
+      struct timespec tv = {.tv_sec = sleepTime / 1000, .tv_nsec = (sleepTime % 1000) * ((long)1e6)}; // sleep before retrying
+      nanosleep(&tv, NULL);
+    }
+    ret = ibvSymbols.ibv_internal_modify_qp(qp, attr, attr_mask); // IBV_ERR_EQ allows for code or -code, so the logs normalize the sign for strerror
+    attempts++;
+  } while (IBV_MQP_RETRY_ERRNO_ALL(ret) && attempts < maxCnt);
+  if (ret != 0) {
+    ibvModifyQpLog(qp, attr->qp_state, attr, attr_mask, qpMsg, sizeof(qpMsg));
+    WARN("Call to ibv_modify_qp failed with %d %s, %s", ret, strerror(ret < 0 ? -ret : ret), qpMsg);
+    return ncclSystemError;
+  }
+  return ncclSuccess;
+}

 ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
@@ -189,14 +189,16 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen,

  TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d fd %d to UDS socket %s", hdr, hdrLen, sendFd, temp);

-  msg.msg_control = control_un.control;
-  msg.msg_controllen = sizeof(control_un.control);
+  if (sendFd != -1) {
+    msg.msg_control = control_un.control;
+    msg.msg_controllen = sizeof(control_un.control);

-  cmptr = CMSG_FIRSTHDR(&msg);
-  cmptr->cmsg_len = CMSG_LEN(sizeof(int));
-  cmptr->cmsg_level = SOL_SOCKET;
-  cmptr->cmsg_type = SCM_RIGHTS;
-  memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
+    cmptr = CMSG_FIRSTHDR(&msg);
+    cmptr->cmsg_len = CMSG_LEN(sizeof(int));
+    cmptr->cmsg_level = SOL_SOCKET;
+    cmptr->cmsg_type = SCM_RIGHTS;
+    memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
+  }

  msg.msg_name = (void *)&cliaddr;
  msg.msg_namelen = sizeof(struct sockaddr_un);
@@ -311,19 +311,19 @@ ncclResult_t ncclNvmlGetCCStatus(struct ncclNvmlCCStatus *status) {
      status->CCEnabled = false;

    if (ccInfo.settingV12040.multiGpuMode == NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE)
-      status->multiGpuCCEnabled = true;
+      status->multiGpuProtectedPCIE = true;
    else
-      status->multiGpuCCEnabled = false;
+      status->multiGpuProtectedPCIE = false;
  } else if (pfn_nvmlSystemGetConfComputeState != NULL) {
    NVMLTRY(nvmlSystemGetConfComputeState, &ccInfo.settingV12020);
    if (ccInfo.settingV12020.ccFeature == NVML_CC_SYSTEM_FEATURE_ENABLED)
      status->CCEnabled = true;
    else
      status->CCEnabled = false;
-    status->multiGpuCCEnabled = false;
+    status->multiGpuProtectedPCIE = false;
  } else {
    status->CCEnabled = false;
-    status->multiGpuCCEnabled = false;
+    status->multiGpuProtectedPCIE = false;
  }
  return ncclSuccess;
 }
@@ -16,9 +16,110 @@ static pthread_mutex_t profilerLock = PTHREAD_MUTEX_INITIALIZER;
 static int profilerPluginRefCount;
 static void* profilerPluginLib;
 static ncclProfiler_t* ncclProfiler;
+static ncclProfiler_v2_t ncclProfiler_v1_as_v2;
+static ncclProfiler_v1_t* ncclProfiler_v1;
+
+static uint8_t ncclStringToFunc(const char* func) { // exact, case-sensitive name -> ncclFunc* value, returned as uint8_t
+  if (0 == strcmp(func, "AllGather")) return ncclFuncAllGather;
+  if (0 == strcmp(func, "AllReduce")) return ncclFuncAllReduce;
+  if (0 == strcmp(func, "Broadcast")) return ncclFuncBroadcast;
+  if (0 == strcmp(func, "Recv")) return ncclFuncRecv;
+  if (0 == strcmp(func, "Reduce")) return ncclFuncReduce;
+  if (0 == strcmp(func, "ReduceScatter")) return ncclFuncReduceScatter;
+  if (0 == strcmp(func, "SendRecv")) return ncclFuncSendRecv;
+  return ncclFuncSend; // fallback: every unmatched string (there is no explicit "Send" test) maps to ncclFuncSend
+}
+
+static uint8_t ncclStringToAlgo(const char* algo) { // exact, case-sensitive algorithm name -> NCCL_ALGO_* value
+  if (0 == strcmp(algo, "TREE")) return NCCL_ALGO_TREE;
+  if (0 == strcmp(algo, "RING")) return NCCL_ALGO_RING;
+  if (0 == strcmp(algo, "COLLNET_DIRECT")) return NCCL_ALGO_COLLNET_DIRECT;
+  if (0 == strcmp(algo, "COLLNET_CHAIN")) return NCCL_ALGO_COLLNET_CHAIN;
+  if (0 == strcmp(algo, "NVLS")) return NCCL_ALGO_NVLS;
+  if (0 == strcmp(algo, "NVLS_TREE")) return NCCL_ALGO_NVLS_TREE;
+  return NCCL_ALGO_PAT; // fallback: every unmatched string maps to NCCL_ALGO_PAT
+}
+
+static uint8_t ncclStringToProto(const char* proto) { // exact, case-sensitive protocol name -> NCCL_PROTO_* value
+  if (0 == strcmp(proto, "LL")) return NCCL_PROTO_LL;
+  if (0 == strcmp(proto, "LL128")) return NCCL_PROTO_LL128;
+  return NCCL_PROTO_SIMPLE; // fallback: every unmatched string maps to NCCL_PROTO_SIMPLE
+}
+
+static uint8_t ncclStringToDatatype(const char* dt) { // exact, case-sensitive datatype name -> ncclDataType value, returned as uint8_t
+  if (0 == strcmp(dt, "ncclInt8")) return ncclInt8;
+  if (0 == strcmp(dt, "ncclInt32")) return ncclInt32;
+  if (0 == strcmp(dt, "ncclUint32")) return ncclUint32;
+  if (0 == strcmp(dt, "ncclInt64")) return ncclInt64;
+  if (0 == strcmp(dt, "ncclUint64")) return ncclUint64;
+  if (0 == strcmp(dt, "ncclFloat16")) return ncclFloat16;
+  if (0 == strcmp(dt, "ncclFloat32")) return ncclFloat32;
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+  if (0 == strcmp(dt, "ncclBfloat16")) return ncclBfloat16; // only compiled in when the bf16 type macro is defined
+#endif
+  return ncclFloat64; // fallback for every unmatched string; NOTE(review): no "ncclUint8" entry above, so that name would land here too - confirm intended
+}
+
+static ncclResult_t ncclProfiler_v1_as_v2_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr) { // v2-shaped startEvent that forwards to a v1 plugin
+  ncclProfilerEventDescr_v1_t eDescr_v1 = { 0 }; // v1 fields not assigned below stay zero
+  eDescr_v1.type = eDescr->type;
+  eDescr_v1.parentObj = eDescr->parentObj;
+  eDescr_v1.rank = eDescr->rank;
+  switch(eDescr->type) {
+    case ncclProfileGroup: break; // only the common fields above are forwarded
+    case ncclProfileColl: {
+      eDescr_v1.coll.name = eDescr->coll.name;
+      eDescr_v1.coll.commHash = eDescr->coll.commHash;
+      eDescr_v1.coll.seqNumber = eDescr->coll.seqNumber;
+      eDescr_v1.coll.func = ncclStringToFunc(eDescr->coll.func); // v2 passes the function as a string, v1 stores a numeric id
+      eDescr_v1.coll.sendBuff = eDescr->coll.sendBuff;
+      eDescr_v1.coll.recvBuff = eDescr->coll.recvBuff;
+      eDescr_v1.coll.count = eDescr->coll.count;
+      eDescr_v1.coll.root = eDescr->coll.root;
+      eDescr_v1.coll.datatype = ncclStringToDatatype(eDescr->coll.datatype); // string -> numeric datatype
+      eDescr_v1.coll.op = 0; // removed in v2
+      eDescr_v1.coll.trafficBytes = eDescr->coll.trafficBytes;
+      eDescr_v1.coll.nMaxChannels = eDescr->coll.nMaxChannels;
+      eDescr_v1.coll.nWarps = eDescr->coll.nWarps;
+      eDescr_v1.coll.algo = ncclStringToAlgo(eDescr->coll.algo); // string -> NCCL_ALGO_*
+      eDescr_v1.coll.proto = ncclStringToProto(eDescr->coll.proto); // string -> NCCL_PROTO_*
+    } break;
+    case ncclProfileP2p: {
+      eDescr_v1.p2p.name = eDescr->p2p.name;
+      eDescr_v1.p2p.commHash = eDescr->p2p.commHash;
+      eDescr_v1.p2p.func = ncclStringToFunc(eDescr->p2p.func); // string -> numeric function id
+      eDescr_v1.p2p.buff = eDescr->p2p.buff;
+      eDescr_v1.p2p.count = eDescr->p2p.count;
+      eDescr_v1.p2p.datatype = ncclStringToDatatype(eDescr->p2p.datatype); // string -> numeric datatype
+      eDescr_v1.p2p.peer = eDescr->p2p.peer;
+    } break;
+    case ncclProfileProxyOp: {
+      eDescr_v1.proxyOp.pid = eDescr->proxyOp.pid;
+      eDescr_v1.proxyOp.channelId = eDescr->proxyOp.channelId;
+      eDescr_v1.proxyOp.peer = eDescr->proxyOp.peer;
+      eDescr_v1.proxyOp.nSteps = eDescr->proxyOp.nSteps;
+      eDescr_v1.proxyOp.chunkSize = eDescr->proxyOp.chunkSize;
+      eDescr_v1.proxyOp.isSend = eDescr->proxyOp.isSend;
+    } break;
+    case ncclProfileProxyStep: {
+      eDescr_v1.proxyStep.step = eDescr->proxyStep.step;
+    } break;
+    case ncclProfileProxyCtrl: break; // only the common fields are forwarded
+    default:; // any other type: only type/parentObj/rank are forwarded
+  }
+  return ncclProfiler_v1->startEvent(context, eHandle, &eDescr_v1); // eDescr_v1 is a local, so the pointer is only valid during this call
+}
+
+static ncclResult_t ncclProfiler_v1_as_v2_init(void** context, int* eActivationMask) { // init entry of the v1-as-v2 shim: wires the table, then runs the v1 plugin's init
+  ncclProfiler_v1_as_v2.startEvent = ncclProfiler_v1_as_v2_startEvent; // needs v2 -> v1 descriptor translation
+  ncclProfiler_v1_as_v2.stopEvent = ncclProfiler_v1->stopEvent; // remaining entry points are forwarded unchanged
+  ncclProfiler_v1_as_v2.recordEventState = ncclProfiler_v1->recordEventState;
+  ncclProfiler_v1_as_v2.finalize = ncclProfiler_v1->finalize;
+  // Report the plugin's own init status instead of discarding it and always returning ncclSuccess.
+  return ncclProfiler_v1->init(context, eActivationMask);
+}

 #define MAX_STR_LEN 256
-#define NCCL_PROFILER_PLUGIN_SYMBOL "ncclProfiler_v1"

 static void* tryOpenLib(char* name, int *err, char* errStr) {
  if (nullptr == name || strlen(name) == 0) {
@@ -33,7 +134,7 @@ static void* tryOpenLib(char* name, int *err, char* errStr) {
  if (nullptr == handle) {
    strncpy(errStr, dlerror(), MAX_STR_LEN);
    errStr[MAX_STR_LEN] = 0;
-    if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) {
+    if (name && strstr(errStr, name) && strstr(errStr, "No such file or directory")) {
      *err = ENOENT;
    }
  }
@@ -116,10 +217,21 @@ static ncclResult_t ncclProfilerPluginLoad(void) {
    goto fail;
  }

-  ncclProfiler = (ncclProfiler_t*)dlsym(profilerPluginLib, NCCL_PROFILER_PLUGIN_SYMBOL);
+  ncclProfiler = (ncclProfiler_v2_t*)dlsym(profilerPluginLib, "ncclProfiler_v2");
  if (ncclProfiler == nullptr) {
-    INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find " NCCL_PROFILER_PLUGIN_SYMBOL ".");
-    goto fail;
+    INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v2.");
+    ncclProfiler_v1 = (ncclProfiler_v1_t*)dlsym(profilerPluginLib, "ncclProfiler_v1");
+    if (ncclProfiler_v1 == nullptr) {
+      INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v1.");
+      goto fail;
+    } else {
+      ncclProfiler = &ncclProfiler_v1_as_v2;
+      ncclProfiler_v1_as_v2.name = ncclProfiler_v1->name;
+      ncclProfiler_v1_as_v2.init = ncclProfiler_v1_as_v2_init;
+      INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded ncclProfiler_v1.");
+    }
+  } else {
+    INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded ncclProfiler_v2.");
  }

  ++profilerPluginRefCount;
@@ -247,7 +359,7 @@ ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan) {
  eActivationMaskGroup = __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED);
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    if (eActivationMaskGroup & (ncclProfileColl | ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep)) {
-      ncclProfilerEventDescr_v1_t eDescr = { 0 };
+      ncclProfilerEventDescr_t eDescr = { 0 };
      eDescr.type = ncclProfileGroup;
      ncclProfiler->startEvent(plan->comm->profilerContext, &plan->groupEventHandle, &eDescr);
    }
@@ -279,20 +391,17 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) {
        eDescr.coll.name = plan->comm->commName;
        eDescr.coll.commHash = plan->comm->commHash;
        eDescr.coll.seqNumber = plan->comm->seqNumber[ct->func]++;
-        eDescr.coll.func = ct->func;
+        eDescr.coll.func = ncclFuncToString(ct->func);
        eDescr.coll.sendBuff = ct->sendbuff;
        eDescr.coll.recvBuff = ct->recvbuff;
        eDescr.coll.count = ct->count;
        eDescr.coll.root = ct->root;
-        eDescr.coll.datatype = ct->datatype;
-        eDescr.coll.op = ct->opHost;
+        eDescr.coll.datatype = ncclDatatypeToString(ct->datatype);
        eDescr.coll.trafficBytes = ct->trafficBytes;
        eDescr.coll.nMaxChannels = ct->nMaxChannels;
        eDescr.coll.nWarps = ct->nWarps;
-        eDescr.coll.algo = ct->algorithm;
-        eDescr.coll.proto = ct->protocol;
-        eDescr.coll.isCollnet = ct->isCollnet;
-        eDescr.coll.isNvls = ct->isNvls;
+        eDescr.coll.algo = ncclAlgoToString(ct->algorithm);
+        eDescr.coll.proto = ncclProtoToString(ct->protocol);
        ncclProfiler->startEvent(plan->comm->profilerContext, &ct->eventHandle, &eDescr);

        // update collective task with group event activation mask
@@ -307,10 +416,10 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) {
        eDescr.rank = plan->comm->rank;
        eDescr.p2p.name = plan->comm->commName;
        eDescr.p2p.commHash = plan->comm->commHash;
-        eDescr.p2p.func = pt->func;
+        eDescr.p2p.func = ncclFuncToString(pt->func);
        eDescr.p2p.buff = pt->buff;
        eDescr.p2p.count = pt->count;
-        eDescr.p2p.datatype = pt->datatype;
+        eDescr.p2p.datatype = ncclDatatypeToString(pt->datatype);
        eDescr.p2p.peer = pt->root;
        ncclProfiler->startEvent(plan->comm->profilerContext, &pt->eventHandle, &eDescr);

@@ -345,6 +454,11 @@ ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan) {
  return ncclSuccess;
 }

+// Below we set the proxy descriptor step number to DIVUP(step, args->sliceSteps).
+// The reason is that for some ncclOp (e.g. AllReduce) one network transfer is
+// made of sliceSteps steps rather than one step. In the profiler we are still
+// interested in whole network transfers though, so we account for this when
+// computing the actual network step number.
 ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args) {
  TIME_START_EVENT(proxyOpStart);
  struct ncclProxySubArgs* sub = &args->subs[s];
@@ -354,13 +468,13 @@ ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args
      eDescr.type = ncclProfileProxyOp;
      eDescr.parentObj = sub->taskEventHandle;
      eDescr.rank = sub->rank;
-      eDescr.proxyOp.pid = args->pid;
+      eDescr.proxyOp.pid = sub->pid;
      eDescr.proxyOp.channelId = sub->channelId;
      eDescr.proxyOp.peer = sub->peer;
-      eDescr.proxyOp.nSteps = sub->nsteps;
-      eDescr.proxyOp.chunkSize = args->chunkSize;
+      eDescr.proxyOp.nSteps = DIVUP(sub->nsteps, args->sliceSteps);
+      eDescr.proxyOp.chunkSize = args->chunkSize * args->sliceSteps;
      eDescr.proxyOp.isSend = 1;
-      ncclProfiler->startEvent(args->profilerContext, &sub->opEventHandle, &eDescr);
+      ncclProfiler->startEvent(sub->profilerContext, &sub->opEventHandle, &eDescr);
    }
  }
  TIME_STOP_EVENT(proxyOpStart);
@@ -376,13 +490,13 @@ ncclResult_t ncclProfilerStartRecvProxyOpEvent(int s, struct ncclProxyArgs* args
      eDescr.type = ncclProfileProxyOp;
      eDescr.parentObj = sub->taskEventHandle;
      eDescr.rank = sub->rank;
-      eDescr.proxyOp.pid = args->pid;
+      eDescr.proxyOp.pid = sub->pid;
      eDescr.proxyOp.channelId = sub->channelId;
      eDescr.proxyOp.peer = sub->peer;
-      eDescr.proxyOp.nSteps = sub->nsteps;
-      eDescr.proxyOp.chunkSize = args->chunkSize;
+      eDescr.proxyOp.nSteps = DIVUP(sub->nsteps, args->sliceSteps);
+      eDescr.proxyOp.chunkSize = args->chunkSize * args->sliceSteps;
      eDescr.proxyOp.isSend = 0;
-      ncclProfiler->startEvent(args->profilerContext, &sub->opEventHandle, &eDescr);
+      ncclProfiler->startEvent(sub->profilerContext, &sub->opEventHandle, &eDescr);
    }
  }
  TIME_STOP_EVENT(proxyOpStart);
@@ -400,53 +514,50 @@ ncclResult_t ncclProfilerStopProxyOpEvent(int s, struct ncclProxyArgs* args) {
  return ncclSuccess;
 }

-ncclResult_t ncclProfilerStartSendProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) {
+ncclResult_t ncclProfilerStartSendProxyStepEvent(int s, struct ncclProxyArgs* args, int stepId) { // starts a single ProxyStep event for sub-op s instead of a [stepLo, stepHi) range
  TIME_START_EVENT(proxyStepStart);
  struct ncclProxySubArgs* sub = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    if (sub->opEventHandle && (sub->eActivationMask & ncclProfileProxyStep)) {
-      for (uint64_t step = stepLo; step < stepHi; step++) {
-        ncclProfilerEventDescr_t eDescr = { 0 };
-        eDescr.type = ncclProfileProxyStep;
-        eDescr.parentObj = sub->opEventHandle;
-        eDescr.rank = sub->rank;
-        eDescr.proxyStep.step = step;
-        ncclProfiler->startEvent(args->profilerContext, &sub->stepEventHandles[step%NCCL_STEPS], &eDescr);
-      }
+      int step_ = DIVUP(stepId, args->sliceSteps); // step index scaled down by sliceSteps, rounded up
+      ncclProfilerEventDescr_t eDescr = { 0 };
+      eDescr.type = ncclProfileProxyStep;
+      eDescr.parentObj = sub->opEventHandle; // the step event is a child of the sub-op's ProxyOp event
+      eDescr.rank = sub->rank;
+      eDescr.proxyStep.step = step_;
+      ncclProfiler->startEvent(sub->profilerContext, &sub->stepEventHandles[step_%NCCL_STEPS], &eDescr); // handle stored in slot step_ % NCCL_STEPS; profiler context now comes from sub, not args
    }
  }
  TIME_STOP_EVENT(proxyStepStart);
  return ncclSuccess;
 }

-ncclResult_t ncclProfilerStartRecvProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) {
+ncclResult_t ncclProfilerStartRecvProxyStepEvent(int s, struct ncclProxyArgs* args, int stepId) { // starts a single ProxyStep event for sub-op s instead of a [stepLo, stepHi) range
  TIME_START_EVENT(proxyStepStart);
  struct ncclProxySubArgs* sub = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    if (sub->opEventHandle && (sub->eActivationMask & ncclProfileProxyStep)) {
-      for (uint64_t step = stepLo; step < stepHi; step++) {
-        ncclProfilerEventDescr_t eDescr = { 0 };
-        eDescr.type = ncclProfileProxyStep;
-        eDescr.parentObj = sub->opEventHandle;
-        eDescr.rank = sub->rank;
-        eDescr.proxyStep.step = step;
-        ncclProfiler->startEvent(args->profilerContext, &sub->stepEventHandles[step%NCCL_STEPS], &eDescr);
-      }
+      int step_ = DIVUP(stepId, args->sliceSteps); // step index scaled down by sliceSteps, rounded up
+      ncclProfilerEventDescr_t eDescr = { 0 };
+      eDescr.type = ncclProfileProxyStep;
+      eDescr.parentObj = sub->opEventHandle; // the step event is a child of the sub-op's ProxyOp event
+      eDescr.rank = sub->rank;
+      eDescr.proxyStep.step = step_;
+      ncclProfiler->startEvent(sub->profilerContext, &sub->stepEventHandles[step_%NCCL_STEPS], &eDescr); // handle stored in slot step_ % NCCL_STEPS; profiler context now comes from sub, not args
    }
  }
  TIME_STOP_EVENT(proxyStepStart);
  return ncclSuccess;
 }

-ncclResult_t ncclProfilerStopProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) {
+ncclResult_t ncclProfilerStopProxyStepEvent(int s, struct ncclProxyArgs* args, int stepId) {
  TIME_START_EVENT(proxyStepStop);
  struct ncclProxySubArgs* sub = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
-    for (uint64_t step = stepLo; step < stepHi; step++) {
-      if (sub->stepEventHandles[step%NCCL_STEPS]) {
-        ncclProfiler->stopEvent(sub->stepEventHandles[step%NCCL_STEPS]);
-        sub->stepEventHandles[step%NCCL_STEPS] = NULL;
-      }
+    int step_ = DIVUP(stepId, args->sliceSteps);
+    if (sub->stepEventHandles[step_%NCCL_STEPS]) {
+      ncclProfiler->stopEvent(sub->stepEventHandles[step_%NCCL_STEPS]);
+      sub->stepEventHandles[step_%NCCL_STEPS] = NULL;
    }
  }
  TIME_STOP_EVENT(proxyStepStop);
@@ -484,8 +595,8 @@ ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* ar
  TIME_START_EVENT(proxyOpRecord);
  struct ncclProxySubArgs* sub = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0) && sub->opEventHandle) {
-    ncclProfilerEventStateArgs_t a = { 0 };
-    a.proxyOp.steps = steps;
+    ncclProfilerEventStateArgs_t a = { };
+    a.proxyOp.steps = DIVUP(steps, args->sliceSteps);
    a.proxyOp.transSize = transSize;
    ncclProfiler->recordEventState(sub->opEventHandle, eState, &a);
  }
@@ -493,14 +604,13 @@ ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* ar
  return ncclSuccess;
 }

-ncclResult_t ncclProfilerRecordProxyStepEventStates(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi, ncclProfilerEventState_t eState) {
+ncclResult_t ncclProfilerRecordProxyStepEventState(int s, struct ncclProxyArgs* args, int stepId, ncclProfilerEventState_t eState) {
  TIME_START_EVENT(proxyStepRecord);
  struct ncclProxySubArgs* sub = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0) && sub->opEventHandle) {
-    for (uint64_t step = stepLo; step < stepHi; step++) {
-      if (sub->stepEventHandles[step%NCCL_STEPS]) {
-        ncclProfiler->recordEventState(sub->stepEventHandles[step%NCCL_STEPS], eState, 0);
-      }
+    int step_ = DIVUP(stepId, args->sliceSteps);
+    if (sub->stepEventHandles[step_%NCCL_STEPS]) {
+      ncclProfiler->recordEventState(sub->stepEventHandles[step_%NCCL_STEPS], eState, 0);
    }
  }
  TIME_STOP_EVENT(proxyStepRecord);
@@ -510,7 +620,7 @@ ncclResult_t ncclProfilerRecordProxyStepEventStates(int s, struct ncclProxyArgs*
 ncclResult_t ncclProfilerRecordProxyCtrlEventState(void* eHandle, int appended, ncclProfilerEventState_t eState) {
  TIME_START_EVENT(proxyCtrlRecord);
  if (__builtin_expect(ncclProfiler != NULL, 0) && eHandle && __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED) & ncclProfileProxyCtrl) {
-    ncclProfilerEventStateArgs_t args = { 0 };
+    ncclProfilerEventStateArgs_t args = { };
    args.proxyCtrl.appendedProxyOps = appended;
    ncclProfiler->recordEventState(eHandle, eState, &args);
  }
@@ -45,7 +45,7 @@ static void shmHandleInit(int fd, char* shmPath, size_t shmSize, size_t realShmS
  return;
 }

-ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle) {
+ncclResult_t ncclShmOpen(char* shmPath, size_t shmPathSize, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle) {
  int fd = -1;
  char* hptr = NULL;
  void* dptr = NULL;
@@ -62,7 +62,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
     * refcount references; when the peer attaches, it should pass -1 to reduce one reference count. When it
     * goes down to 0, unlink should be called in order to delete shared memory file. */
    if (shmPath[0] == '\0') {
-      sprintf(shmPath, "/dev/shm/nccl-XXXXXX");
+      snprintf(shmPath, shmPathSize, "/dev/shm/nccl-XXXXXX");
    retry_mkstemp:
      fd = mkstemp(shmPath);
      if (fd < 0) {
@@ -70,7 +70,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
          INFO(NCCL_ALL, "mkstemp: Failed to create %s, error: %s (%d) - retrying", shmPath, strerror(errno), errno);
          goto retry_mkstemp;
        }
-        WARN("Error: failed to create shared memory file %p, error %s (%d)", shmPath, strerror(errno), errno);
+        WARN("Error: failed to create shared memory file %s, error %s (%d)", shmPath, strerror(errno), errno);
        ret = ncclSystemError;
        goto fail;
      }
@@ -12,6 +12,18 @@
 #include <ifaddrs.h>
 #include <net/if.h>
 #include "param.h"
+#include <time.h>
+
+NCCL_PARAM(RetryCnt, "SOCKET_RETRY_CNT", 34); // read through ncclParamRetryCnt(); 34 is presumably the default value - confirm against the NCCL_PARAM macro
+NCCL_PARAM(RetryTimeOut, "SOCKET_RETRY_SLEEP_MSEC", 100); // read through ncclParamRetryTimeOut(); presumably a default of 100 msec - confirm against the NCCL_PARAM macro
+static void msleep(unsigned int time_msec) { // sleeps for time_msec milliseconds using nanosleep()
+  const long c_1e6 = 1e6; // nanoseconds per millisecond
+  struct timespec tv = (struct timespec){
+      .tv_sec = time_msec / 1000,
+      .tv_nsec = (time_msec % 1000) * c_1e6,
+  };
+  nanosleep(&tv, NULL); // return value ignored and no remainder requested: an early return is not resumed
+}

 static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) {
  int bytes = 0;
@@ -26,8 +38,13 @@ static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr
      return ncclSuccess;
    }
    if (bytes == -1) {
+      if ((op == NCCL_SOCKET_SEND && errno == EPIPE) || (op == NCCL_SOCKET_RECV && errno == ECONNRESET)) {
+        *closed = 1;
+        return ncclSuccess;
+      }
      if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
-        WARN("socketProgressOpt: Call to recv from %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno));
+        WARN("socketProgressOpt: Call to %s %s failed : %s", (op == NCCL_SOCKET_RECV ? "recv from" : "send to"),
+             ncclSocketToString(&sock->addr, line), strerror(errno));
        return ncclRemoteError;
      } else {
        bytes = 0;
@@ -38,17 +55,22 @@ static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr
      INFO(NCCL_NET, "socketProgressOpt: abort called");
      return ncclInternalError;
    }
-  } while (bytes > 0 && (*offset) < size);
+  } while (sock->asyncFlag == 0 && bytes > 0 && (*offset) < size);
  return ncclSuccess;
 }

-static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
+static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int* pclosed = NULL) { // pclosed is optional: when given, a peer close is reported through it instead of as an error
  int closed;
  NCCLCHECK(socketProgressOpt(op, sock, ptr, size, offset, 0 /*block*/, &closed));
  if (closed) {
-    char line[SOCKET_NAME_MAXLEN+1];
-    WARN("socketProgress: Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0));
-    return ncclRemoteError;
+    if (pclosed) {
+      *pclosed = closed; // only written on the closed path; callers must pre-initialize *pclosed themselves
+      return ncclSuccess;
+    } else {
+      char line[SOCKET_NAME_MAXLEN+1];
+      WARN("socketProgress: Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0));
+      return ncclRemoteError; // no pclosed supplied: keep the previous behavior and fail
+    }
  }
  return ncclSuccess;
 }
@@ -63,9 +85,9 @@ static ncclResult_t socketWait(int op, struct ncclSocket* sock, void* ptr, int s
 *
 * Output: "IPv4/IPv6 address<port>"
 */
-const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) {
+const char *ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) {
  if (buf == NULL || addr == NULL) return NULL;
-  struct sockaddr *saddr = &addr->sa;
+  const struct sockaddr *saddr = &addr->sa;
  if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; }
  char host[NI_MAXHOST], service[NI_MAXSERV];
  /* NI_NUMERICHOST: If set, then the numeric form of the hostname is returned.
@@ -370,10 +392,9 @@ ncclResult_t ncclSocketListen(struct ncclSocket* sock) {
  if (socketToPort(&sock->addr)) {
    // Port is forced by env. Make sure we get the port.
    int opt = 1;
-#if defined(SO_REUSEPORT)
-    SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
-#else
    SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt");
+#if defined(SO_REUSEPORT)
+    SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
 #endif
  }

@@ -412,6 +433,15 @@ static ncclResult_t socketTryAccept(struct ncclSocket* sock) {
  sock->fd = accept(sock->acceptFd, (struct sockaddr*)&sock->addr, &socklen);
  if (sock->fd != -1) {
    sock->state = ncclSocketStateAccepted;
+  } else if (errno == ENETDOWN || errno == EPROTO || errno == ENOPROTOOPT || errno == EHOSTDOWN ||
+             errno == ENONET || errno == EHOSTUNREACH || errno == EOPNOTSUPP || errno == ENETUNREACH) {
+    /* per accept's man page, for linux sockets, the following errors might be already pending errors
+     * and should be considered as EAGAIN. To avoid infinite loop in case of errors, we use the retry count*/
+    if (++sock->errorRetries == ncclParamRetryCnt()) {
+      WARN("socketTryAccept: exceeded error retry count (%d), %s", sock->errorRetries, strerror(errno));
+      return ncclSystemError;
+    }
+    INFO(NCCL_ALL, "Call to accept returned %s, retrying", strerror(errno));
  } else if (errno != EAGAIN && errno != EWOULDBLOCK) {
    WARN("socketTryAccept: Accept failed: %s", strerror(errno));
    return ncclSystemError;
@@ -419,72 +449,118 @@ static ncclResult_t socketTryAccept(struct ncclSocket* sock) {
  return ncclSuccess;
 }

+static ncclResult_t socketSetFlags(struct ncclSocket* sock) { // applies O_NONBLOCK (conditionally) and TCP_NODELAY to sock->fd
+  const int one = 1;
+  /* Set socket as non-blocking if async or if we need to be able to abort */
+  if ((sock->asyncFlag || sock->abortFlag) && sock->fd >= 0) {
+    int flags;
+    SYSCHECK(flags = fcntl(sock->fd, F_GETFL), "fcntl"); // read the current flags so only O_NONBLOCK is added
+    SYSCHECK(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
+  }
+  SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt"); // not guarded by the fd >= 0 test above
+  return ncclSuccess;
+}
+
 static ncclResult_t socketFinalizeAccept(struct ncclSocket* sock) {
  uint64_t magic;
  enum ncclSocketType type;
-  int received = 0;
-  const int one = 1;
-  SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
+  int received;
+  // once accepted, linux sockets do NOT inherit file status flags such as O_NONBLOCK (BSD ones do)
+  NCCLCHECK(socketSetFlags(sock));

-  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received));
-  if (received == 0) return ncclSuccess;
-  NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received));
-  if (magic != sock->magic) {
-    WARN("socketFinalizeAccept: wrong magic %lx != %lx", magic, sock->magic);
-    close(sock->fd);
-    sock->fd = -1;
-    // Ignore spurious connection and accept again
-    sock->state = ncclSocketStateAccepting;
-    return ncclSuccess;
-  } else {
-    received = 0;
-    NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &type, sizeof(type), &received));
-    if (type != sock->type) {
-      WARN("socketFinalizeAccept: wrong type %d != %d", type, sock->type);
-      sock->state = ncclSocketStateError;
+  if (sock->asyncFlag == 0 || sock->finalizeCounter < sizeof(magic)) { // async: skip this stage once the magic was fully received on an earlier call
+    if (sock->asyncFlag == 0) {
+      received = 0;
+      NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received)); // blocking mode: wait for the whole magic
+    } else {
+      received = sock->finalizeCounter; // resume from the number of magic bytes already received
+      NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, sock, sock->finalizeBuffer, sizeof(magic), &received));
+      sock->finalizeCounter = received;
+      if (received < sizeof(magic)) return ncclSuccess; // incomplete: state is unchanged, a later call continues
+      memcpy(&magic, sock->finalizeBuffer, sizeof(magic));
+    }
+    if (magic != sock->magic) {
+      WARN("socketFinalizeAccept: wrong magic %lx != %lx", magic, sock->magic);
      close(sock->fd);
      sock->fd = -1;
-      return ncclInternalError;
-    } else {
-      sock->state = ncclSocketStateReady;
+      sock->finalizeCounter = 0; // Ignore spurious connection and accept again; restart byte counting so the next peer's magic is checked too
+      sock->state = ncclSocketStateAccepting;
+      return ncclSuccess;
    }
  }
+  if (sock->asyncFlag == 0) {
+    received = 0;
+    NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &type, sizeof(type), &received)); // blocking mode: wait for the whole type
+  } else {
+    received = sock->finalizeCounter - sizeof(magic); // finalizeCounter counts magic bytes plus type bytes handled so far
+    NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, sock, sock->finalizeBuffer, sizeof(type), &received));
+    sock->finalizeCounter = received + sizeof(magic);
+    if (received < sizeof(type)) return ncclSuccess; // incomplete: a later call continues
+    memcpy(&type, sock->finalizeBuffer, sizeof(type));
+  }
+  if (type != sock->type) {
+    WARN("socketFinalizeAccept: wrong type %d != %d", type, sock->type);
+    sock->state = ncclSocketStateError;
+    close(sock->fd);
+    sock->fd = -1;
+    return ncclInternalError;
+  } else {
+    sock->state = ncclSocketStateReady; // handshake complete
+  }
  return ncclSuccess;
 }

-static ncclResult_t socketStartConnect(struct ncclSocket* sock) {
-  /* blocking/non-blocking connect() is determined by asyncFlag. */
-  int ret = connect(sock->fd, &sock->addr.sa, sock->salen);
-
-  if (ret == 0) {
+static ncclResult_t socketResetFd(struct ncclSocket* sock) { // replaces sock's descriptor with a freshly created stream socket and re-applies the socket flags
+  ncclResult_t ret = ncclSuccess;
+  int fd = -1;
+  SYSCHECKGOTO(fd = socket(sock->addr.sa.sa_family, SOCK_STREAM, 0), "socket", ret, cleanup);
+  // if sock->fd is valid, close it and reuse its number
+  if (sock->fd != -1) {
+    SYSCHECKGOTO(dup2(fd, sock->fd), "dup2", ret, cleanup); // the descriptor number held in sock->fd is preserved
+    SYSCHECKGOTO(close(fd), "close", ret, cleanup); // NOTE(review): presumably jumps to cleanup on failure, which would call close(fd) a second time - confirm
+  } else {
+    sock->fd = fd;
+  }
+  NCCLCHECKGOTO(socketSetFlags(sock), ret, exit); // a failure here returns directly without touching the descriptors
+exit:
+  return ret;
+cleanup:
+  // cleanup fd, leave sock->fd untouched
+  if (fd != -1) {
+    (void)close(fd);
+  }
+  goto exit;
+}
+static ncclResult_t socketConnectCheck(struct ncclSocket* sock, int errCode, const char funcName[]) { // maps a connect() error code (0 = success) to the next socket state; funcName only labels log messages
+  if (errCode == 0) {
    sock->state = ncclSocketStateConnected;
-    return ncclSuccess;
-  } else if (errno == EINPROGRESS) {
+  } else if (errCode == EINPROGRESS) {
    sock->state = ncclSocketStateConnectPolling;
-    return ncclSuccess;
-  } else if (errno == ECONNREFUSED) {
-    if (++sock->refusedRetries == RETRY_REFUSED_TIMES) {
-      sock->state = ncclSocketStateError;
-      WARN("socketStartConnect: exceeded retries (%d)", sock->refusedRetries);
-      return ncclRemoteError;
+  } else if (errCode == ETIMEDOUT || errCode == EHOSTUNREACH || errCode == ECONNREFUSED) { // the three retryable errors share one counter
+    if (sock->customRetry == 0) { // non-zero customRetry skips both the retry limit and the sleep
+      if (sock->errorRetries++ == ncclParamRetryCnt()) { // post-increment: compared before the bump, so the WARN prints the already incremented count
+        sock->state = ncclSocketStateError;
+        WARN("%s: connect returned %s, exceeded error retry count (%d)", funcName, strerror(errCode), sock->errorRetries);
+        return ncclRemoteError;
+      }
+      unsigned int sleepTime = sock->errorRetries * ncclParamRetryTimeOut(); // back-off grows linearly with the retry number
+      INFO(NCCL_ALL, "%s: connect returned %s, retrying (%d/%ld) after sleep for %u msec", funcName, strerror(errCode), sock->errorRetries, ncclParamRetryCnt(), sleepTime);
+      msleep(sleepTime);
    }
-    usleep(SLEEP_INT);
-    if (sock->refusedRetries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno));
-    return ncclSuccess;
-  } else if (errno == ETIMEDOUT) {
-    if (++sock->timedOutRetries == RETRY_TIMEDOUT_TIMES) {
-      sock->state = ncclSocketStateError;
-      WARN("socketStartConnect: exceeded timeouts (%d)", sock->timedOutRetries);
-      return ncclRemoteError;
-    }
-    usleep(SLEEP_INT);
-    return ncclSuccess;
+    NCCLCHECK(socketResetFd(sock)); /* in case of failure in connect, socket state is unspecified */
+    sock->state = ncclSocketStateConnecting; // go back to the connecting state with the fresh descriptor
  } else {
    char line[SOCKET_NAME_MAXLEN+1];
    sock->state = ncclSocketStateError;
-    WARN("socketStartConnect: Connect to %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno));
+    WARN("%s: Connect to %s failed : %s", funcName, ncclSocketToString(&sock->addr, line), strerror(errCode));
    return ncclSystemError;
  }
+  return ncclSuccess; // single success exit shared by the connected, polling and retry branches
+}
+static ncclResult_t socketStartConnect(struct ncclSocket* sock) { // issues connect() and hands the outcome to socketConnectCheck()
+  /* blocking/non-blocking connect() is determined by asyncFlag. */
+  int ret = connect(sock->fd, &sock->addr.sa, sock->salen);
+  return socketConnectCheck(sock, (ret == -1) ? errno : 0, __func__); // errno is read right after connect(); 0 is passed on success
 }

 static ncclResult_t socketPollConnect(struct ncclSocket* sock) {
@@ -509,33 +585,7 @@ static ncclResult_t socketPollConnect(struct ncclSocket* sock) {

  /* check socket status */
  SYSCHECK(getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void*)&ret, &rlen), "getsockopt");
-
-  if (ret == 0) {
-    sock->state = ncclSocketStateConnected;
-  } else if (ret == ECONNREFUSED) {
-    if (++sock->refusedRetries == RETRY_REFUSED_TIMES) {
-      sock->state = ncclSocketStateError;
-      WARN("socketPollConnect: exceeded retries (%d)", sock->refusedRetries);
-      return ncclRemoteError;
-    }
-    if (sock->refusedRetries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno));
-    usleep(SLEEP_INT);
-    sock->state = ncclSocketStateConnecting;
-  } else if (ret == ETIMEDOUT) {
-    if (++sock->timedOutRetries == RETRY_TIMEDOUT_TIMES) {
-      sock->state = ncclSocketStateError;
-      WARN("socketPollConnect: exceeded timeouts (%d)", sock->timedOutRetries);
-      return ncclRemoteError;
-    }
-    usleep(SLEEP_INT);
-    sock->state = ncclSocketStateConnecting;
-  } else if (ret != EINPROGRESS) {
-    sock->state = ncclSocketStateError;
-    char line[SOCKET_NAME_MAXLEN+1];
-    WARN("socketPollConnect: Connect to %s returned %d(%s) errno %d(%s)", ncclSocketToString(&sock->addr, line), ret, strerror(ret), errno, strerror(errno));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
+  return socketConnectCheck(sock, ret, __func__);
 }

 ncclResult_t ncclSocketPollConnect(struct ncclSocket* sock) {
@@ -548,12 +598,24 @@ ncclResult_t ncclSocketPollConnect(struct ncclSocket* sock) {
 }

 static ncclResult_t socketFinalizeConnect(struct ncclSocket* sock) {
-  int sent = 0;
-  NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent));
-  if (sent == 0) return ncclSuccess;
-  NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent));
-  sent = 0;
-  NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent));
+  int sent;
+  if (sock->asyncFlag == 0) {
+    sent = 0;
+    NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent));
+    sent = 0;
+    NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent));
+  } else {
+    if (sock->finalizeCounter < sizeof(sock->magic)) {
+      sent = sock->finalizeCounter;
+      NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent));
+      sock->finalizeCounter = sent;
+      if (sent < sizeof(sock->magic)) return ncclSuccess;
+    }
+    sent = sock->finalizeCounter - sizeof(sock->magic);
+    NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent));
+    sock->finalizeCounter = sent + sizeof(sock->magic);
+    if (sent < sizeof(sock->type)) return ncclSuccess;
+  }
  sock->state = ncclSocketStateReady;
  return ncclSuccess;
 }
@@ -598,7 +660,6 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock) {
 #ifdef ENABLE_TRACE
  char line[SOCKET_NAME_MAXLEN+1];
 #endif
-  const int one = 1;

  if (sock == NULL) {
    WARN("ncclSocketConnect: pass NULL socket");
@@ -616,9 +677,8 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock) {
  }
  TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", ncclSocketToString(&sock->addr, line));

-  SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
-
  sock->state = ncclSocketStateConnecting;
+  sock->finalizeCounter = 0;
  do {
    NCCLCHECK(socketProgressState(sock));
  } while (sock->asyncFlag == 0 &&
@@ -664,6 +724,7 @@ ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listen
    memcpy(sock, listenSock, sizeof(struct ncclSocket));
    sock->acceptFd = listenSock->fd;
    sock->state = ncclSocketStateAccepting;
+    sock->finalizeCounter = 0;
  }

  do {
@@ -694,12 +755,11 @@ exit:
  return ret;
 }

-ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr, uint64_t magic, enum ncclSocketType type, volatile uint32_t* abortFlag, int asyncFlag) {
+ncclResult_t ncclSocketInit(struct ncclSocket* sock, const union ncclSocketAddress* addr, uint64_t magic, enum ncclSocketType type, volatile uint32_t* abortFlag, int asyncFlag, int customRetry) {
  ncclResult_t ret = ncclSuccess;

  if (sock == NULL) goto exit;
-  sock->timedOutRetries = 0;
-  sock->refusedRetries = 0;
+  sock->errorRetries = 0;
  sock->abortFlag = abortFlag;
  sock->asyncFlag = asyncFlag;
  sock->state = ncclSocketStateInitialized;
@@ -707,6 +767,7 @@ ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* ad
  sock->type = type;
  sock->fd = -1;
  sock->acceptFd = -1;
+  sock->customRetry = customRetry;

  if (addr) {
    /* IPv4/IPv6 support */
@@ -718,28 +779,14 @@ ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* ad
      WARN("ncclSocketInit: connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)",
          ncclSocketToString(&sock->addr, line), family, AF_INET, AF_INET6);
      ret = ncclInternalError;
-      goto fail;
+      goto exit;
    }
    sock->salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
-
-    /* Connect to a hostname / port */
-    sock->fd = socket(family, SOCK_STREAM, 0);
-    if (sock->fd == -1) {
-      WARN("ncclSocketInit: Socket creation failed : %s", strerror(errno));
-      ret = ncclSystemError;
-      goto fail;
-    }
+    // in case of error, we close the fd before returning as it's unclear if the caller has to use ncclSocketClose for cleanup
+    NCCLCHECKGOTO(socketResetFd(sock), ret, fail);
  } else {
    memset(&sock->addr, 0, sizeof(union ncclSocketAddress));
  }
-
-  /* Set socket as non-blocking if async or if we need to be able to abort */
-  if ((sock->asyncFlag || sock->abortFlag) && sock->fd >= 0) {
-    int flags;
-    SYSCHECKGOTO(flags = fcntl(sock->fd, F_GETFL), "fcntl", ret, fail);
-    SYSCHECKGOTO(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), "fcntl", ret, fail);
-  }
-
 exit:
  return ret;
 fail:
@@ -750,12 +797,12 @@ fail:
  goto exit;
 }

-ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
+ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int* closed) {
  if (sock == NULL) {
    WARN("ncclSocketProgress: pass NULL socket");
    return ncclInvalidArgument;
  }
-  NCCLCHECK(socketProgress(op, sock, ptr, size, offset));
+  NCCLCHECK(socketProgress(op, sock, ptr, size, offset, closed));
  return ncclSuccess;
 }

@@ -788,7 +835,7 @@ ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size) {
    WARN("ncclSocketRecv: pass NULL socket");
    return ncclInvalidArgument;
  }
-  if (sock->state != ncclSocketStateReady) {
+  if (sock->state != ncclSocketStateReady && sock->state != ncclSocketStateTerminating) {
    WARN("ncclSocketRecv: socket state (%d) is not ready", sock->state);
    return ncclInternalError;
  }
@@ -802,7 +849,8 @@ ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int
    WARN("ncclSocketSendRecv: invalid socket %p/%p", sendSock, recvSock);
    return ncclInternalError;
  }
-  if (sendSock->state != ncclSocketStateReady || recvSock->state != ncclSocketStateReady) {
+  if (sendSock->state != ncclSocketStateReady ||
+      (recvSock->state != ncclSocketStateReady && recvSock->state != ncclSocketStateTerminating)) {
    WARN("ncclSocketSendRecv: socket state (%d/%d) is not ready", sendSock->state, recvSock->state);
    return ncclInternalError;
  }
@@ -846,9 +894,20 @@ ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int
  return ncclSuccess;
 }

-ncclResult_t ncclSocketClose(struct ncclSocket* sock) {
+// Make it possible to close just one part of a socket.
+ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how) {
  if (sock != NULL) {
    if (sock->fd >= 0) {
+      shutdown(sock->fd, how);
+    }
+    sock->state = ncclSocketStateTerminating;
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketClose(struct ncclSocket* sock) {
+  if (sock != NULL) {
+    if (sock->state > ncclSocketStateNone && sock->state < ncclSocketStateNum && sock->fd >= 0) {
      /* shutdown() is needed to send FIN packet to proxy thread; shutdown() is not affected
       * by refcount of fd, but close() is. close() won't close a fd and send FIN packet if
       * the fd is duplicated (e.g. fork()). So shutdown() guarantees the correct and graceful
@@ -16,9 +16,11 @@
 pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER;
 static int tunerPluginRefCount;
 static void* tunerPluginLib = nullptr;
-static ncclTuner_v3_t* tunerSymbol = nullptr;
+static ncclTuner_v4_t* tunerSymbol = nullptr;
+static ncclTuner_v3_t* ncclTuner_v3 = nullptr;
 static ncclTuner_v2_t* ncclTuner_v2 = nullptr;
-static ncclTuner_v3_t ncclTuner_v2_as_v3;
+static ncclTuner_v4_t ncclTuner_v2_as_v4;
+static ncclTuner_v4_t ncclTuner_v3_as_v4;

 static int hasNvlsSupport(float** collCostTable) {
  // Requirements for support of different algorithms:
@@ -39,7 +41,20 @@ static int hasCollNetSupport(float** collCostTable) {
  return (table[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] == NCCL_ALGO_PROTO_IGNORE) ? 0 : 1;
 }

-static ncclResult_t ncclTuner_v2_as_v3_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo __attribute__((unused)), int numProto __attribute__((unused)), int* nChannels) {
+static ncclResult_t ncclTuner_v3_as_v4_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, int regBuff __attribute__((unused)), int* nChannels) {
+  NCCLCHECK(ncclTuner_v3->getCollInfo(context, collType, nBytes, numPipeOps, collCostTable, numAlgo, numProto,  nChannels));
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclTuner_v3_as_v4_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) {
+  NCCLCHECK(ncclTuner_v3->init(nRanks, nNodes, logFunction, context));
+  ncclTuner_v3_as_v4.name = ncclTuner_v3->name;
+  ncclTuner_v3_as_v4.getCollInfo = ncclTuner_v3_as_v4_getCollInfo;
+  ncclTuner_v3_as_v4.destroy = ncclTuner_v3->destroy;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclTuner_v2_as_v4_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo __attribute__((unused)), int numProto __attribute__((unused)), int regBuff __attribute__((unused)), int* nChannels) {
  int algorithm = NCCL_ALGO_UNDEF;
  int protocol = NCCL_PROTO_UNDEF;
  int nvlsSupport = hasNvlsSupport(collCostTable);
@@ -53,11 +68,11 @@ static ncclResult_t ncclTuner_v2_as_v3_getCollInfo(void* context, ncclFunc_t col
  return ncclSuccess;
 }

-static ncclResult_t ncclTuner_v2_as_v3_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) {
+static ncclResult_t ncclTuner_v2_as_v4_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) {
  NCCLCHECK(ncclTuner_v2->init(nRanks, nNodes, logFunction, context));
-  ncclTuner_v2_as_v3.name = ncclTuner_v2->name;
-  ncclTuner_v2_as_v3.getCollInfo = ncclTuner_v2_as_v3_getCollInfo;
-  ncclTuner_v2_as_v3.destroy = ncclTuner_v2->destroy;
+  ncclTuner_v2_as_v4.name = ncclTuner_v2->name;
+  ncclTuner_v2_as_v4.getCollInfo = ncclTuner_v2_as_v4_getCollInfo;
+  ncclTuner_v2_as_v4.destroy = ncclTuner_v2->destroy;
  return ncclSuccess;
 }

@@ -198,18 +213,26 @@ ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) {
    goto fail;
  }

-  tunerSymbol = (ncclTuner_v3_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v3");
+  tunerSymbol = (ncclTuner_v4_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v4");
  if (tunerSymbol == nullptr) {
-    INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.");
-    ncclTuner_v2 = (ncclTuner_v2_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v2");
-    if (ncclTuner_v2 == nullptr) {
-      INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.");
-      dlclose(tunerPluginLib);
-      goto fail;
+    INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.");
+    ncclTuner_v3 = (ncclTuner_v3_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v3");
+    if (ncclTuner_v3 == nullptr) {
+      INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.");
+      ncclTuner_v2 = (ncclTuner_v2_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v2");
+      if (ncclTuner_v2 == nullptr) {
+        INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.");
+        dlclose(tunerPluginLib);
+        goto fail;
+      } else {
+        ncclTuner_v2_as_v4.init = ncclTuner_v2_as_v4_init;
+        ncclTuner_v2_as_v4.name = ncclTuner_v2->name;
+        tunerSymbol = &ncclTuner_v2_as_v4;
+      }
    } else {
-      ncclTuner_v2_as_v3.init = ncclTuner_v2_as_v3_init;
-      ncclTuner_v2_as_v3.name = ncclTuner_v2->name;
-      tunerSymbol = &ncclTuner_v2_as_v3;
+      ncclTuner_v3_as_v4.init = ncclTuner_v3_as_v4_init;
+      ncclTuner_v3_as_v4.name = ncclTuner_v3->name;
+      tunerSymbol = &ncclTuner_v3_as_v4;
    }
  }

@@ -12,6 +12,9 @@
 #if CUDART_VERSION >= 11000
 #include <cuda_bf16.h>
 #endif
+#if CUDART_VERSION >= 11080
+#include <cuda_fp8.h>
+#endif

 #define NCCL_MAJOR ${nccl:Major}
 #define NCCL_MINOR ${nccl:Minor}
@@ -183,6 +186,10 @@ const char* pncclGetErrorString(ncclResult_t result);
 const char*  ncclGetLastError(ncclComm_t comm);
 const char* pncclGetLastError(ncclComm_t comm);

+/* Reload environment variables that determine logging. */
+void  ncclResetDebugInit();
+void pncclResetDebugInit();
+
 /* Checks whether the comm has encountered any asynchronous errors */
 ncclResult_t  ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
 ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
@@ -236,12 +243,10 @@ typedef enum { ncclInt8       = 0, ncclChar       = 0,
               ncclFloat16    = 6, ncclHalf       = 6,
               ncclFloat32    = 7, ncclFloat      = 7,
               ncclFloat64    = 8, ncclDouble     = 8,
-#if defined(__CUDA_BF16_TYPES_EXIST__)
               ncclBfloat16   = 9,
-               ncclNumTypes   = 10
-#else
-               ncclNumTypes   = 9
-#endif
+               ncclFloat8e4m3 = 10,
+               ncclFloat8e5m2 = 11,
+               ncclNumTypes   = 12
 } ncclDataType_t;

 /* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
@@ -15,20 +15,95 @@
 //#include <sys/stat.h>
 //#include <unistd.h>

-static ncclNet_v8_t ncclNet_v5_as_v8;
-static ncclNet_v8_t ncclNet_v6_as_v8;
-static ncclNet_v8_t ncclNet_v7_as_v8;
+static ncclNet_v9_t ncclNet_v5_as_v9;
+static ncclNet_v9_t ncclNet_v6_as_v9;
+static ncclNet_v9_t ncclNet_v7_as_v9;
+static ncclNet_v9_t ncclNet_v8_as_v9;
 static ncclNet_v5_t *ncclNet_v5;
 static ncclNet_v6_t *ncclNet_v6;
 static ncclNet_v7_t *ncclNet_v7;
-static ncclCollNet_v8_t ncclCollNet_v5_as_v8;
-static ncclCollNet_v8_t ncclCollNet_v6_as_v8;
-static ncclCollNet_v8_t ncclCollNet_v7_as_v8;
+static ncclNet_v8_t *ncclNet_v8;
+static ncclCollNet_v9_t ncclCollNet_v5_as_v9;
+static ncclCollNet_v9_t ncclCollNet_v6_as_v9;
+static ncclCollNet_v9_t ncclCollNet_v7_as_v9;
+static ncclCollNet_v9_t ncclCollNet_v8_as_v9;
 static ncclCollNet_v5_t *ncclCollNet_v5;
 static ncclCollNet_v6_t *ncclCollNet_v6;
 static ncclCollNet_v7_t *ncclCollNet_v7;
+static ncclCollNet_v8_t *ncclCollNet_v8;

-static ncclResult_t ncclNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
+#define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two.
+#define MAX_COLLNET_SIZE (512*1024*1024L) //Set for initial collnet plugins when size was not dynamically queried
+
+static ncclResult_t ncclNet_v8_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) {
+  ncclNetProperties_v8_t p8;
+  ncclResult_t ans = ncclNet_v8->getProperties(dev, &p8);
+  if (ans != ncclSuccess) return ans;
+  props->name = p8.name;
+  props->pciPath = p8.pciPath;
+  props->guid = p8.guid;
+  props->ptrSupport = p8.ptrSupport;
+  props->regIsGlobal = p8.regIsGlobal;
+  props->forceFlush = 0;
+  props->speed = p8.speed;
+  props->port = p8.port;
+  props->maxComms = p8.maxComms;
+  props->maxRecvs = p8.maxRecvs;
+  props->latency = p8.latency;
+  props->netDeviceType = p8.netDeviceType;
+  props->netDeviceVersion = p8.netDeviceVersion;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclNet_v8_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) {
+   int sizeInt;
+   if (size > MAX_NET_SIZE) return ncclInternalError;
+   sizeInt = (int)size;
+   ncclResult_t ans = ncclNet_v8->isend(sendComm, data, sizeInt, tag, mhandle, request);
+   return ans;
+}
+
+static ncclResult_t ncclNet_v8_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) {
+   int sizesInt[NCCL_PROXY_MAX_SUBS];
+   //reset to NULL if optional receive completion is set
+   if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL;
+   for (int i=0; i<n; i++) {
+     if (sizes[i] > MAX_NET_SIZE) return ncclInternalError;
+     sizesInt[i] = (int) sizes[i];
+   }
+   ncclResult_t ans = ncclNet_v8->irecv(recvComm, n, data, sizesInt, tags, mhandles, request);
+   return ans;
+}
+
+static ncclResult_t ncclNet_v8_as_v9_init(ncclDebugLogger_t logfn) {
+  NCCLCHECK(ncclNet_v8->init(logfn));
+  ncclNet_v8_as_v9.name = ncclNet_v8->name;
+  ncclNet_v8_as_v9.devices = ncclNet_v8->devices;
+  ncclNet_v8_as_v9.getProperties = ncclNet_v8_as_v9_getProperties;
+  ncclNet_v8_as_v9.listen = ncclNet_v8->listen;
+  ncclNet_v8_as_v9.connect = ncclNet_v8->connect;
+  ncclNet_v8_as_v9.accept =  ncclNet_v8->accept;
+  ncclNet_v8_as_v9.regMr = ncclNet_v8->regMr;
+  ncclNet_v8_as_v9.regMrDmaBuf = ncclNet_v8->regMrDmaBuf;
+  ncclNet_v8_as_v9.deregMr = ncclNet_v8->deregMr;
+  ncclNet_v8_as_v9.isend = ncclNet_v8_as_v9_isend;
+  ncclNet_v8_as_v9.irecv = ncclNet_v8_as_v9_irecv;
+  ncclNet_v8_as_v9.iflush = ncclNet_v8->iflush;
+  ncclNet_v8_as_v9.test = ncclNet_v8->test;
+  ncclNet_v8_as_v9.closeSend = ncclNet_v8->closeSend;
+  ncclNet_v8_as_v9.closeRecv = ncclNet_v8->closeRecv;
+  ncclNet_v8_as_v9.closeListen = ncclNet_v8->closeListen;
+  ncclNet_v8_as_v9.getDeviceMr = ncclNet_v8->getDeviceMr;
+  ncclNet_v8_as_v9.irecvConsumed = ncclNet_v8->irecvConsumed;
+  ncclNet_v8_as_v9.makeVDevice   = NULL;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclNet_v7_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) {
  ncclNetProperties_v7_t p7;
  ncclResult_t ans = ncclNet_v7->getProperties(dev, &p7);
  if (ans != ncclSuccess) return ans;
@@ -37,6 +112,7 @@ static ncclResult_t ncclNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8
  props->guid = p7.guid;
  props->ptrSupport = p7.ptrSupport;
  props->regIsGlobal = 0;
+  props->forceFlush = 0;
  props->speed = p7.speed;
  props->port = p7.port;
  props->maxComms = p7.maxComms;
@@ -44,38 +120,63 @@ static ncclResult_t ncclNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8
  props->latency = p7.latency;
  props->netDeviceType = p7.netDeviceType;
  props->netDeviceVersion = p7.netDeviceVersion;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
  return ncclSuccess;
 }

-static ncclResult_t ncclNet_v7_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+static ncclResult_t ncclNet_v7_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
  if (size >= 1UL<<31) return ncclInternalError;
  return ncclNet_v7->regMr(comm, data, (int) size, type, mhandle);
 }

-static ncclResult_t ncclNet_v7_as_v8_init(ncclDebugLogger_t logfn) {
+static ncclResult_t ncclNet_v7_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) {
+   int sizeInt;
+   if (size > MAX_NET_SIZE) return ncclInternalError;
+   sizeInt = (int)size;
+   ncclResult_t ans = ncclNet_v7->isend(sendComm, data, sizeInt, tag, mhandle, request);
+   return ans;
+}
+
+static ncclResult_t ncclNet_v7_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) {
+   int sizesInt[NCCL_PROXY_MAX_SUBS];
+   //reset to NULL if optional receive completion is set
+   if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL;
+   for (int i=0; i<n; i++) {
+     if (sizes[i] > MAX_NET_SIZE) return ncclInternalError;
+     sizesInt[i] = (int) sizes[i];
+   }
+   ncclResult_t ans = ncclNet_v7->irecv(recvComm, n, data, sizesInt, tags, mhandles, request);
+   return ans;
+}
+
+static ncclResult_t ncclNet_v7_as_v9_init(ncclDebugLogger_t logfn) {
  NCCLCHECK(ncclNet_v7->init(logfn));
-  ncclNet_v7_as_v8.name = ncclNet_v7->name;
-  ncclNet_v7_as_v8.devices = ncclNet_v7->devices;
-  ncclNet_v7_as_v8.getProperties = ncclNet_v7_as_v8_getProperties; // ncclNet_v5->getProperties;
-  ncclNet_v7_as_v8.listen = ncclNet_v7->listen;
-  ncclNet_v7_as_v8.connect = ncclNet_v7->connect;
-  ncclNet_v7_as_v8.accept =  ncclNet_v7->accept;
-  ncclNet_v7_as_v8.regMr = ncclNet_v7_as_v8_regMr;
-  ncclNet_v7_as_v8.regMrDmaBuf = ncclNet_v7->regMrDmaBuf;
-  ncclNet_v7_as_v8.deregMr = ncclNet_v7->deregMr;
-  ncclNet_v7_as_v8.isend = ncclNet_v7->isend;
-  ncclNet_v7_as_v8.irecv = ncclNet_v7->irecv;
-  ncclNet_v7_as_v8.iflush = ncclNet_v7->iflush;
-  ncclNet_v7_as_v8.test = ncclNet_v7->test;
-  ncclNet_v7_as_v8.closeSend = ncclNet_v7->closeSend;
-  ncclNet_v7_as_v8.closeRecv = ncclNet_v7->closeRecv;
-  ncclNet_v7_as_v8.closeListen = ncclNet_v7->closeListen;
-  ncclNet_v7_as_v8.getDeviceMr = ncclNet_v7->getDeviceMr;
-  ncclNet_v7_as_v8.irecvConsumed = ncclNet_v7->irecvConsumed;
+  ncclNet_v7_as_v9.name = ncclNet_v7->name;
+  ncclNet_v7_as_v9.devices = ncclNet_v7->devices;
+  ncclNet_v7_as_v9.getProperties = ncclNet_v7_as_v9_getProperties; // ncclNet_v5->getProperties;
+  ncclNet_v7_as_v9.listen = ncclNet_v7->listen;
+  ncclNet_v7_as_v9.connect = ncclNet_v7->connect;
+  ncclNet_v7_as_v9.accept =  ncclNet_v7->accept;
+  ncclNet_v7_as_v9.regMr = ncclNet_v7_as_v9_regMr;
+  ncclNet_v7_as_v9.regMrDmaBuf = ncclNet_v7->regMrDmaBuf;
+  ncclNet_v7_as_v9.deregMr = ncclNet_v7->deregMr;
+  ncclNet_v7_as_v9.isend = ncclNet_v7_as_v9_isend;
+  ncclNet_v7_as_v9.irecv = ncclNet_v7_as_v9_irecv;
+  ncclNet_v7_as_v9.iflush = ncclNet_v7->iflush;
+  ncclNet_v7_as_v9.test = ncclNet_v7->test;
+  ncclNet_v7_as_v9.closeSend = ncclNet_v7->closeSend;
+  ncclNet_v7_as_v9.closeRecv = ncclNet_v7->closeRecv;
+  ncclNet_v7_as_v9.closeListen = ncclNet_v7->closeListen;
+  ncclNet_v7_as_v9.getDeviceMr = ncclNet_v7->getDeviceMr;
+  ncclNet_v7_as_v9.irecvConsumed = ncclNet_v7->irecvConsumed;
+  ncclNet_v7_as_v9.makeVDevice  = NULL;
  return ncclSuccess;
 }

-static ncclResult_t ncclNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
+static ncclResult_t ncclNet_v6_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) {
  ncclNetProperties_v6_t p6;
  ncclResult_t ans = ncclNet_v6->getProperties(dev, &p6);
  if (ans != ncclSuccess) return ans;
@@ -84,6 +185,7 @@ static ncclResult_t ncclNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8
  props->guid = p6.guid;
  props->ptrSupport = p6.ptrSupport;
  props->regIsGlobal = 0;
+  props->forceFlush = 0;
  props->speed = p6.speed;
  props->port = p6.port;
  props->maxComms = p6.maxComms;
@@ -91,46 +193,71 @@ static ncclResult_t ncclNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8
  props->latency = p6.latency;
  props->netDeviceType = NCCL_NET_DEVICE_HOST;
  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
  return ncclSuccess;
 }

-static ncclResult_t ncclNet_v6_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+static ncclResult_t ncclNet_v6_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
  if (size >= 1UL<<31) return ncclInternalError;
  return ncclNet_v6->regMr(comm, data, (int) size, type, mhandle);
 }

-static ncclResult_t ncclNet_v6_as_v8_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
+static ncclResult_t ncclNet_v6_as_v9_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
  return ncclNet_v6->connect(dev, handle, sendComm);
 }

-static ncclResult_t ncclNet_v6_as_v8_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
+static ncclResult_t ncclNet_v6_as_v9_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
  return ncclNet_v6->accept(listenComm, recvComm);
 }

-static ncclResult_t ncclNet_v6_as_v8_init(ncclDebugLogger_t logfn) {
+static ncclResult_t ncclNet_v6_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) {
+   int sizeInt;
+   if (size > MAX_NET_SIZE) return ncclInternalError;
+   sizeInt = (int)size;
+   ncclResult_t ans = ncclNet_v6->isend(sendComm, data, sizeInt, tag, mhandle, request);
+   return ans;
+}
+
+static ncclResult_t ncclNet_v6_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) {
+   int sizesInt[NCCL_PROXY_MAX_SUBS];
+   //reset to NULL if optional receive completion is set
+   if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL;
+   for (int i=0; i<n; i++) {
+     if (sizes[i] > MAX_NET_SIZE) return ncclInternalError;
+     sizesInt[i] = (int) sizes[i];
+   }
+   ncclResult_t ans = ncclNet_v6->irecv(recvComm, n, data, sizesInt, tags, mhandles, request);
+   return ans;
+}
+
+static ncclResult_t ncclNet_v6_as_v9_init(ncclDebugLogger_t logfn) {
  NCCLCHECK(ncclNet_v6->init(logfn));
-  ncclNet_v6_as_v8.name = ncclNet_v6->name;
-  ncclNet_v6_as_v8.devices = ncclNet_v6->devices;
-  ncclNet_v6_as_v8.getProperties = ncclNet_v6_as_v8_getProperties; // ncclNet_v5->getProperties;
-  ncclNet_v6_as_v8.listen = ncclNet_v6->listen;
-  ncclNet_v6_as_v8.connect = ncclNet_v6_as_v8_connect;
-  ncclNet_v6_as_v8.accept =  ncclNet_v6_as_v8_accept;
-  ncclNet_v6_as_v8.regMr = ncclNet_v6_as_v8_regMr;
-  ncclNet_v6_as_v8.regMrDmaBuf = ncclNet_v6->regMrDmaBuf;
-  ncclNet_v6_as_v8.deregMr = ncclNet_v6->deregMr;
-  ncclNet_v6_as_v8.isend = ncclNet_v6->isend;
-  ncclNet_v6_as_v8.irecv = ncclNet_v6->irecv;
-  ncclNet_v6_as_v8.iflush = ncclNet_v6->iflush;
-  ncclNet_v6_as_v8.test = ncclNet_v6->test;
-  ncclNet_v6_as_v8.closeSend = ncclNet_v6->closeSend;
-  ncclNet_v6_as_v8.closeRecv = ncclNet_v6->closeRecv;
-  ncclNet_v6_as_v8.closeListen = ncclNet_v6->closeListen;
-  ncclNet_v6_as_v8.getDeviceMr = NULL;
-  ncclNet_v6_as_v8.irecvConsumed = NULL;
+  ncclNet_v6_as_v9.name = ncclNet_v6->name;
+  ncclNet_v6_as_v9.devices = ncclNet_v6->devices;
+  ncclNet_v6_as_v9.getProperties = ncclNet_v6_as_v9_getProperties;
+  ncclNet_v6_as_v9.listen = ncclNet_v6->listen;
+  ncclNet_v6_as_v9.connect = ncclNet_v6_as_v9_connect;
+  ncclNet_v6_as_v9.accept =  ncclNet_v6_as_v9_accept;
+  ncclNet_v6_as_v9.regMr = ncclNet_v6_as_v9_regMr;
+  ncclNet_v6_as_v9.regMrDmaBuf = ncclNet_v6->regMrDmaBuf;
+  ncclNet_v6_as_v9.deregMr = ncclNet_v6->deregMr;
+  ncclNet_v6_as_v9.isend = ncclNet_v6_as_v9_isend;
+  ncclNet_v6_as_v9.irecv = ncclNet_v6_as_v9_irecv;
+  ncclNet_v6_as_v9.iflush = ncclNet_v6->iflush;
+  ncclNet_v6_as_v9.test = ncclNet_v6->test;
+  ncclNet_v6_as_v9.closeSend = ncclNet_v6->closeSend;
+  ncclNet_v6_as_v9.closeRecv = ncclNet_v6->closeRecv;
+  ncclNet_v6_as_v9.closeListen = ncclNet_v6->closeListen;
+  ncclNet_v6_as_v9.getDeviceMr = NULL;
+  ncclNet_v6_as_v9.irecvConsumed = NULL;
+  ncclNet_v6_as_v9.makeVDevice  = NULL;
  return ncclSuccess;
 }

-static ncclResult_t ncclNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
+static ncclResult_t ncclNet_v5_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) {
  ncclNetProperties_v6_t p6;
  ncclResult_t ans = ncclNet_v5->getProperties(dev, &p6);
  if (ans != ncclSuccess) return ans;
@@ -139,6 +266,7 @@ static ncclResult_t ncclNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8
  props->guid = p6.guid;
  props->ptrSupport = p6.ptrSupport;
  props->regIsGlobal = 0;
+  props->forceFlush = 0;
  props->speed = p6.speed;
  props->port = p6.port;
  props->maxComms = p6.maxComms;
@@ -146,48 +274,73 @@ static ncclResult_t ncclNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8
  props->latency = p6.latency;
  props->netDeviceType    = NCCL_NET_DEVICE_HOST;
  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
  return ncclSuccess;
 }

-static ncclResult_t ncclNet_v5_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+static ncclResult_t ncclNet_v5_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
  if (size >= 1UL<<31) return ncclInternalError;
  return ncclNet_v5->regMr(comm, data, (int) size, type, mhandle);
 }

-static ncclResult_t ncclNet_v5_as_v8_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
+static ncclResult_t ncclNet_v5_as_v9_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
  return ncclNet_v5->connect(dev, handle, sendComm);
 }

-static ncclResult_t ncclNet_v5_as_v8_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
+static ncclResult_t ncclNet_v5_as_v9_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
  return ncclNet_v5->accept(listenComm, recvComm);
 }

+static ncclResult_t ncclNet_v5_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) {
+   int sizeInt;
+   if (size > MAX_NET_SIZE) return ncclInternalError;
+   sizeInt = (int)size;
+   ncclResult_t ans = ncclNet_v5->isend(sendComm, data, sizeInt, tag, mhandle, request);
+   return ans;
+}
+
+static ncclResult_t ncclNet_v5_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) {
+   int sizesInt[NCCL_PROXY_MAX_SUBS];
+   //reset to NULL if optional receive completion is set
+   if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL;
+   for (int i=0; i<n; i++) {
+     if (sizes[i] > MAX_NET_SIZE) return ncclInternalError;
+     sizesInt[i] = (int) sizes[i];
+   }
+   ncclResult_t ans = ncclNet_v5->irecv(recvComm, n, data, sizesInt, tags, mhandles, request);
+   return ans;
+}
+
 // We use a wrapper around the v5 init to copy over the struct contents
 // post-init since they may not be initialized before hand.
-static ncclResult_t ncclNet_v5_as_v8_init(ncclDebugLogger_t logfn) {
+static ncclResult_t ncclNet_v5_as_v9_init(ncclDebugLogger_t logfn) {
  NCCLCHECK(ncclNet_v5->init(logfn));
-  ncclNet_v5_as_v8.name = ncclNet_v5->name;
-  ncclNet_v5_as_v8.devices = ncclNet_v5->devices;
-  ncclNet_v5_as_v8.getProperties = ncclNet_v5_as_v8_getProperties;
-  ncclNet_v5_as_v8.listen = ncclNet_v5->listen;
-  ncclNet_v5_as_v8.connect = ncclNet_v5_as_v8_connect;
-  ncclNet_v5_as_v8.accept =  ncclNet_v5_as_v8_accept;
-  ncclNet_v5_as_v8.regMr = ncclNet_v5_as_v8_regMr;
-  ncclNet_v5_as_v8.regMrDmaBuf = NULL;
-  ncclNet_v5_as_v8.deregMr = ncclNet_v5->deregMr;
-  ncclNet_v5_as_v8.isend = ncclNet_v5->isend;
-  ncclNet_v5_as_v8.irecv = ncclNet_v5->irecv;
-  ncclNet_v5_as_v8.iflush = ncclNet_v5->iflush;
-  ncclNet_v5_as_v8.test = ncclNet_v5->test;
-  ncclNet_v5_as_v8.closeSend = ncclNet_v5->closeSend;
-  ncclNet_v5_as_v8.closeRecv = ncclNet_v5->closeRecv;
-  ncclNet_v5_as_v8.closeListen = ncclNet_v5->closeListen;
-  ncclNet_v5_as_v8.getDeviceMr = NULL;
-  ncclNet_v5_as_v8.irecvConsumed = NULL;
+  ncclNet_v5_as_v9.name = ncclNet_v5->name;
+  ncclNet_v5_as_v9.devices = ncclNet_v5->devices;
+  ncclNet_v5_as_v9.getProperties = ncclNet_v5_as_v9_getProperties;
+  ncclNet_v5_as_v9.listen = ncclNet_v5->listen;
+  ncclNet_v5_as_v9.connect = ncclNet_v5_as_v9_connect;
+  ncclNet_v5_as_v9.accept =  ncclNet_v5_as_v9_accept;
+  ncclNet_v5_as_v9.regMr = ncclNet_v5_as_v9_regMr;
+  ncclNet_v5_as_v9.regMrDmaBuf = NULL;
+  ncclNet_v5_as_v9.deregMr = ncclNet_v5->deregMr;
+  ncclNet_v5_as_v9.isend = ncclNet_v5_as_v9_isend;
+  ncclNet_v5_as_v9.irecv = ncclNet_v5_as_v9_irecv;
+  ncclNet_v5_as_v9.iflush = ncclNet_v5->iflush;
+  ncclNet_v5_as_v9.test = ncclNet_v5->test;
+  ncclNet_v5_as_v9.closeSend = ncclNet_v5->closeSend;
+  ncclNet_v5_as_v9.closeRecv = ncclNet_v5->closeRecv;
+  ncclNet_v5_as_v9.closeListen = ncclNet_v5->closeListen;
+  ncclNet_v5_as_v9.getDeviceMr = NULL;
+  ncclNet_v5_as_v9.irecvConsumed = NULL;
+  ncclNet_v5_as_v9.makeVDevice = NULL;
  return ncclSuccess;
 }

-static ncclResult_t ncclCollNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
+static ncclResult_t ncclCollNet_v5_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) {
  ncclNetProperties_v6_t p6;
  ncclResult_t ans = ncclCollNet_v5->getProperties(dev, &p6);
  if (ans != ncclSuccess) return ans;
@@ -196,6 +349,7 @@ static ncclResult_t ncclCollNet_v5_as_v8_getProperties(int dev, ncclNetPropertie
  props->guid = p6.guid;
  props->ptrSupport = p6.ptrSupport;
  props->regIsGlobal = 0;
+  props->forceFlush = 0;
  props->speed = p6.speed;
  props->port = p6.port;
  props->maxComms = p6.maxComms;
@@ -203,38 +357,52 @@ static ncclResult_t ncclCollNet_v5_as_v8_getProperties(int dev, ncclNetPropertie
  props->latency = p6.latency;
  props->netDeviceType    = NCCL_NET_DEVICE_HOST;
  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
  return ncclSuccess;
 }

-static ncclResult_t ncclCollNet_v5_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+static ncclResult_t ncclCollNet_v5_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
  if (size >= 1UL<<31) return ncclInternalError;
  return ncclCollNet_v5->regMr(comm, data, (int) size, type, mhandle);
 }

+static ncclResult_t ncclCollNet_v5_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { // Shim: forwards a size_t-count iallreduce to the v5 plugin with the count narrowed to int.
+   int countInt;
+   if (count > MAX_NET_SIZE) return ncclInternalError; // Refuse counts above MAX_NET_SIZE before the narrowing cast below.
+   countInt = (int)count; // presumably MAX_NET_SIZE fits in an int (defined outside this hunk) -- TODO confirm
+   ncclResult_t ans = ncclCollNet_v5->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp,
+                  sendMhandle, recvMhandle, request); // All other arguments are passed through unchanged.
+   return ans; // Returns the plugin's result as-is.
+}
+
 // We use a wrapper around the v5 init to copy over the struct contents
 // post-init since they may not be initialized before hand.
-static ncclResult_t ncclCollNet_v5_as_v8_init(ncclDebugLogger_t logfn) {
+static ncclResult_t ncclCollNet_v5_as_v9_init(ncclDebugLogger_t logfn) {
  NCCLCHECK(ncclCollNet_v5->init(logfn));
-  ncclCollNet_v5_as_v8.name = ncclCollNet_v5->name;
-  ncclCollNet_v5_as_v8.devices = ncclCollNet_v5->devices;
-  ncclCollNet_v5_as_v8.getProperties = ncclCollNet_v5_as_v8_getProperties;
-  ncclCollNet_v5_as_v8.listen = ncclCollNet_v5->listen;
-  ncclCollNet_v5_as_v8.connect = ncclCollNet_v5->connect;
-  ncclCollNet_v5_as_v8.reduceSupport = ncclCollNet_v5->reduceSupport;
-  ncclCollNet_v5_as_v8.regMr = ncclCollNet_v5_as_v8_regMr;
-  ncclCollNet_v5_as_v8.regMrDmaBuf = NULL;
-  ncclCollNet_v5_as_v8.deregMr = ncclCollNet_v5->deregMr;
-  ncclCollNet_v5_as_v8.iallreduce = ncclCollNet_v5->iallreduce;
-  ncclCollNet_v5_as_v8.iallgather = nullptr;
-  ncclCollNet_v5_as_v8.ireducescatter = nullptr;
-  ncclCollNet_v5_as_v8.iflush = ncclCollNet_v5->iflush;
-  ncclCollNet_v5_as_v8.test = ncclCollNet_v5->test;
-  ncclCollNet_v5_as_v8.closeColl = ncclCollNet_v5->closeColl;
-  ncclCollNet_v5_as_v8.closeListen = ncclCollNet_v5->closeListen;
+  ncclCollNet_v5_as_v9.name = ncclCollNet_v5->name;
+  ncclCollNet_v5_as_v9.devices = ncclCollNet_v5->devices;
+  ncclCollNet_v5_as_v9.getProperties = ncclCollNet_v5_as_v9_getProperties;
+  ncclCollNet_v5_as_v9.listen = ncclCollNet_v5->listen;
+  ncclCollNet_v5_as_v9.connect = ncclCollNet_v5->connect;
+  ncclCollNet_v5_as_v9.reduceSupport = ncclCollNet_v5->reduceSupport;
+  ncclCollNet_v5_as_v9.regMr = ncclCollNet_v5_as_v9_regMr;
+  ncclCollNet_v5_as_v9.regMrDmaBuf = NULL;
+  ncclCollNet_v5_as_v9.deregMr = ncclCollNet_v5->deregMr;
+  ncclCollNet_v5_as_v9.iallreduce = ncclCollNet_v5_as_v9_iallreduce;
+  ncclCollNet_v5_as_v9.iallgather = nullptr;
+  ncclCollNet_v5_as_v9.ireducescatter = nullptr;
+  ncclCollNet_v5_as_v9.iflush = ncclCollNet_v5->iflush;
+  ncclCollNet_v5_as_v9.test = ncclCollNet_v5->test;
+  ncclCollNet_v5_as_v9.closeColl = ncclCollNet_v5->closeColl;
+  ncclCollNet_v5_as_v9.closeListen = ncclCollNet_v5->closeListen;
  return ncclSuccess;
 }

-static ncclResult_t ncclCollNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
+static ncclResult_t ncclCollNet_v6_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) {
  ncclNetProperties_v6_t p6;
  ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6);
  if (ans != ncclSuccess) return ans;
@@ -243,6 +411,7 @@ static ncclResult_t ncclCollNet_v6_as_v8_getProperties(int dev, ncclNetPropertie
  props->guid = p6.guid;
  props->ptrSupport = p6.ptrSupport;
  props->regIsGlobal = 0;
+  props->forceFlush = 0;
  props->speed = p6.speed;
  props->port = p6.port;
  props->maxComms = p6.maxComms;
@@ -250,38 +419,52 @@ static ncclResult_t ncclCollNet_v6_as_v8_getProperties(int dev, ncclNetPropertie
  props->latency = p6.latency;
  props->netDeviceType    = NCCL_NET_DEVICE_HOST;
  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
  return ncclSuccess;
 }

-static ncclResult_t ncclCollNet_v6_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+static ncclResult_t ncclCollNet_v6_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
  if (size >= 1UL<<31) return ncclInternalError;
  return ncclCollNet_v6->regMr(comm, data, (int) size, type, mhandle);
 }

+static ncclResult_t ncclCollNet_v6_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { // Shim: forwards a size_t-count iallreduce to the v6 plugin with the count narrowed to int.
+   int countInt;
+   if (count > MAX_NET_SIZE) return ncclInternalError; // Refuse counts above MAX_NET_SIZE before the narrowing cast below.
+   countInt = (int)count; // presumably MAX_NET_SIZE fits in an int (defined outside this hunk) -- TODO confirm
+   ncclResult_t ans = ncclCollNet_v6->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp,
+                  sendMhandle, recvMhandle, request); // All other arguments are passed through unchanged.
+   return ans; // Returns the plugin's result as-is.
+}
+
 // We use a wrapper around the v6 init to copy over the struct contents
 // post-init since they may not be initialized before hand.
-static ncclResult_t ncclCollNet_v6_as_v8_init(ncclDebugLogger_t logfn) {
+static ncclResult_t ncclCollNet_v6_as_v9_init(ncclDebugLogger_t logfn) {
  NCCLCHECK(ncclCollNet_v6->init(logfn));
-  ncclCollNet_v6_as_v8.name = ncclCollNet_v6->name;
-  ncclCollNet_v6_as_v8.devices = ncclCollNet_v6->devices;
-  ncclCollNet_v6_as_v8.getProperties = ncclCollNet_v6_as_v8_getProperties;
-  ncclCollNet_v6_as_v8.listen = ncclCollNet_v6->listen;
-  ncclCollNet_v6_as_v8.connect = ncclCollNet_v6->connect;
-  ncclCollNet_v6_as_v8.reduceSupport = ncclCollNet_v6->reduceSupport;
-  ncclCollNet_v6_as_v8.regMr = ncclCollNet_v6_as_v8_regMr;
-  ncclCollNet_v6_as_v8.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf;
-  ncclCollNet_v6_as_v8.deregMr = ncclCollNet_v6->deregMr;
-  ncclCollNet_v6_as_v8.iallreduce = ncclCollNet_v6->iallreduce;
-  ncclCollNet_v6_as_v8.iallgather = nullptr;
-  ncclCollNet_v6_as_v8.ireducescatter = nullptr;
-  ncclCollNet_v6_as_v8.iflush = ncclCollNet_v6->iflush;
-  ncclCollNet_v6_as_v8.test = ncclCollNet_v6->test;
-  ncclCollNet_v6_as_v8.closeColl = ncclCollNet_v6->closeColl;
-  ncclCollNet_v6_as_v8.closeListen = ncclCollNet_v6->closeListen;
+  ncclCollNet_v6_as_v9.name = ncclCollNet_v6->name;
+  ncclCollNet_v6_as_v9.devices = ncclCollNet_v6->devices;
+  ncclCollNet_v6_as_v9.getProperties = ncclCollNet_v6_as_v9_getProperties;
+  ncclCollNet_v6_as_v9.listen = ncclCollNet_v6->listen;
+  ncclCollNet_v6_as_v9.connect = ncclCollNet_v6->connect;
+  ncclCollNet_v6_as_v9.reduceSupport = ncclCollNet_v6->reduceSupport;
+  ncclCollNet_v6_as_v9.regMr = ncclCollNet_v6_as_v9_regMr;
+  ncclCollNet_v6_as_v9.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf;
+  ncclCollNet_v6_as_v9.deregMr = ncclCollNet_v6->deregMr;
+  ncclCollNet_v6_as_v9.iallreduce = ncclCollNet_v6_as_v9_iallreduce;
+  ncclCollNet_v6_as_v9.iallgather = nullptr;
+  ncclCollNet_v6_as_v9.ireducescatter = nullptr;
+  ncclCollNet_v6_as_v9.iflush = ncclCollNet_v6->iflush;
+  ncclCollNet_v6_as_v9.test = ncclCollNet_v6->test;
+  ncclCollNet_v6_as_v9.closeColl = ncclCollNet_v6->closeColl;
+  ncclCollNet_v6_as_v9.closeListen = ncclCollNet_v6->closeListen;
  return ncclSuccess;
 }

-static ncclResult_t ncclCollNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
+static ncclResult_t ncclCollNet_v7_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) {
  ncclNetProperties_v7_t p7;
  ncclResult_t ans = ncclCollNet_v7->getProperties(dev, &p7);
  if (ans != ncclSuccess) return ans;
@@ -290,6 +473,7 @@ static ncclResult_t ncclCollNet_v7_as_v8_getProperties(int dev, ncclNetPropertie
  props->guid = p7.guid;
  props->ptrSupport = p7.ptrSupport;
  props->regIsGlobal = 0;
+  props->forceFlush = 0;
  props->speed = p7.speed;
  props->port = p7.port;
  props->maxComms = p7.maxComms;
@@ -297,47 +481,150 @@ static ncclResult_t ncclCollNet_v7_as_v8_getProperties(int dev, ncclNetPropertie
  props->latency = p7.latency;
  props->netDeviceType    = NCCL_NET_DEVICE_HOST;
  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
  return ncclSuccess;
 }

-static ncclResult_t ncclCollNet_v7_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+static ncclResult_t ncclCollNet_v7_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
  if (size >= 1UL<<31) return ncclInternalError;
  return ncclCollNet_v7->regMr(comm, data, (int) size, type, mhandle);
 }

+static ncclResult_t ncclCollNet_v7_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { // Shim: forwards a size_t-count iallreduce to the v7 plugin with the count narrowed to int.
+   int countInt;
+   if (count > MAX_NET_SIZE) return ncclInternalError; // Refuse counts above MAX_NET_SIZE before the narrowing cast below.
+   countInt = (int)count; // presumably MAX_NET_SIZE fits in an int (defined outside this hunk) -- TODO confirm
+   ncclResult_t ans = ncclCollNet_v7->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp,
+                  sendMhandle, recvMhandle, request); // All other arguments are passed through unchanged.
+   return ans; // Returns the plugin's result as-is.
+}
+
 // We use a wrapper around the v7 init to copy over the struct contents
 // post-init since they may not be initialized before hand.
-static ncclResult_t ncclCollNet_v7_as_v8_init(ncclDebugLogger_t logfn) {
+static ncclResult_t ncclCollNet_v7_as_v9_init(ncclDebugLogger_t logfn) {
  NCCLCHECK(ncclCollNet_v7->init(logfn));
-  ncclCollNet_v7_as_v8.name = ncclCollNet_v7->name;
-  ncclCollNet_v7_as_v8.devices = ncclCollNet_v7->devices;
-  ncclCollNet_v7_as_v8.getProperties = ncclCollNet_v7_as_v8_getProperties;
-  ncclCollNet_v7_as_v8.listen = ncclCollNet_v7->listen;
-  ncclCollNet_v7_as_v8.connect = ncclCollNet_v7->connect;
-  ncclCollNet_v7_as_v8.reduceSupport = ncclCollNet_v7->reduceSupport;
-  ncclCollNet_v7_as_v8.regMr = ncclCollNet_v7_as_v8_regMr;
-  ncclCollNet_v7_as_v8.regMrDmaBuf = ncclCollNet_v7->regMrDmaBuf;
-  ncclCollNet_v7_as_v8.deregMr = ncclCollNet_v7->deregMr;
-  ncclCollNet_v7_as_v8.iallreduce = ncclCollNet_v7->iallreduce;
-  ncclCollNet_v7_as_v8.iallgather = nullptr;
-  ncclCollNet_v7_as_v8.ireducescatter = nullptr;
-  ncclCollNet_v7_as_v8.iflush = ncclCollNet_v7->iflush;
-  ncclCollNet_v7_as_v8.test = ncclCollNet_v7->test;
-  ncclCollNet_v7_as_v8.closeColl = ncclCollNet_v7->closeColl;
-  ncclCollNet_v7_as_v8.closeListen = ncclCollNet_v7->closeListen;
+  ncclCollNet_v7_as_v9.name = ncclCollNet_v7->name;
+  ncclCollNet_v7_as_v9.devices = ncclCollNet_v7->devices;
+  ncclCollNet_v7_as_v9.getProperties = ncclCollNet_v7_as_v9_getProperties;
+  ncclCollNet_v7_as_v9.listen = ncclCollNet_v7->listen;
+  ncclCollNet_v7_as_v9.connect = ncclCollNet_v7->connect;
+  ncclCollNet_v7_as_v9.reduceSupport = ncclCollNet_v7->reduceSupport;
+  ncclCollNet_v7_as_v9.regMr = ncclCollNet_v7_as_v9_regMr;
+  ncclCollNet_v7_as_v9.regMrDmaBuf = ncclCollNet_v7->regMrDmaBuf;
+  ncclCollNet_v7_as_v9.deregMr = ncclCollNet_v7->deregMr;
+  ncclCollNet_v7_as_v9.iallreduce = ncclCollNet_v7_as_v9_iallreduce;
+  ncclCollNet_v7_as_v9.iallgather = nullptr;
+  ncclCollNet_v7_as_v9.ireducescatter = nullptr;
+  ncclCollNet_v7_as_v9.iflush = ncclCollNet_v7->iflush;
+  ncclCollNet_v7_as_v9.test = ncclCollNet_v7->test;
+  ncclCollNet_v7_as_v9.closeColl = ncclCollNet_v7->closeColl;
+  ncclCollNet_v7_as_v9.closeListen = ncclCollNet_v7->closeListen;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclCollNet_v8_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { // Shim: queries the v8 CollNet plugin and fills the v9 properties struct from its answer.
+  ncclNetProperties_v8_t p8;
+  ncclResult_t ans = ncclCollNet_v8->getProperties(dev, &p8);
+  if (ans != ncclSuccess) return ans; // On failure props is left unmodified and the plugin's error is returned.
+  props->name = p8.name;
+  props->pciPath = p8.pciPath;
+  props->guid = p8.guid;
+  props->ptrSupport = p8.ptrSupport;
+  props->regIsGlobal = p8.regIsGlobal;
+  props->forceFlush = 0; // Not read from p8; always reported as 0 by this shim.
+  props->speed = p8.speed;
+  props->port = p8.port;
+  props->maxComms = p8.maxComms;
+  props->maxRecvs = p8.maxRecvs;
+  props->latency = p8.latency;
+  props->netDeviceType    = NCCL_NET_DEVICE_HOST; // Fixed value; nothing is read from p8 for this field.
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; // Fixed value; nothing is read from p8 for this field.
+  props->vProps.ndevs = 1; // Exactly one device entry is reported...
+  props->vProps.devs[0] = dev; // ...and it is the queried device itself.
+  props->maxP2pBytes = MAX_NET_SIZE; // Constant limit, not queried from the plugin.
+  props->maxCollBytes = MAX_COLLNET_SIZE; // Constant limit, not queried from the plugin.
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclCollNet_v8_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { // Shim: forwards a size_t-count iallreduce to the v8 plugin with the count narrowed to int.
+   int countInt;
+   if (count > MAX_NET_SIZE) return ncclInternalError; // Refuse counts above MAX_NET_SIZE before the narrowing cast below.
+   countInt = (int)count; // presumably MAX_NET_SIZE fits in an int (defined outside this hunk) -- TODO confirm
+   ncclResult_t ans = ncclCollNet_v8->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp,
+                  sendMhandle, recvMhandle, request); // All other arguments are passed through unchanged.
+   return ans; // Returns the plugin's result as-is.
+}
+
+static ncclResult_t ncclCollNet_v8_as_v9_iallgather (void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts,
+                           size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+                           void* sendMhandle, void** request) { // Shim: converts one v9 receive part into a v8 part and forwards iallgather to the v8 plugin.
+   ncclNetSGE_v8_t recvPartsInt; // v8-typed copy of the single receive part.
+   if (nRecvParts > 1) return ncclInternalError; // Only one part is converted below, so more than one is rejected.
+   if (recvParts->size > MAX_COLLNET_SIZE) return ncclInternalError; // NOTE(review): recvParts is dereferenced even if nRecvParts is 0 -- confirm callers always pass one part.
+   recvPartsInt.mhandle = recvParts->mhandle;
+   recvPartsInt.address = recvParts->address;
+   recvPartsInt.size = (int)recvParts->size; // Narrowed to int after the MAX_COLLNET_SIZE check above.
+   ncclResult_t ans = ncclCollNet_v8->iallgather(collComm, sendData, nRecvParts, &recvPartsInt,
+                   bytesPerRank, windowOffset, windowBytes,
+                   sendMhandle, request); // Remaining arguments are passed through unchanged.
+   return ans; // Returns the plugin's result as-is.
+}
+
+static ncclResult_t ncclCollNet_v8_as_v9_ireducescatter(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData,
+                               size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+                               ncclDataType_t dataType, ncclRedOp_t redOp,
+                               void* recvMhandle, void** request) { // Shim: converts one v9 send part into a v8 part and forwards ireducescatter to the v8 plugin.
+   ncclNetSGE_v8_t sendPartsInt; // v8-typed copy of the single send part.
+   if (nSendParts > 1) return ncclInternalError; // Only one part is converted below, so more than one is rejected.
+   if (sendParts->size > MAX_COLLNET_SIZE) return ncclInternalError; // NOTE(review): sendParts is dereferenced even if nSendParts is 0 -- confirm callers always pass one part.
+   sendPartsInt.mhandle = sendParts->mhandle;
+   sendPartsInt.address = sendParts->address;
+   sendPartsInt.size = (int)sendParts->size; // Narrowed to int after the MAX_COLLNET_SIZE check above.
+   ncclResult_t ans = ncclCollNet_v8->ireducescatter(collComm, nSendParts, &sendPartsInt,
+                   recvData, bytesPerRank, windowOffset, windowBytes,
+                   dataType, redOp,
+                  recvMhandle, request); // Remaining arguments are passed through unchanged.
+   return ans; // Returns the plugin's result as-is.
+}
+
+// We use a wrapper around the v8 init to copy over the struct contents
+// post-init since they may not be initialized beforehand.
+static ncclResult_t ncclCollNet_v8_as_v9_init(ncclDebugLogger_t logfn) {
+  NCCLCHECK(ncclCollNet_v8->init(logfn)); // Initialize the real v8 plugin before reading its fields.
+  ncclCollNet_v8_as_v9.name = ncclCollNet_v8->name;
+  ncclCollNet_v8_as_v9.devices = ncclCollNet_v8->devices;
+  ncclCollNet_v8_as_v9.getProperties = ncclCollNet_v8_as_v9_getProperties; // Wrapped: fills the v9 properties struct.
+  ncclCollNet_v8_as_v9.listen = ncclCollNet_v8->listen;
+  ncclCollNet_v8_as_v9.connect = ncclCollNet_v8->connect;
+  ncclCollNet_v8_as_v9.reduceSupport = ncclCollNet_v8->reduceSupport;
+  ncclCollNet_v8_as_v9.regMr = ncclCollNet_v8->regMr; // Forwarded directly; no wrapper is installed for v8 regMr.
+  ncclCollNet_v8_as_v9.regMrDmaBuf = ncclCollNet_v8->regMrDmaBuf;
+  ncclCollNet_v8_as_v9.deregMr = ncclCollNet_v8->deregMr;
+  ncclCollNet_v8_as_v9.iallreduce = ncclCollNet_v8_as_v9_iallreduce; // Wrapped: count is narrowed to int.
+  ncclCollNet_v8_as_v9.iallgather = ncclCollNet_v8_as_v9_iallgather; // Wrapped: receive part is converted to the v8 SGE type.
+  ncclCollNet_v8_as_v9.ireducescatter = ncclCollNet_v8_as_v9_ireducescatter; // Wrapped: send part is converted to the v8 SGE type.
+  ncclCollNet_v8_as_v9.iflush = ncclCollNet_v8->iflush;
+  ncclCollNet_v8_as_v9.test = ncclCollNet_v8->test;
+  ncclCollNet_v8_as_v9.closeColl = ncclCollNet_v8->closeColl;
+  ncclCollNet_v8_as_v9.closeListen = ncclCollNet_v8->closeListen;
  return ncclSuccess;
 }

 static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER;
-ncclNet_t* ncclNets[3] = { nullptr, &ncclNetIb, &ncclNetSocket };
-ncclCollNet_t* ncclCollNets[3] = { nullptr, nullptr, nullptr };
+ncclNet_t* ncclNets[NCCL_NET_MAX_PLUGINS] = { nullptr, &ncclNetIb, &ncclNetSocket };
+ncclCollNet_t* ncclCollNets[NCCL_NET_MAX_PLUGINS] = { nullptr, nullptr, nullptr };
 enum ncclNetState {
  ncclNetStateInit = 0,
  ncclNetStateEnabled = 1,
  ncclNetStateDisabled = 2
 };
-enum ncclNetState ncclNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
-enum ncclNetState ncclCollNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
+enum ncclNetState ncclNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
+enum ncclNetState ncclCollNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };

 #define MAX_STR_LEN 255

@@ -443,72 +730,93 @@ ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) {
    goto fail;
  }

-  ncclNets[0] = (ncclNet_v8_t*)dlsym(netPluginLib, "ncclNetPlugin_v8");
+  ncclNets[0] = (ncclNet_v9_t*)dlsym(netPluginLib, "ncclNetPlugin_v9");
  if (ncclNets[0] == nullptr) {
-    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v8 symbol.");
-    // Try v7 plugin
-    ncclNet_v7 = (ncclNet_v7_t*)dlsym(netPluginLib, "ncclNetPlugin_v7");
-    if (ncclNet_v7 == nullptr) {
-      // Try v6 plugin
-      ncclNet_v6 = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6");
-      if (ncclNet_v6 == nullptr) {
-        // Try v5 plugin
-        ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
-        if (ncclNet_v5 == nullptr) {
-          INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported.");
-          goto fail;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v9 symbol.");
+    ncclNet_v8 = (ncclNet_v8_t*)dlsym(netPluginLib, "ncclNetPlugin_v8");
+    if (ncclNet_v8 == nullptr) {
+      // Try v7 plugin
+      ncclNet_v7 = (ncclNet_v7_t*)dlsym(netPluginLib, "ncclNetPlugin_v7");
+      if (ncclNet_v7 == nullptr) {
+        // Try v6 plugin
+        ncclNet_v6 = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6");
+        if (ncclNet_v6 == nullptr) {
+          // Try v5 plugin
+          ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
+          if (ncclNet_v5 == nullptr) {
+            INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported.");
+            goto fail;
+          } else {
+            ncclNets[0] = &ncclNet_v5_as_v9;
+            ncclNet_v5_as_v9.init = ncclNet_v5_as_v9_init;
+            // Set the name right away to allow for NCCL_NET=... to work
+            ncclNet_v5_as_v9.name = ncclNet_v5->name;
+            INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name);
+          }
        } else {
-          ncclNets[0] = &ncclNet_v5_as_v8;
-          ncclNet_v5_as_v8.init = ncclNet_v5_as_v8_init;
+          ncclNets[0] = &ncclNet_v6_as_v9;
+          ncclNet_v6_as_v9.init = ncclNet_v6_as_v9_init;
          // Set the name right away to allow for NCCL_NET=... to work
-          ncclNet_v5_as_v8.name = ncclNet_v5->name;
-          INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name);
+          ncclNet_v6_as_v9.name = ncclNet_v6->name;
+          INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNets[0]->name);
        }
      } else {
-        ncclNets[0] = &ncclNet_v6_as_v8;
-        ncclNet_v6_as_v8.init = ncclNet_v6_as_v8_init;
+        ncclNets[0] = &ncclNet_v7_as_v9;
+        ncclNet_v7_as_v9.init = ncclNet_v7_as_v9_init;
        // Set the name right away to allow for NCCL_NET=... to work
-        ncclNet_v6_as_v8.name = ncclNet_v6->name;
-        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNets[0]->name);
+        ncclNet_v7_as_v9.name = ncclNet_v7->name;
+        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v7)", ncclNets[0]->name);
      }
    } else {
-      ncclNets[0] = &ncclNet_v7_as_v8;
-      ncclNet_v7_as_v8.init = ncclNet_v7_as_v8_init;
+      ncclNets[0] = &ncclNet_v8_as_v9;
+      ncclNet_v8_as_v9.init = ncclNet_v8_as_v9_init;
      // Set the name right away to allow for NCCL_NET=... to work
-      ncclNet_v7_as_v8.name = ncclNet_v7->name;
-      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v7)", ncclNets[0]->name);
+      ncclNet_v8_as_v9.name = ncclNet_v8->name;
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v8)", ncclNets[0]->name);
    }
+  } else {
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v9)", ncclNets[0]->name);
  }

  // Check for CollNet
-  ncclCollNets[0] = (ncclCollNet_v8_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v8");
+  ncclCollNets[0] = (ncclCollNet_v9_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v9");
  if (ncclCollNets[0] == nullptr) {
-    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v8 symbol.");
-    ncclCollNet_v7 = (ncclCollNet_v7_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v7");
-    if (ncclCollNet_v7 == nullptr) {
-      ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6");
-      if (ncclCollNet_v6 == nullptr) {
-        ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
-        if (ncclCollNet_v5 == nullptr) {
-          INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). ncclCollNetPlugin symbols v4 and lower are not supported.");
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v9 symbol.");
+    ncclCollNet_v8 = (ncclCollNet_v8_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v8");
+    if (ncclCollNet_v8 == nullptr) {
+      ncclCollNet_v7 = (ncclCollNet_v7_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v7");
+      if (ncclCollNet_v7 == nullptr) {
+        ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6");
+        if (ncclCollNet_v6 == nullptr) {
+          ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
+          if (ncclCollNet_v5 == nullptr) {
+            INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). ncclCollNetPlugin symbols v4 and lower are not supported.");
+          } else {
+            ncclCollNets[0] = &ncclCollNet_v5_as_v9;
+            ncclCollNet_v5_as_v9.init = ncclCollNet_v5_as_v9_init;
+            ncclCollNet_v5_as_v9.name = ncclCollNet_v5->name;
+            INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v5)", ncclCollNets[0]->name);
+          }
        } else {
-          ncclCollNets[0] = &ncclCollNet_v5_as_v8;
-          ncclCollNet_v5_as_v8.init = ncclCollNet_v5_as_v8_init;
-          ncclCollNet_v5_as_v8.name = ncclCollNet_v5->name;
-          INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v5)", ncclCollNets[0]->name);
+         ncclCollNets[0] = &ncclCollNet_v6_as_v9;
+         ncclCollNet_v6_as_v9.init = ncclCollNet_v6_as_v9_init;
+         ncclCollNet_v6_as_v9.name = ncclCollNet_v6->name;
+         INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v6)", ncclCollNets[0]->name);
        }
      } else {
-        ncclCollNets[0] = &ncclCollNet_v6_as_v8;
-        ncclCollNet_v6_as_v8.init = ncclCollNet_v6_as_v8_init;
-        ncclCollNet_v6_as_v8.name = ncclCollNet_v6->name;
-        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v6)", ncclCollNets[0]->name);
+       ncclCollNets[0] = &ncclCollNet_v7_as_v9;
+       ncclCollNet_v7_as_v9.init = ncclCollNet_v7_as_v9_init;
+       ncclCollNet_v7_as_v9.name = ncclCollNet_v7->name;
+       INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v7)", ncclCollNets[0]->name);
      }
    } else {
-      ncclCollNets[0] = &ncclCollNet_v7_as_v8;
-      ncclCollNet_v7_as_v8.init = ncclCollNet_v7_as_v8_init;
-      ncclCollNet_v7_as_v8.name = ncclCollNet_v7->name;
-      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v7)", ncclCollNets[0]->name);
+      ncclCollNets[0] = &ncclCollNet_v8_as_v9;
+      ncclCollNet_v8_as_v9.init = ncclCollNet_v8_as_v9_init;
+      ncclCollNet_v8_as_v9.name = ncclCollNet_v8->name;
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v8)", ncclCollNets[0]->name);
    }
+  } else {
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v9)", ncclCollNets[0]->name);
  }

  ++netPluginRefCount;
@@ -539,6 +847,8 @@ ncclResult_t ncclNetPluginUnload(struct ncclComm* comm) {
    ncclCollNets[0] = nullptr;
    netPluginStatus = netPluginLoadReady;
    comm->netPluginLoaded = 0;
+    for (int i = 0; i < NCCL_NET_MAX_PLUGINS; ++i)
+      ncclCollNetStates[i] = ncclNetStates[i] = ncclNetStateInit;
  }
  pthread_mutex_unlock(&netPluginLock);
  return ncclSuccess;
@@ -561,7 +871,7 @@ ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, in
        return ncclInternalError;
      }
    default:
-      WARN("Unknown device code index");
+      WARN("Unknown device code index %d \n", type);
      return ncclInternalError;
  }

@@ -715,8 +1025,9 @@ cleanup1:

 int ncclNetVersion(struct ncclComm* comm) {
  return
-    (comm->ncclNet == &ncclNet_v5_as_v8) ? 5 :
-    (comm->ncclNet == &ncclNet_v6_as_v8) ? 6 :
-    (comm->ncclNet == &ncclNet_v7_as_v8) ? 7 :
-    8;
+    (comm->ncclNet == &ncclNet_v5_as_v9) ? 5 : // comm->ncclNet is the v5 compatibility shim struct
+    (comm->ncclNet == &ncclNet_v6_as_v9) ? 6 : // comm->ncclNet is the v6 compatibility shim struct
+    (comm->ncclNet == &ncclNet_v7_as_v9) ? 7 : // comm->ncclNet is the v7 compatibility shim struct
+    (comm->ncclNet == &ncclNet_v8_as_v9) ? 8 : // comm->ncclNet is the v8 compatibility shim struct
+    9; // No shim matched: report version 9.
 }
@@ -364,7 +364,11 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr
  sub->channelId = op->channelId;
  sub->nsteps = op->nsteps;
  sub->nbytes = op->nbytes;
+  sub->chunkSize = op->chunkSize;
  sub->offset = 0;
+  sub->loopSize = op->loopSize;
+  sub->loopOffset = op->loopOffset;
+  sub->isOneRPN = op->isOneRPN;
  sub->peer = op->peer;
  sub->reg = op->reg;
  sub->sendMhandle = op->sendMhandle;
@@ -374,8 +378,9 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr
  sub->eActivationMask = op->eActivationMask;
  sub->taskEventHandle = op->taskEventHandle;
  sub->rank = op->rank;
-  args->pid = op->pid;
-  args->profilerContext = op->profilerContext;
+  sub->pid = op->pid;
+  sub->profilerContext = op->profilerContext;
+  sub->ringAlgo = op->ringAlgo;
  args->nsubs = subIndex+1;
  if (subIndex) {
    if ((args->sliceSteps != op->sliceSteps) ||
@@ -404,6 +409,7 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr
  args->pattern = op->pattern;
  args->protocol = op->protocol;
  args->coll = op->coll;
+  args->algorithm = op->algorithm;
  args->specifics = op->specifics;
  args->state = ncclProxyOpReady;
  args->progress = op->connection->tcomm->proxyProgress;
@@ -485,6 +491,7 @@ static ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyCon
  }
  if (op->next != -1) __builtin_prefetch(pool->ops+op->next); // Prefetch next free op
  memcpy(op, proxyOp, sizeof(struct ncclProxyOp));
+  if (proxyOp->ringAlgo) proxyOp->ringAlgo->incRefCount();
  op->next = -1;
  op->connection = proxyConn->connection;
  if (proxyOps->nextOps == -1) {
@@ -601,13 +608,15 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool
    } break;
  case ncclPatternPatUp: {
      // Run full algorithm to count the number of steps for each peer.
-      int *nstepsSend, *nstepsRecv;
-      const int rank = comm->rank, nranks = comm->nRanks;
-      NCCLCHECK(ncclCalloc(&nstepsSend, log2Up(nranks)));
-      NCCLCHECK(ncclCalloc(&nstepsRecv, log2Up(nranks)));
+      ncclResult_t result = ncclSuccess;
      const ssize_t size = op->nbytes/comm->nRanks;
-      PatRSAlgorithm<char> algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks);
      int last = 0;
+      int *nstepsSend = NULL, *nstepsRecv = NULL;
+      const int rank = comm->rank, nranks = comm->nRanks;
+      PatRSAlgorithm<char> algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks);
+      NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit_pat_up);
+      NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit_pat_up);
+
      while (last == 0) {
        int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, postRecv, postSend, nelem;
        size_t inpIx, outIx;
@@ -619,24 +628,30 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool
        if (nstepsSend[i]) {
          int sendPeer = (rank + (1<<i)) % nranks;
          op->nsteps = nstepsSend[i];
-          NCCLCHECK(SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire));
+          NCCLCHECKGOTO(SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire), result, exit_pat_up);
        }
        if (nstepsRecv[i]) {
          int recvPeer = (rank - (1<<i) + nranks) % nranks;
          op->nsteps = nstepsRecv[i];
-          NCCLCHECK(SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire));
+          NCCLCHECKGOTO(SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire), result, exit_pat_up);
        }
      }
+    exit_pat_up:
+      free(nstepsSend);
+      free(nstepsRecv);
+      NCCLCHECK(result);
    } break;
  case ncclPatternPatDown: {
      // Run full algorithm to count the number of steps for each peer.
-      int *nstepsSend, *nstepsRecv;
-      const int rank = comm->rank, nranks = comm->nRanks;
-      NCCLCHECK(ncclCalloc(&nstepsSend, log2Up(nranks)));
-      NCCLCHECK(ncclCalloc(&nstepsRecv, log2Up(nranks)));
+      ncclResult_t result = ncclSuccess;
      const ssize_t size = op->nbytes/comm->nRanks;
-      PatAGAlgorithm<char> algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks);
      int last = 0;
+      int *nstepsSend = NULL, *nstepsRecv = NULL;
+      const int rank = comm->rank, nranks = comm->nRanks;
+      PatAGAlgorithm<char> algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks);
+      NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit_pat_down);
+      NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit_pat_down);
+
      while (last == 0) {
        int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem;
        size_t inpIx, outIx;
@@ -648,14 +663,18 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool
        if (nstepsSend[i]) {
          int sendPeer = (rank - (1<<i) + nranks) % nranks;
          op->nsteps = nstepsSend[i];
-          NCCLCHECK(SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire));
+          NCCLCHECKGOTO(SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire), result, exit_pat_down);
        }
        if (nstepsRecv[i]) {
          int recvPeer = (rank + (1<<i)) % nranks;
          op->nsteps = nstepsRecv[i];
-          NCCLCHECK(SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire));
+          NCCLCHECKGOTO(SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire), result, exit_pat_down);
        }
      }
+    exit_pat_down:
+      free(nstepsSend);
+      free(nstepsRecv);
+      NCCLCHECK(result);
    } break;
  case ncclPatternSend:
  case ncclPatternRecv: {
@@ -735,23 +754,17 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int

  if (state->active == NULL) {
    pthread_mutex_lock(&pool->mutex);
-    while (pool->nextOps == -1 && !state->stop) {
+    if (pool->nextOps == -1 && !state->stop) {
      ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle);
      ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlSleep);
      pthread_cond_wait(&pool->cond, &pool->mutex);
      ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlWakeup);
      ncclProfilerStopProxyCtrlEvent(eHandle);
    }
-    if (state->stop) { // We might have been woken up to stop.
-      pthread_mutex_unlock(&pool->mutex);
-      return ncclSuccess;
-    }
  }
-
  state->nextOps = pool->nextOps;
  pool->nextOps = pool->nextOpsEnd = -1;
  pthread_mutex_unlock(&pool->mutex);
-  if (state->nextOps == -1) return ncclInternalError;

 process_nextops:
  ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle);
@@ -889,7 +902,7 @@ void* ncclProxyProgress(void *proxyState_) {
   * ncclParamProgressAppendOpFreq(). If they are equal, we will append proxy ops. This will decrease the
   * frequency of calling ncclProxyGetPostedOps() and reduce the perf impact. */
  int proxyOpAppendCounter = 0;
-  while (state->stop == 0 || (state->stop == 1 && state->active)) {
+  do {
    int idle = 1;
    ncclResult_t ret = progressOps(proxyState, state, state->active, &idle);
    if (ret != ncclSuccess) {
@@ -902,12 +915,11 @@ void* ncclProxyProgress(void *proxyState_) {
    if (lastIdle == 0 && idle == 1) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlIdle);
    if (lastIdle == 1 && idle == 0) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlActive);
    ncclProfilerStopProxyCtrlEvent(eHandle);
-    if (idle || (++proxyOpAppendCounter == ncclParamProgressAppendOpFreq())) {
+    if (idle || !state->active || (++proxyOpAppendCounter == ncclParamProgressAppendOpFreq())) {
      int added = 0;
      proxyOpAppendCounter = 0;
      TIME_START(3);
-      if (state->stop == 0)
-        ret = ncclProxyGetPostedOps(proxyState, &added);
+      ret = ncclProxyGetPostedOps(proxyState, &added);
      if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); }
      if (ret != ncclSuccess) {
        __atomic_store_n(&proxyState->asyncResult, ret, __ATOMIC_RELEASE);
@@ -918,7 +930,7 @@ void* ncclProxyProgress(void *proxyState_) {
      }
    }
    lastIdle = idle;
-  }
+  } while (state->stop == 0 || (state->stop == 1 && state->active));
  return NULL;
 }

@@ -1090,7 +1102,7 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
    strncpy(poolPath+sizeof("/dev/shm/nccl-")-1, resp.devShmPath, sizeof("XXXXXX")-1);
    struct ncclProxyOps* proxyOps = sharedProxyState->proxyOps + proxyConn->tpLocalRank;
    if (proxyOps->pool == NULL) {
-      NCCLCHECK(ncclShmOpen(poolPath, sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, -1, &proxyOps->handle));
+      NCCLCHECK(ncclShmOpen(poolPath, sizeof(poolPath), sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, -1, &proxyOps->handle));
      proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1;
    }
  }
@@ -1293,7 +1305,7 @@ static ncclResult_t proxyProgressInit(struct ncclProxyState* proxyState) {

    char shmPath[sizeof("/dev/shm/nccl-XXXXXX")];
    shmPath[0] = '\0';
-    NCCLCHECK(ncclShmOpen(shmPath, size, (void**)&pool, NULL, proxyState->tpLocalnRanks, &state->handle));
+    NCCLCHECK(ncclShmOpen(shmPath, sizeof(shmPath), size, (void**)&pool, NULL, proxyState->tpLocalnRanks, &state->handle));
    // Init pool
    pool->nextOps = -1;

@@ -1372,7 +1384,7 @@ static ncclResult_t proxyQueryFd(struct ncclProxyState* proxyState, int rank, vo
  ncclResult_t ret = ncclSuccess;

  NCCLCHECKGOTO(ncclIpcSocketInit(&ipcSock, proxyState->tpRank, hash^1, proxyState->abortFlag), ret, exit);
-  NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &rmtFd, sizeof(int), rmtFd, rank, hash), ret, exit);
+  NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &rmtFd, sizeof(int), -1, rank, hash), ret, exit);
 exit:
  NCCLCHECK(ncclIpcSocketClose(&ipcSock));
  return ncclSuccess;
@@ -1603,7 +1615,7 @@ void* ncclProxyService(void* _args) {
      if (pollfds[s].fd == -1) continue;

      // Progress all ops for this ncclProxyLocalPeer
-      if (stop == PROXY_ABORT && ncclCuMemEnable() && ncclCuMemHostEnable() && !proxyState->directMode) closeConn = 1;
+      if (stop == PROXY_ABORT && ncclCuMemEnable() && ncclCuMemHostEnable() && !proxyState->directMode && __atomic_load_n(&proxyState->stop, __ATOMIC_ACQUIRE)) closeConn = 1;
      ncclProxyAsyncOp* op = peer->asyncOps;
      while (op != nullptr) {
        ncclProxyAsyncOp* opnext = op->next; /* in case op is freed in proxyProgressAsync */
@@ -1692,11 +1704,17 @@ static ncclResult_t proxyUDSRecvReq(struct ncclProxyState* proxyState, int reqFd

  NCCLCHECK(ncclIpcSocketRecvMsg(&proxyState->ipcSock, &hdr, sizeof(hdr), &rmtFd));
  if (hdr.type == ncclProxyMsgGetFd) {
-    // cuMem API support
+    // cuMem API support for the non-UB case. rmtFd is not used here since the UDS proxy thread needs to
+    // export the fd from the handle and send it back to the main thread to import the buffer. We just
+    // need to close this dummy rmtFd.
    uint64_t handle = *(uint64_t*)hdr.data;
    INFO(NCCL_PROXY, "proxyUDSRecvReq::ncclProxyMsgGetFd rank %d opId %p handle=0x%lx", hdr.rank, hdr.opId, handle);
+    close(rmtFd);
    return proxyGetFd(proxyState, hdr.rank, hdr.opId, handle);
  } else if (hdr.type == ncclProxyMsgQueryFd) {
+    // The remote main thread registers a buffer into this rank: it queries the rmtFd of this rank through UDS
+    // and the rmtFd is returned unchanged back to the remote main thread, which will use rmtFd to call into
+    // the proxy service thread for buffer registration.
    INFO(NCCL_PROXY, "proxyUDSRecvReq::proxyQueryFd rank %d opId %p rmtFd %d", hdr.rank, hdr.opId, rmtFd);
    return proxyQueryFd(proxyState, hdr.rank, hdr.opId, rmtFd);
  }
@@ -1743,7 +1761,7 @@ void* ncclProxyServiceUDS(void* _args) {
    }
  }

-  ncclIpcSocketClose(&proxyState->ipcSock);
+  (void)ncclIpcSocketClose(&proxyState->ipcSock);
  INFO(NCCL_PROXY, "[Proxy Service UDS] exit: stop %d abortFlag %d", proxyState->stop, *proxyState->abortFlag);
  return NULL;
 }
@@ -1800,15 +1818,10 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
    struct ncclProxyState* sharedProxyState = comm->proxyState;

    if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) {
-      if (comm->proxyState->threadUDS) {
-        // UDS support
-        __atomic_store_n(&comm->proxyState->stop, 1, __ATOMIC_RELEASE);
-      }
-
      if (*comm->abortFlag == 0 && sharedProxyState->peerAddresses) {
        struct ncclSocket sock;
        int type = ncclProxyMsgStop;
-        ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag);
+        NCCLCHECK(ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag));
        if (ncclSocketConnect(&sock) == ncclSuccess) {
          (void)ncclSocketSend(&sock, &type, sizeof(int));
        }
@@ -1835,6 +1848,8 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
          }
        }
      }
+      // Now we notify proxy service and UDS thread to exit.
+      __atomic_store_n(&comm->proxyState->stop, 1, __ATOMIC_RELEASE);
    }
  }

@@ -0,0 +1,318 @@
+/*************************************************************************
+ * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <cerrno>
+#include <climits>
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <getopt.h>
+#include <netdb.h>
+#include <unistd.h>
+
+#include "nccl.h"
+#define NCCL_RAS_CLIENT // Only pull client-specific definitions from the header file below.
+#include "ras_internal.h"
+
+#define STR2(v) #v
+#define STR(v) STR2(v)
+
+// Local timeout increment compared to the '-t' argument, in seconds.
+#define TIMEOUT_INCREMENT 1
+
+static const char* hostName = "localhost";
+static const char* port = STR(NCCL_RAS_CLIENT_PORT);
+static int timeout = -1;
+static bool verbose = false;
+static int sock = -1;
+
+static void printUsage(const char* argv0) { // Prints the usage text to stderr; argv0 fills in the program name.
+  fprintf(stderr,
+          "Usage: %s [OPTION]...\n"
+          "Query the state of a running NCCL job.\n"
+          "\nOptions:\n"
+          "  -h, --host=HOST     Host name or IP address of the RAS client socket of the\n"
+          "                      NCCL job to connect to (localhost by default)\n"
+          "  -p, --port=PORT     TCP port of the RAS client socket of the NCCL job\n"
+          "                      (" STR(NCCL_RAS_CLIENT_PORT) " by default)\n"
+          "  -t, --timeout=SECS  Maximum time for the local NCCL process to wait for\n"
+          "                      responses from other NCCL processes\n"
+          "                      (" STR(RAS_COLLECTIVE_LEG_TIMEOUT_SEC) " secs by default; 0 disables the timeout)\n"
+          "  -v, --verbose       Increase the verbosity level of the RAS output\n"
+          "      --help          Print this help and exit\n"
+          "      --version       Print the version number and exit\n", argv0);
+}
+
+static void parseArgs(int argc, char** argv) { // Parses the command line into the file-level option variables.
+  int c, optIdx = 0;
+  struct option longOpts[] = {
+    {"host",    required_argument, NULL, 'h'},
+    {"port",    required_argument, NULL, 'p'},
+    {"timeout", required_argument, NULL, 't'},
+    {"verbose", no_argument,       NULL, 'v'},
+    {"help",    no_argument,       NULL, 'e'}, // Long-only: 'e' is absent from the short option string below.
+    {"version", no_argument,       NULL, 'r'}, // Long-only: 'r' is absent from the short option string below.
+    {0}
+  };
+
+  while ((c = getopt_long(argc, argv, "h:p:t:v", longOpts, &optIdx)) != -1) {
+    switch (c) {
+      case 'h':
+        hostName = optarg;
+        break;
+      case 'p':
+        port = optarg;
+        break;
+      case 't': {
+        char* endPtr = nullptr;
+        long val = strtol(optarg, &endPtr, 10); // Keep the full long so that out-of-range input can be rejected.
+        if (endPtr == optarg || *endPtr != '\0' || val < 0 || val > INT_MAX) { // Empty, garbage or out of range.
+          fprintf(stderr, "Invalid timeout: %s\n", optarg);
+          exit(1);
+        }
+        timeout = (int)val; // Safe: val was just checked to be within [0, INT_MAX].
+        break;
+      }
+      case 'v':
+        verbose = true;
+        break;
+      case 'e':
+        printUsage(argv[0]);
+        exit(0);
+      case 'r':
+        fprintf(stderr, "NCCL RAS client version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "."
+                STR(NCCL_PATCH) NCCL_SUFFIX "\n");
+        exit(0);
+      default: // Unknown option or missing argument.
+        printUsage(argv[0]);
+        exit(1);
+    }
+  }
+}
+
+static ssize_t socketWrite(int fd, const void* buf, size_t count) { // Writes all of buf; returns count, or -1 on error.
+  size_t done = 0;
+  do {
+    ssize_t ret;
+    ret = write(fd, ((const char*)buf)+done, count-done);
+    if (ret == -1) {
+      if (errno != EINTR)
+        return -1; // errno is left as set by write() for the caller to inspect.
+      continue; // Interrupted: go back to the loop condition and try again.
+    }
+    done += ret; // write() may be partial; the next iteration resumes after the bytes already written.
+  } while (done < count);
+
+  return done;
+}
+
+// Reads a message from RAS.  Assumes that the message ends with '\n' (will continue reading until the terminating
+// newline, unless false is passed as untilNewline, in which case a single successful read is performed).
+// Terminates the buffer with '\0'.  Returns the number of bytes read (excluding the added terminating '\0'),
+// 0 on EOF before any data, or -1 on a read error (errno is left as set by read()).
+static ssize_t rasRead(int fd, void* buf, size_t count, bool untilNewline = true) {
+  char* bufChar = (char*)buf;
+  size_t done = 0;
+  do {
+    ssize_t ret;
+    do { // Retry EINTR here; a "continue" in the outer loop would end it when !untilNewline and mimic EOF.
+      ret = read(fd, bufChar+done, count-1-done); // Reserve one byte for the terminating '\0'.
+    } while (ret == -1 && errno == EINTR);
+    if (ret == -1)
+      return -1;
+    if (ret == 0)
+      break; // EOF
+    done += ret;
+  } while (untilNewline && (done == 0 || bufChar[done-1] != '\n'));
+  bufChar[done] = '\0';
+
+  return done;
+}
+
+static int connectToNCCL() { // Connects to NCCL and performs the RAS handshake; returns 0 on success, 1 on failure.
+  struct addrinfo hints = {0};
+  struct addrinfo* addrInfo = nullptr;
+  int ret, bytes;
+  char msgBuf[1024];
+  struct timeval tv = {TIMEOUT_INCREMENT, 0};
+
+retry:
+  hints.ai_family = AF_UNSPEC;
+  hints.ai_socktype = SOCK_STREAM;
+  if ((ret = getaddrinfo(hostName, port, &hints, &addrInfo)) != 0) {
+    fprintf(stderr, "Resolving %s:%s: %s\n", hostName, port, gai_strerror(ret));
+    goto fail;
+  }
+  for (struct addrinfo* ai = addrInfo; ai; ai = ai->ai_next) {
+    char hostBuf[NI_MAXHOST], portBuf[NI_MAXSERV];
+    int err;
+    sock = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
+    if (sock == -1) {
+      perror("socket");
+      continue;
+    }
+    // Initially start with a small, 1-sec timeout to quickly eliminate non-responsive processes...
+    if (timeout && (setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof tv) != 0 ||
+                    setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof tv) != 0)) {
+      perror("setsockopt");
+      // Non-fatal; fall through.
+    }
+    if (connect(sock, ai->ai_addr, ai->ai_addrlen) == 0)
+      break;
+    err = errno; // Save it now: the calls below may overwrite errno before we report it.
+    if (getnameinfo(ai->ai_addr, ai->ai_addrlen, hostBuf, sizeof(hostBuf), portBuf, sizeof(portBuf),
+                    NI_NUMERICHOST | NI_NUMERICSERV) != 0) {
+      snprintf(hostBuf, sizeof(hostBuf), "%s", hostName); // Bounded copy: hostName/port come straight from argv.
+      snprintf(portBuf, sizeof(portBuf), "%s", port);
+    }
+    fprintf(stderr, "Connecting to %s:%s: %s\n", hostBuf, portBuf, strerror(err));
+    close(sock);
+    sock = -1;
+  }
+  freeaddrinfo(addrInfo);
+  addrInfo = nullptr;
+
+  if (sock == -1) {
+    fprintf(stderr, "Failed to connect to the NCCL RAS service!\n"
+            "Please make sure that the NCCL job has the RAS service enabled and that\n"
+            "%s.\n",
+            (strcmp(hostName, "localhost") || strcmp(port, STR(NCCL_RAS_CLIENT_PORT)) ?
+            "the host/port arguments are correct and match NCCL_RAS_ADDR" :
+            "the RAS client was started on a node where the NCCL job is running"));
+    goto fail;
+  }
+
+  // Exchange the RAS client handshake.
+  strcpy(msgBuf, "CLIENT PROTOCOL " STR(NCCL_RAS_CLIENT_PROTOCOL) "\n");
+  if (socketWrite(sock, msgBuf, strlen(msgBuf)) != strlen(msgBuf)) {
+    if (errno == EAGAIN || errno == EWOULDBLOCK) {
+      goto timeout;
+    }
+    perror("write to socket");
+    goto fail;
+  }
+  bytes = rasRead(sock, msgBuf, sizeof(msgBuf));
+  if (bytes < 0) {
+    if (errno == EAGAIN || errno == EWOULDBLOCK) {
+      goto timeout;
+    }
+    perror("read socket");
+    goto fail;
+  }
+  if (bytes == 0) {
+    fprintf(stderr, "NCCL unexpectedly closed the connection\n");
+    goto fail;
+  }
+  if (strncasecmp(msgBuf, "SERVER PROTOCOL ", strlen("SERVER PROTOCOL "))) {
+    fprintf(stderr, "Unexpected response from NCCL: %s\n", msgBuf);
+    goto fail;
+  }
+  if (strtol(msgBuf+strlen("SERVER PROTOCOL "), nullptr, 10) != NCCL_RAS_CLIENT_PROTOCOL) {
+    fprintf(stderr, "NCCL RAS protocol version mismatch (NCCL: %s; RAS client: %d)!\n"
+            "Will try to continue in spite of that...\n", msgBuf+strlen("SERVER PROTOCOL "), NCCL_RAS_CLIENT_PROTOCOL);
+  }
+
+  if (timeout >= 0) {
+    snprintf(msgBuf, sizeof(msgBuf), "TIMEOUT %d\n", timeout);
+    if (socketWrite(sock, msgBuf, strlen(msgBuf)) != strlen(msgBuf)) {
+      if (errno == EAGAIN || errno == EWOULDBLOCK) {
+        goto timeout;
+      }
+      perror("write to socket");
+      goto fail;
+    }
+    bytes = rasRead(sock, msgBuf, sizeof(msgBuf));
+    if (bytes < 0) {
+      if (errno == EAGAIN || errno == EWOULDBLOCK) {
+        goto timeout;
+      }
+      perror("read socket");
+      goto fail;
+    }
+    if (bytes == 0) {
+      fprintf(stderr, "NCCL unexpectedly closed the connection\n");
+      goto fail;
+    }
+    if (strcasecmp(msgBuf, "OK\n")) {
+      fprintf(stderr, "Unexpected response from NCCL: %s\n", msgBuf);
+      goto fail;
+    }
+  }
+  if (timeout) {
+    // Increase the socket timeout to accommodate NCCL timeout.
+    tv.tv_sec += (timeout > 0 ? timeout : RAS_COLLECTIVE_LEG_TIMEOUT_SEC) + RAS_COLLECTIVE_EXTRA_TIMEOUT_SEC;
+    if (setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof tv) != 0) {
+      perror("setsockopt");
+      // Non-fatal; fall through.
+    }
+  }
+
+  return 0;
+fail:
+  if (addrInfo)
+    freeaddrinfo(addrInfo);
+  if (sock != -1)
+    (void)close(sock);
+  return 1;
+timeout:
+  fprintf(stderr, "Connection timed out; retrying...\n");
+  (void)close(sock);
+  sock = -1; // Forget the closed descriptor so that a failure during the retry cannot close it a second time.
+  goto retry;
+}
+
+int getNCCLStatus() { // Sends the STATUS request and copies the reply to stdout; returns 0 on success, 1 on error.
+  char msgBuf[4096];
+  int bytes;
+  snprintf(msgBuf, sizeof(msgBuf), "%sSTATUS\n", (verbose ? "VERBOSE " : ""));
+  if (socketWrite(sock, msgBuf, strlen(msgBuf)) != strlen(msgBuf)) {
+    if (errno == EAGAIN || errno == EWOULDBLOCK)
+      fprintf(stderr, "Connection timed out\n");
+    else
+      perror("write to socket");
+    return 1;
+  }
+  for (;;) { // Relay the reply chunk by chunk until NCCL closes the connection.
+    bytes = rasRead(sock, msgBuf, sizeof(msgBuf), /*untilNewline*/false);
+    if (bytes < 0) {
+      if (errno == EAGAIN || errno == EWOULDBLOCK)
+        fprintf(stderr, "Connection timed out\n");
+      else
+        perror("read socket");
+      return 1;
+    }
+    if (bytes == 0) // EOF
+      break;
+    if (fwrite(msgBuf, 1, bytes, stdout) != bytes) {
+      fprintf(stderr, "fwrite to stdout failed!\n");
+      return 1;
+    }
+    if (fflush(stdout) != 0) { // Flush after every chunk so that partial output shows up right away.
+      perror("fflush stdout");
+      return 1;
+    }
+  }
+  return 0;
+}
+
+int main(int argc, char** argv) { // Returns 0 on success, 1 if connecting, querying or closing the socket failed.
+  parseArgs(argc, argv); // May exit the process directly (usage/version output or invalid arguments).
+
+  if (connectToNCCL())
+    return 1;
+
+  if (getNCCLStatus()) {
+    (void)close(sock); // Already failing; the result of close() is deliberately ignored.
+    return 1;
+  }
+
+  if (close(sock) == -1) {
+    perror("close socket");
+    return 1;
+  }
+  return 0;
+}
@@ -0,0 +1,762 @@
+/*************************************************************************
+ * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NDEBUG // Comment out during development only!
+#include <cassert>
+#include <mutex>
+
+#include "alloc.h"
+#include "checks.h"
+#include "comm.h"
+#include "nccl.h"
+#include "utils.h"
+#include "ras_internal.h"
+
+// The number of recent collectives to keep track of.  Completely arbitrary.
+#define COLL_HISTORY_SIZE 64
+
+// An entry in the rasCollHistory array keeping track of recently completed collectives (to make it possible to
+// identify and drop duplicates arriving over different links).
+struct rasCollHistoryEntry {
+  union ncclSocketAddress rootAddr;
+  uint64_t rootId;
+};
+
+// Array keeping track of recently completed collectives (to avoid infinite loops).  LRU-based replacement.
+static struct rasCollHistoryEntry rasCollHistory[COLL_HISTORY_SIZE];
+static int nRasCollHistory, rasCollHistNextIdx;
+
+// Monotonically increased to ensure that each collective originating locally has a unique Id.
+static uint64_t rasCollLastId;
+
+// Array keeping track of ongoing collective operations (apart from broadcasts, which have no response so require
+// no such tracking).
+struct rasCollective* rasCollectives;
+static int nRasCollectives;
+
+static ncclResult_t getNewCollEntry(struct rasCollective** pColl);
+static ncclResult_t rasLinkSendCollReq(struct rasLink* link, struct rasCollective* coll,
+                                       const struct rasCollRequest* req, size_t reqLen, int fromConnIdx);
+static ncclResult_t rasConnSendCollReq(struct rasConnection* conn, const struct rasCollRequest* req, size_t reqLen);
+static ncclResult_t rasCollReadyResp(struct rasCollective* coll);
+static ncclResult_t rasConnSendCollResp(struct rasConnection* conn,
+                                        const union ncclSocketAddress* rootAddr, uint64_t rootId,
+                                        const union ncclSocketAddress* peers, int nPeers,
+                                        const char* data, int nData, int nLegTimeouts);
+
+static ncclResult_t rasCollConnsInit(char** pData, int* pNData);
+static ncclResult_t rasCollConnsMerge(struct rasCollective* coll, struct rasMsg* msg);
+
+static ncclResult_t rasCollCommsInit(char** pData, int* pNData);
+static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* msg);
+static int ncclCommsCompare(const void* p1, const void* p2);
+
+
+///////////////////////////////////////////////////////////////////////////////////////
+// Functions related to the initialization of collectives and the message exchanges. //
+///////////////////////////////////////////////////////////////////////////////////////
+
+// Returns (via pColl) the first available entry in the rasCollectives array, enlarging the array if necessary.
+static ncclResult_t getNewCollEntry(struct rasCollective** pColl) {
+  struct rasCollective* coll;
+  int i;
+  for (i = 0; i < nRasCollectives; i++)
+    if (rasCollectives[i].type == RAS_MSG_NONE) // An entry of type RAS_MSG_NONE is treated as available.
+      break;
+  if (i == nRasCollectives) { // No available entry: grow the array by RAS_INCREMENT and use the first new slot.
+    NCCLCHECK(ncclRealloc(&rasCollectives, nRasCollectives, nRasCollectives+RAS_INCREMENT));
+    nRasCollectives += RAS_INCREMENT;
+  }
+
+  coll = rasCollectives+i;
+  memset(coll, '\0', sizeof(*coll));
+  coll->startTime = clockNano();
+  coll->fromConnIdx = -1; // Callers overwrite this when the request arrived over a connection.
+  // We are unlikely to use the whole array, but at least we won't need to realloc.
+  NCCLCHECK(ncclCalloc(&coll->fwdConns, nRasConns));
+
+  *pColl = coll; // Only written on success.
+  return ncclSuccess;
+}
+
+// Initializes a collective request by giving it a unique ID (the local listening address plus a local counter).
+void rasCollReqInit(struct rasCollRequest* req) {
+  memcpy(&req->rootAddr, &rasNetListeningSocket.addr, sizeof(req->rootAddr));
+  req->rootId = ++rasCollLastId;
+}
+
+// Sends a collective request message through all regular RAS network connections (effectively, broadcasts it).
+// Also used for re-broadcasts (on peers receiving the request over the network).
+// Checking for duplicates is the responsibility of the caller.
+// For collectives other than broadcasts, initializes a rasCollective structure and fills it with local data,
+// in preparation for collective response messages.
+// pAllDone indicates on return if the collective operation is already finished, which is unusual, but possible
+// in scenarios such as a total of two peers.
+// pCollIdx provides on return an index of the allocated rasCollective structure to track this collective (unless
+// it's a broadcast, which requires no such tracking).  Neither pAllDone nor pCollIdx is written for broadcasts.
+ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen, bool* pAllDone, int* pCollIdx,
+                               int fromConnIdx) {
+  struct rasCollective* coll = nullptr;
+  if (req->type >= RAS_COLL_CONNS) {
+    // Keep track of this collective operation so that we can handle the responses appropriately.
+    NCCLCHECK(getNewCollEntry(&coll));
+    if (pCollIdx)
+      *pCollIdx = coll-rasCollectives;
+    memcpy(&coll->rootAddr, &req->rootAddr, sizeof(coll->rootAddr));
+    coll->rootId = req->rootId;
+    coll->type = req->type;
+    coll->timeout = req->timeout;
+    coll->fromConnIdx = fromConnIdx;
+    if (ncclCalloc(&coll->peers, 1) == ncclSuccess) { // Best effort: on failure the peers list simply stays empty.
+      memcpy(coll->peers, &rasNetListeningSocket.addr, sizeof(*coll->peers));
+      coll->nPeers = 1;
+    }
+
+    // Collective-specific initialization of accumulated data (using local data for now).
+    if (req->type == RAS_COLL_CONNS)
+      (void)rasCollConnsInit(&coll->data, &coll->nData);
+    else if (req->type == RAS_COLL_COMMS)
+      (void)rasCollCommsInit(&coll->data, &coll->nData);
+  } else { // req->type < RAS_COLL_CONNS
+    // Add the info to the collective message history.
+    nRasCollHistory = std::min(nRasCollHistory+1, COLL_HISTORY_SIZE);
+    memcpy(&rasCollHistory[rasCollHistNextIdx].rootAddr, &req->rootAddr,
+           sizeof(rasCollHistory[rasCollHistNextIdx].rootAddr));
+    rasCollHistory[rasCollHistNextIdx].rootId = req->rootId;
+    rasCollHistNextIdx = (rasCollHistNextIdx + 1) % COLL_HISTORY_SIZE;
+
+    // Collective-specific message handling.
+    if (req->type == RAS_BC_DEADPEER) {
+      bool done = false;
+      rasMsgHandleBCDeadPeer(req, &done);
+      if (done) // The handler reported that no forwarding is needed.
+        goto exit;
+    }
+  } // req->type < RAS_COLL_CONNS
+
+  for (int connIdx = 0; connIdx < nRasConns; connIdx++)
+    rasConns[connIdx].linkFlag = false; // Clear the per-connection marks that rasLinkSendCollReq() checks and sets.
+
+  (void)rasLinkSendCollReq(&rasNextLink, coll, req, reqLen, fromConnIdx);
+  (void)rasLinkSendCollReq(&rasPrevLink, coll, req, reqLen, fromConnIdx);
+
+  if (coll && pAllDone)
+    *pAllDone = (coll->nFwdSent == coll->nFwdRecv);
+exit:
+  return ncclSuccess;
+}
+
+// Sends the collective message through all connections associated with this link (with the exception of the one
+// the message came from, if any).  Always returns ncclSuccess; per-connection send failures are skipped.
+static ncclResult_t rasLinkSendCollReq(struct rasLink* link, struct rasCollective* coll,
+                                       const struct rasCollRequest* req, size_t reqLen, int fromConnIdx) {
+  for (int i = 0; i < link->nConns; i++) {
+    struct rasLinkConn* linkConn = link->conns+i;
+    if (linkConn->connIdx != -1 && linkConn->connIdx != fromConnIdx) {
+      struct rasConnection* conn = rasConns+linkConn->connIdx;
+      if (!conn->linkFlag) {
+        // We send collective messages through fully established and operational connections only.
+        if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY && !conn->experiencingDelays) {
+          if (rasConnSendCollReq(conn, req, reqLen) == ncclSuccess && coll != nullptr)
+            coll->fwdConns[coll->nFwdSent++] = linkConn->connIdx; // Record where the request was forwarded to.
+        } // if (conn->sockIdx != -1 && RAS_SOCK_READY)
+        conn->linkFlag = true; // Mark as visited so that it is not considered again until the flags are reset.
+      } // if (!conn->linkFlag)
+    } // if (linkConn->connIdx != -1 && linkConn->connIdx != fromConnIdx)
+  } // for (i)
+
+  return ncclSuccess;
+}
+
+// Sends a collective message down a particular connection.  Fails only if the message cannot be allocated.
+static ncclResult_t rasConnSendCollReq(struct rasConnection* conn, const struct rasCollRequest* req, size_t reqLen) {
+  struct rasMsg* msg = nullptr;
+  int msgLen = rasMsgLength(RAS_MSG_COLLREQ) + reqLen;
+
+  NCCLCHECK(rasMsgAlloc(&msg, msgLen));
+  msg->type = RAS_MSG_COLLREQ;
+  memcpy(&msg->collReq, req, reqLen); // Copies the first reqLen bytes of the request into the message.
+
+  rasConnEnqueueMsg(conn, msg, msgLen); // NOTE(review): presumably takes ownership of msg -- confirm.
+
+  return ncclSuccess;
+}
+
+// Handles the RAS_MSG_COLLREQ collective message request on the receiver side.  Primarily deals with duplicates and
+// re-broadcasts the message to local peers, though in case of a very limited RAS network it might be done right away,
+// in which case it can immediately send the response.
+ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock) {
+  bool allDone = false;
+  int collIdx = -1; // Filled in by rasNetSendCollReq() for non-broadcast collectives.
+  assert(sock->connIdx != -1);
+
+  // First check if we've already handled this request (through another connection).
+  for (int i = 0; i < nRasCollHistory; i++) {
+    // In principle we can use i to index the array but we convert it so that we check the most recent entries first.
+    int collHistIdx = (rasCollHistNextIdx + COLL_HISTORY_SIZE - 1 - i) % COLL_HISTORY_SIZE;
+    if (memcmp(&msg->collReq.rootAddr, &rasCollHistory[collHistIdx].rootAddr, sizeof(msg->collReq.rootAddr)) == 0 &&
+        msg->collReq.rootId == rasCollHistory[collHistIdx].rootId) {
+      if (msg->collReq.type >= RAS_COLL_CONNS) {
+        // Send an empty response so that the sender can account for it.  The non-empty response has already been
+        // sent through the connection that we received the request through first.
+        NCCLCHECK(rasConnSendCollResp(rasConns+sock->connIdx, &msg->collReq.rootAddr, msg->collReq.rootId,
+                                      /*peers*/nullptr, /*nPeers*/0, /*data*/nullptr, /*nData*/0, /*nLegTimeouts*/0));
+      }
+      goto exit; // Duplicate (a broadcast needs no response at all): nothing more to do.
+    }
+  } // for (i)
+
+  if (msg->collReq.type >= RAS_COLL_CONNS) {
+    // Check if we're currently handling this collective request.
+    for (int i = 0; i < nRasCollectives; i++) {
+      struct rasCollective* coll = rasCollectives+i;
+      if (coll->type != RAS_MSG_NONE &&
+          memcmp(&msg->collReq.rootAddr, &coll->rootAddr, sizeof(msg->collReq.rootAddr)) == 0 &&
+          msg->collReq.rootId == coll->rootId) {
+        assert(msg->collReq.type == coll->type);
+
+        // Send an empty response so that the sender can account for it.  The non-empty response will be
+        // sent through the connection that we received the request through first.
+        NCCLCHECK(rasConnSendCollResp(rasConns+sock->connIdx, &msg->collReq.rootAddr, msg->collReq.rootId,
+                                      /*peers*/nullptr, /*nPeers*/0, /*data*/nullptr, /*nData*/0, /*nLegTimeouts*/0));
+        goto exit;
+      } // if match
+    } // for (i)
+  } // if (msg->collReq.type >= RAS_COLL_CONNS)
+
+  // Re-broadcast the message to my peers (minus the one it came from) and handle it locally.
+  NCCLCHECK(rasNetSendCollReq(&msg->collReq, rasCollDataLength(msg->collReq.type), &allDone, &collIdx, sock->connIdx));
+
+  if (msg->collReq.type >= RAS_COLL_CONNS && allDone) {
+    assert(collIdx != -1);
+    // We are a leaf process -- send the response right away.  This can probably trigger only for the case of a total
+    // of two peers, and hence just one RAS connection, or during communication issues, because normally every peer
+    // has more than one connection so there should always be _some_ other peer to forward the request to.
+    NCCLCHECK(rasCollReadyResp(rasCollectives+collIdx));
+  }
+exit:
+  return ncclSuccess;
+}
+
+// Sends a collective response back to the process we received the collective request from.
+// Invoked when we are finished waiting for the collective responses from other peers (i.e., either there weren't
+// any peers (unlikely), the peers sent their responses (likely), or we timed out).
+static ncclResult_t rasCollReadyResp(struct rasCollective* coll) {
+  if (coll->fromConnIdx != -1) {
+    // For remotely-initiated collectives, send the response back.  NOTE(review): a failure here skips rasCollFree().
+    NCCLCHECK(rasConnSendCollResp(rasConns+coll->fromConnIdx, &coll->rootAddr, coll->rootId,
+                                  coll->peers, coll->nPeers, coll->data, coll->nData, coll->nLegTimeouts));
+
+    // Add the identifying info to the collective message history.
+    nRasCollHistory = std::min(nRasCollHistory+1, COLL_HISTORY_SIZE);
+    memcpy(&rasCollHistory[rasCollHistNextIdx].rootAddr, &coll->rootAddr,
+           sizeof(rasCollHistory[rasCollHistNextIdx].rootAddr));
+    rasCollHistory[rasCollHistNextIdx].rootId = coll->rootId;
+    rasCollHistNextIdx = (rasCollHistNextIdx + 1) % COLL_HISTORY_SIZE; // Wraps around, overwriting old entries.
+
+    rasCollFree(coll);
+  } else {
+    // For locally-initiated collectives, invoke the client code again (which will release it, once finished).
+    NCCLCHECK(rasClientResume(coll));
+  }
+  return ncclSuccess;
+}
+
+// Sends a collective response via the connection we originally received the request from.  The message should be
+// a cumulative response from this process and all the processes that we forwarded the request to.
+static ncclResult_t rasConnSendCollResp(struct rasConnection* conn,
+                                        const union ncclSocketAddress* rootAddr, uint64_t rootId,
+                                        const union ncclSocketAddress* peers, int nPeers,
+                                        const char* data, int nData, int nLegTimeouts) {
+  struct rasMsg* msg = nullptr;
+  int msgLen = rasMsgLength(RAS_MSG_COLLRESP) + nPeers*sizeof(*peers);
+  int dataOffset = 0;
+
+  if (nData > 0) {
+    ALIGN_SIZE(msgLen, alignof(int64_t)); // NOTE(review): presumably rounds msgLen up in place -- confirm.
+    dataOffset = msgLen; // The data blob is stored after the peers array, starting at this offset.
+    msgLen += nData;
+  }
+
+  NCCLCHECK(rasMsgAlloc(&msg, msgLen));
+  msg->type = RAS_MSG_COLLRESP;
+  memcpy(&msg->collResp.rootAddr, rootAddr, sizeof(msg->collResp.rootAddr));
+  msg->collResp.rootId = rootId;
+  msg->collResp.nLegTimeouts = nLegTimeouts;
+  msg->collResp.nPeers = nPeers;
+  msg->collResp.nData = nData;
+  if (nPeers) // peers may be nullptr when nPeers is 0 (the "empty response" case).
+    memcpy(msg->collResp.peers, peers, nPeers*sizeof(*msg->collResp.peers));
+  if (nData) // Likewise, data may be nullptr when nData is 0.
+    memcpy(((char*)msg)+dataOffset, data, nData);
+
+  rasConnEnqueueMsg(conn, msg, msgLen);
+
+  return ncclSuccess;
+}
+
+// Handles a collective response (RAS_MSG_COLLRESP) received on sock.  Finds the ongoing rasCollective matching the
+// message's rootAddr/rootId and merges the response into the accumulated data.  If all the responses have been
+// accounted for (nFwdRecv == nFwdSent), passes the collective on to rasCollReadyResp.
+ncclResult_t rasMsgHandleCollResp(struct rasMsg* msg, struct rasSocket* sock) {
+  int collIdx;
+  struct rasCollective* coll = nullptr;
+  char line[SOCKET_NAME_MAXLEN+1];
+
+  for (collIdx = 0; collIdx < nRasCollectives; collIdx++) {
+    coll = rasCollectives+collIdx;
+    if (coll->type != RAS_MSG_NONE && // RAS_MSG_NONE marks entries released by rasCollFree.
+        memcmp(&msg->collResp.rootAddr, &coll->rootAddr, sizeof(msg->collResp.rootAddr)) == 0 &&
+        msg->collResp.rootId == coll->rootId)
+      break;
+  }
+  if (collIdx == nRasCollectives) { // No match: log it and drop the response (still returns ncclSuccess).
+    INFO(NCCL_RAS, "RAS failed to find a matching ongoing collective for response %s:%ld from %s!",
+         ncclSocketToString(&msg->collResp.rootAddr, line), msg->collResp.rootId,
+         ncclSocketToString(&sock->sock.addr, rasLine));
+    goto exit;
+  }
+
+  coll->nLegTimeouts += msg->collResp.nLegTimeouts; // Timeouts reported in the response are added to ours.
+  assert(sock->connIdx != -1);
+  // Account for the received response in our collective operation tracking: clear the first matching fwdConns entry.
+  for (int i = 0; i < coll->nFwdSent; i++) {
+    if (coll->fwdConns[i] == sock->connIdx) {
+      coll->fwdConns[i] = -1;
+      break;
+    }
+  }
+  coll->nFwdRecv++; // NOTE(review): also bumped if no fwdConns entry matched (late response after a purge?) -- confirm.
+  if (msg->collResp.nData > 0) {
+    // Collective-specific merging of the response into locally accumulated data.
+    if (coll->type == RAS_COLL_CONNS)
+      NCCLCHECK(rasCollConnsMerge(coll, msg));
+    else if (coll->type == RAS_COLL_COMMS)
+      NCCLCHECK(rasCollCommsMerge(coll, msg));
+  }
+  // We merge the peers after merging the data, so that the data merge function can rely on peers being unchanged.
+  if (msg->collResp.nPeers > 0) {
+    NCCLCHECK(ncclRealloc(&coll->peers, coll->nPeers, coll->nPeers + msg->collResp.nPeers));
+    memcpy(coll->peers+coll->nPeers, msg->collResp.peers, msg->collResp.nPeers * sizeof(*coll->peers));
+    coll->nPeers += msg->collResp.nPeers;
+  }
+
+  // If we received all the data we were waiting for, send our response back.
+  if (coll->nFwdSent == coll->nFwdRecv)
+    NCCLCHECK(rasCollReadyResp(coll));
+exit:
+  return ncclSuccess;
+}
+
+// Detaches connection connIdx from all the ongoing collectives.  Called when the connection is experiencing a delay
+// or is being terminated.
+void rasCollsPurgeConn(int connIdx) {
+  for (int collIdx = 0; collIdx < nRasCollectives; collIdx++) {
+    struct rasCollective* coll = &rasCollectives[collIdx];
+    char line[SOCKET_NAME_MAXLEN+1];
+    if (coll->type == RAS_MSG_NONE)
+      continue; // Not an ongoing collective.
+    if (coll->fromConnIdx == connIdx) { // The request itself came from this connection: drop the collective.
+      INFO(NCCL_RAS, "RAS purging collective %s:%ld because it comes from %s",
+           ncclSocketToString(&coll->rootAddr, line), coll->rootId,
+           ncclSocketToString(&rasConns[connIdx].addr, rasLine));
+      rasCollFree(coll);
+      continue;
+    }
+    // Otherwise, if we forwarded the request over this connection, count that leg as timed out (first match only).
+    int fwdIdx = 0;
+    while (fwdIdx < coll->nFwdSent && coll->fwdConns[fwdIdx] != connIdx)
+      fwdIdx++;
+    if (fwdIdx == coll->nFwdSent) continue; // We are not waiting for this connection.
+    coll->fwdConns[fwdIdx] = -1;
+    coll->nFwdRecv++;
+    coll->nLegTimeouts++;
+    INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld "
+         "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)",
+         ncclSocketToString(&rasConns[connIdx].addr, rasLine), ncclSocketToString(&coll->rootAddr, line),
+         coll->rootId, coll->nFwdSent, coll->nFwdRecv, coll->nLegTimeouts);
+    if (coll->nFwdSent == coll->nFwdRecv)
+      (void)rasCollReadyResp(coll);
+  }
+}
+
+// Releases the buffers owned by a rasCollective entry and marks the entry as unused (type RAS_MSG_NONE).
+void rasCollFree(struct rasCollective* coll) {
+  free(coll->data);
+  free(coll->peers);
+  free(coll->fwdConns);
+  coll->data = nullptr;
+  coll->peers = nullptr;
+  coll->fwdConns = nullptr;
+  coll->type = RAS_MSG_NONE;
+  coll->fromConnIdx = -1;
+}
+
+// Invoked from the main RAS thread loop to handle timeouts of the collectives; lowers *nextWakeup as needed.
+// We obviously want to have a reasonable *total* timeout that the RAS client can rely on, but we don't have strict
+// global coordination.  So we have, in effect, two timeouts: soft (5s) and hard (10s).  Soft equals the keep-alive
+// timeout.
+// When sending collective requests, we skip any connections that are experiencing delays.  After the 5s timeout, we
+// check again the status of all outstanding connections and if any is now delayed, we give up on it.
+// That works fine for directly observable delays, but if the problematic connection is further away from us, all
+// we can do is trust that the other peers will "do the right thing soon".  However, if there is a cascade of
+// problematic connections, they could still exceed the 5s total.  So after 10s we give up waiting no matter what
+// and send back whatever we have.  Unfortunately, the peer that the RAS client is connected to will in all likelihood
+// time out first, so at that point any delayed responses that eventually arrive are likely to be too late...
+void rasCollsHandleTimeouts(int64_t now, int64_t* nextWakeup) {
+  for (int collIdx = 0; collIdx < nRasCollectives; collIdx++) {
+    struct rasCollective* coll = rasCollectives+collIdx;
+    if (coll->type == RAS_MSG_NONE || coll->timeout == 0)
+      continue; // Unused entry, or a collective with a zero timeout.
+
+    if (now - coll->startTime > coll->timeout) {
+      // We've exceeded the leg timeout.  For all outstanding responses, check their connections.
+      if (!coll->timeoutWarned) { // Log the warning only once per collective.
+        INFO(NCCL_RAS, "RAS collective %s:%ld timeout warning (%lds) -- %d responses missing",
+             ncclSocketToString(&coll->rootAddr, rasLine), coll->rootId,
+             (now - coll->startTime) / CLOCK_UNITS_PER_SEC, coll->nFwdSent - coll->nFwdRecv);
+        coll->timeoutWarned = true;
+      }
+      for (int i = 0; i < coll->nFwdSent; i++) {
+        if (coll->fwdConns[i] != -1) {
+          struct rasConnection* conn = rasConns+coll->fwdConns[i];
+          char line[SOCKET_NAME_MAXLEN+1];
+          if (!conn->experiencingDelays && conn->sockIdx != -1) {
+            struct rasSocket* sock = rasSockets+conn->sockIdx;
+            // Ensure that the connection is fully established and operational, and that the socket hasn't been
+            // re-created during the handling of the collective (which would suggest that the request may have been
+            // lost).
+            if (sock->status == RAS_SOCK_READY && sock->createTime < coll->startTime)
+              continue; // This connection still looks healthy; keep waiting for its response.
+          }
+          // In all other cases we declare a timeout so that we can (hopefully) recover.
+          INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld "
+               "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)",
+               ncclSocketToString(&conn->addr, rasLine), ncclSocketToString(&coll->rootAddr, line),
+               coll->rootId, coll->nFwdSent, coll->nFwdRecv, coll->nLegTimeouts);
+          coll->fwdConns[i] = -1;
+          coll->nFwdRecv++;
+          coll->nLegTimeouts++;
+        } // if (coll->fwdConns[i] != -1)
+      } // for (i)
+      if (coll->nFwdSent == coll->nFwdRecv) {
+        (void)rasCollReadyResp(coll);
+      } else {
+        // At least some of the delays are *not* due to this process' connections experiencing delays, i.e., they
+        // must be due to delays at other processes.  Presumably those processes will give up waiting soon and the
+        // (incomplete) responses will arrive shortly, so we should wait a little longer.
+        if (now - coll->startTime > coll->timeout + RAS_COLLECTIVE_EXTRA_TIMEOUT) {
+          // We've exceeded even the longer timeout, which is unexpected.  Try to return whatever we have (though
+          // the originator of the collective, if it's not us, may have timed out already anyway).
+          INFO(NCCL_RAS, "RAS collective %s:%ld timeout error (%lds) -- giving up on %d missing responses",
+               ncclSocketToString(&coll->rootAddr, rasLine), coll->rootId,
+               (now - coll->startTime) / CLOCK_UNITS_PER_SEC, coll->nFwdSent - coll->nFwdRecv);
+          coll->nLegTimeouts += coll->nFwdSent - coll->nFwdRecv; // Every missing response counts as a leg timeout.
+          coll->nFwdRecv = coll->nFwdSent;
+          (void)rasCollReadyResp(coll);
+        } else {
+          *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout+RAS_COLLECTIVE_EXTRA_TIMEOUT);
+        }
+      } // coll->nFwdRecv < coll->nFwdSent
+    } else {
+      *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout); // Wake up when the leg timeout expires.
+    }
+  } // for (collIdx)
+}
+
+
+/////////////////////////////////////////////////////////////////////////
+// Functions related to the handling of the RAS_COLL_CONNS collective. //
+/////////////////////////////////////////////////////////////////////////
+
+// Initializes the accumulated RAS_COLL_CONNS data (*pData, *pNData bytes) with just the local data for now.
+// We keep reduced travel time statistics (min/max/sum/count) over all in-use connections that have samples, plus a
+// per-connection record for every such connection with a negative min travel time (ideally that shouldn't happen,
+// but the system clocks may not be perfectly in sync).  Both passes below must use the same connection filter.
+static ncclResult_t rasCollConnsInit(char** pData, int* pNData) {
+  struct rasCollConns connsData = {.travelTimeMin = INT64_MAX, .travelTimeMax = INT64_MIN};
+  struct rasCollConns* pConnsData;
+
+  // Update the statistical data first and in the process also calculate how much connection-specific space we
+  // will need.
+  for (int i = 0; i < nRasConns; i++) {
+    struct rasConnection* conn = rasConns+i;
+    if (conn->inUse && conn->travelTimeCount > 0) {
+      if (connsData.travelTimeMin > conn->travelTimeMin)
+        connsData.travelTimeMin = conn->travelTimeMin;
+      if (connsData.travelTimeMax < conn->travelTimeMax)
+        connsData.travelTimeMax = conn->travelTimeMax;
+      connsData.travelTimeSum += conn->travelTimeSum;
+      connsData.travelTimeCount += conn->travelTimeCount;
+      connsData.nConns++;
+      if (conn->travelTimeMin < 0)
+        connsData.nNegativeMins++;
+    }
+  }
+
+  *pNData = sizeof(connsData) + connsData.nNegativeMins*sizeof(*connsData.negativeMins);
+  NCCLCHECK(ncclCalloc(pData, *pNData));
+  pConnsData = (struct rasCollConns*)*pData;
+  memcpy(pConnsData, &connsData, sizeof(*pConnsData));
+  if (connsData.nNegativeMins > 0) {
+    for (int i = 0, negMinsIdx = 0; i < nRasConns; i++) {
+      struct rasConnection* conn = rasConns+i;
+      if (conn->inUse && conn->travelTimeCount > 0 && conn->travelTimeMin < 0) { // Same filter as when counting.
+        struct rasCollConns::negativeMin* negativeMin = pConnsData->negativeMins+negMinsIdx;
+        memcpy(&negativeMin->source, &rasNetListeningSocket.addr, sizeof(negativeMin->source));
+        memcpy(&negativeMin->dest, &conn->addr, sizeof(negativeMin->dest));
+        negativeMin->travelTimeMin = conn->travelTimeMin;
+        negMinsIdx++;
+      }
+      assert(negMinsIdx <= connsData.nNegativeMins);
+    }
+  }
+
+  return ncclSuccess;
+}
+
+// Merges an incoming RAS_COLL_CONNS collective response message (msg) into the data accumulated locally in coll.
+static ncclResult_t rasCollConnsMerge(struct rasCollective* coll, struct rasMsg* msg) {
+  // The collective-specific data follows the fixed part of the message and the peers array, aligned for int64_t.
+  int payloadOffset = rasMsgLength(RAS_MSG_COLLRESP) + msg->collResp.nPeers*sizeof(*msg->collResp.peers);
+  ALIGN_SIZE(payloadOffset, alignof(int64_t));
+
+  struct rasCollConns* incoming = (struct rasCollConns*)(((char*)msg) + payloadOffset);
+  struct rasCollConns* local = (struct rasCollConns*)coll->data;
+
+  // Fold the travel time statistics into ours.
+  if (incoming->travelTimeMin < local->travelTimeMin)
+    local->travelTimeMin = incoming->travelTimeMin;
+  if (incoming->travelTimeMax > local->travelTimeMax)
+    local->travelTimeMax = incoming->travelTimeMax;
+  local->nConns += incoming->nConns;
+  local->travelTimeCount += incoming->travelTimeCount;
+  local->travelTimeSum += incoming->travelTimeSum;
+
+  // Nothing more to do unless the response carries info about negative minimums.
+  if (incoming->nNegativeMins <= 0)
+    return ncclSuccess;
+
+  // Grow our buffer and copy the incoming negative minimums to its old end (coll->nData).
+  int grownNData = sizeof(*local) +
+    (local->nNegativeMins+incoming->nNegativeMins) * sizeof(*local->negativeMins);
+  NCCLCHECK(ncclRealloc(&coll->data, coll->nData, grownNData));
+  local = (struct rasCollConns*)coll->data; // Refresh the pointer after the reallocation.
+  memcpy(coll->data+coll->nData, incoming->negativeMins,
+         incoming->nNegativeMins * sizeof(*local->negativeMins));
+  local->nNegativeMins += incoming->nNegativeMins;
+  coll->nData = grownNData;
+  return ncclSuccess;
+}
+
+
+/////////////////////////////////////////////////////////////////////////
+// Functions related to the handling of the RAS_COLL_COMMS collective. //
+/////////////////////////////////////////////////////////////////////////
+
+// Initializes the accumulated data (*pData, *pNData bytes) with just the local data for now.
+// For this particular collective, we keep for every communicator information about every rank, to help identify
+// the missing ones and the discrepancies between the ones that did respond.
+static ncclResult_t rasCollCommsInit(char** pData, int* pNData) {
+  struct rasCollComms* commsData;
+  int nComms = 0, nRanks = 0;
+  std::lock_guard<std::mutex> lock(ncclCommsMutex); // Held for the remainder of the function.
+
+  // Start by counting the communicators so that we know how much space to allocate.
+  // We also need to sort the comms array, to make the subsequent merging easier, both between the ranks (in case
+  // of multiple GPUs per process) and between the peers.
+  if (!ncclCommsSorted) {
+    qsort(ncclComms, nNcclComms, sizeof(*ncclComms), &ncclCommsCompare);
+    ncclCommsSorted = true;
+  }
+  for (int i = 0; i < nNcclComms; i++) {
+    if (ncclComms[i] == nullptr) // nullptr's are always at the end after sorting.
+      break;
+    if (i == 0) {
+      nComms = 1;
+    } else if (ncclComms[i]->commHash != ncclComms[i-1]->commHash) { // A new commHash starts a new communicator.
+      nComms++;
+    }
+    nRanks++;
+  }
+
+  // struct rasCollComms has nested variable-length arrays, which makes the size calculation and subsequent
+  // pointer manipulations somewhat unwieldy...
+  *pNData = sizeof(*commsData) + nComms * sizeof(*commsData->comms) + nRanks * sizeof(*commsData->comms[0].ranks);
+  NCCLCHECK(ncclCalloc(pData, *pNData));
+  commsData = (struct rasCollComms*)*pData;
+  commsData->nComms = nComms;
+
+  // comm points at the space in the accumulated data where the info about the current communicator is to be stored.
+  struct rasCollComms::comm* comm = commsData->comms;
+  for (int i = 0; i < nNcclComms; i++) {
+    struct rasCollComms::comm::rank* rank;
+    ncclResult_t asyncError;
+    if (ncclComms[i] == nullptr)
+      break;
+    if (i == 0 || ncclComms[i]->commHash != ncclComms[i-1]->commHash) {
+      if (i > 0) // Step over the previous communicator's header and its ranks to reach the next free slot.
+        comm = (struct rasCollComms::comm*)(((char*)(comm+1)) + comm->nRanks * sizeof(*comm->ranks));
+      comm->commHash = ncclComms[i]->commHash;
+      comm->commNRanks = ncclComms[i]->nRanks;
+      comm->nRanks = 0;
+    } else if (ncclComms[i]->nRanks != ncclComms[i-1]->nRanks) {
+      INFO(NCCL_RAS, "RAS encountered inconsistent communicator data: size %d != %d -- "
+           "possible commHash collision (0x%lx)", ncclComms[i-1]->nRanks, ncclComms[i]->nRanks, comm->commHash);
+      continue; // Short of failing, the best we can do is skip...
+    } else if (ncclComms[i]->rank == ncclComms[i-1]->rank) {
+      INFO(NCCL_RAS, "RAS encountered duplicate data for rank %d -- possible commHash collision (0x%lx)",
+           ncclComms[i]->rank, comm->commHash);
+      continue; // Short of failing, the best we can do is skip...
+    }
+    if (comm->nRanks == comm->commNRanks) {
+      INFO(NCCL_RAS,
+           "RAS encountered more ranks than the communicator size (%d) -- possible commHash collision (0x%lx)",
+           comm->commNRanks, comm->commHash);
+      continue; // Short of failing, the best we can do is skip...
+    }
+    rank = comm->ranks+comm->nRanks;
+    rank->commRank = ncclComms[i]->rank;
+    // rasNetSendCollReq initializes coll->peers[0] to our rasNetListeningSocket.addr, so peerIdx is initially
+    // always 0.  It will increase after we send this response back to the peer we got the request from.
+    rank->peerIdx = 0;
+    rank->collOpCount = ncclComms[i]->collOpCount;
+    rank->status.initState = ncclComms[i]->initState;
+    if (ncclCommGetAsyncError(ncclComms[i], &asyncError) == ncclSuccess)
+      rank->status.asyncError = asyncError; // Otherwise the field is left as allocated by ncclCalloc.
+    rank->status.finalizeCalled = (ncclComms[i]->finalizeCalled != 0);
+    rank->status.destroyFlag = (ncclComms[i]->destroyFlag != 0);
+    rank->status.abortFlag = (__atomic_load_n(ncclComms[i]->abortFlag, __ATOMIC_ACQUIRE) != 0);
+    rank->cudaDev = ncclComms[i]->cudaDev;
+    rank->nvmlDev = ncclComms[i]->nvmlDev;
+    comm->nRanks++;
+  }
+  assert(nComms == 0 || ((char*)(comm->ranks+comm->nRanks)) - (char*)commsData <= *pNData);
+
+  return ncclSuccess;
+}
+
+// Merges an incoming RAS_COLL_COMMS response into coll->data: a sorted merge by commHash and, within it, by commRank.
+static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* msg) {
+  struct rasCollComms* collData;
+  struct rasCollComms* msgData;
+  int dataOffset = rasMsgLength(RAS_MSG_COLLRESP) + msg->collResp.nPeers*sizeof(*msg->collResp.peers);
+  ALIGN_SIZE(dataOffset, alignof(int64_t));
+
+  msgData = (struct rasCollComms*)(((char*)msg) + dataOffset);
+  collData = (struct rasCollComms*)coll->data;
+
+  if (msgData->nComms > 0) {
+    struct rasCollComms* newData = nullptr;
+
+    // Allocate the new buffer pessimistically (sized as the sum of the two old ones).
+    NCCLCHECK(ncclCalloc((char**)&newData, coll->nData + msg->collResp.nData));
+    struct rasCollComms::comm* collComm = collData->comms;
+    struct rasCollComms::comm* msgComm = msgData->comms;
+    struct rasCollComms::comm* newComm = newData->comms;
+
+    for (int collIdx = 0, msgIdx = 0; collIdx < collData->nComms || msgIdx < msgData->nComms; newData->nComms++) {
+      int cmp;
+      if (collIdx < collData->nComms && msgIdx < msgData->nComms)
+        cmp = (collComm->commHash < msgComm->commHash ? -1 : (collComm->commHash > msgComm->commHash ? 1 : 0));
+      else
+        cmp = (collIdx < collData->nComms ? -1 : 1);
+
+      if (cmp == 0 && collComm->commNRanks != msgComm->commNRanks) {
+        INFO(NCCL_RAS, "RAS encountered inconsistent communicator data: size %d != %d -- "
+             "possible commHash collision (0x%lx)", collComm->commNRanks, msgComm->commNRanks, collComm->commHash);
+        cmp = (collComm->commNRanks < msgComm->commNRanks ? -1 : 1);
+        // We try to preserve both separately, although the input data might already be messed up anyway...
+      }
+
+      if (cmp == 0) {
+        // Merge the comms.
+        newComm->commHash = collComm->commHash;
+        newComm->commNRanks = collComm->commNRanks;
+        if (collComm->nRanks + msgComm->nRanks > collComm->commNRanks) {
+          INFO(NCCL_RAS,
+               "RAS encountered more ranks (%d) than the communicator size (%d) -- possible commHash collision (0x%lx)",
+               collComm->nRanks + msgComm->nRanks, newComm->commNRanks, newComm->commHash);
+          // We'll skip the extras in the loop below.
+        }
+        // Merge the ranks.  newComm->nRanks (zero in the freshly calloc'ed buffer) doubles as the output index, so
+        // that it always reflects the number of ranks actually stored, even when we truncate the extras or drop
+        // duplicates; the pointer arithmetic after the loop relies on it being accurate.
+        for (int collRankIdx = 0, msgRankIdx = 0;
+             collRankIdx < collComm->nRanks || msgRankIdx < msgComm->nRanks;
+             newComm->nRanks++) {
+          int cmpRank;
+          if (newComm->nRanks == newComm->commNRanks)
+            break; // Short of failing, the best we can do is skip...
+          if (collRankIdx < collComm->nRanks && msgRankIdx < msgComm->nRanks)
+            cmpRank = (collComm->ranks[collRankIdx].commRank < msgComm->ranks[msgRankIdx].commRank ? -1 :
+                       (collComm->ranks[collRankIdx].commRank > msgComm->ranks[msgRankIdx].commRank ? 1 : 0));
+          else
+            cmpRank = (collRankIdx < collComm->nRanks ? -1 : 1);
+
+          // There shouldn't be any overlaps in ranks between different sources.
+          if (cmpRank == 0) {
+            INFO(NCCL_RAS, "RAS encountered duplicate data for rank %d -- possible commHash collision (0x%lx)",
+                 collComm->ranks[collRankIdx].commRank, newComm->commHash);
+            msgRankIdx++; // Short of failing, the best we can do is skip...
+          }
+          memcpy(newComm->ranks+newComm->nRanks, (cmpRank <= 0 ? collComm->ranks+collRankIdx++ :
+                                                  msgComm->ranks+msgRankIdx++), sizeof(*newComm->ranks));
+          if (cmpRank > 0) {
+            // peerIdx values from msgComm need to shift after merge.
+            newComm->ranks[newComm->nRanks].peerIdx += coll->nPeers;
+          }
+        } // for (ranks)
+        newComm = (struct rasCollComms::comm*)(((char*)(newComm+1)) + newComm->nRanks * sizeof(*newComm->ranks));
+        collComm = (struct rasCollComms::comm*)(((char*)(collComm+1)) + collComm->nRanks * sizeof(*collComm->ranks));
+        collIdx++;
+        msgComm = (struct rasCollComms::comm*)(((char*)(msgComm+1)) + msgComm->nRanks * sizeof(*msgComm->ranks));
+        msgIdx++;
+      } else if (cmp < 0) {
+        // Copy from collComm.
+        int commSize = sizeof(*collComm) + collComm->nRanks * sizeof(*collComm->ranks);
+        memcpy(newComm, collComm, commSize);
+        newComm = (struct rasCollComms::comm*)(((char*)(newComm)) + commSize);
+        collComm = (struct rasCollComms::comm*)(((char*)(collComm)) + commSize);
+        collIdx++;
+      } else { // cmp > 0
+        // Copy from msgComm.
+        int commSize = sizeof(*msgComm) + msgComm->nRanks * sizeof(*msgComm->ranks);
+        memcpy(newComm, msgComm, commSize);
+        for (int i = 0; i < newComm->nRanks; i++) {
+          // peerIdx values from msgComm need to shift after merge.
+          newComm->ranks[i].peerIdx += coll->nPeers;
+        }
+        newComm = (struct rasCollComms::comm*)(((char*)(newComm)) + commSize);
+        msgComm = (struct rasCollComms::comm*)(((char*)(msgComm)) + commSize);
+        msgIdx++;
+      } // cmp > 0
+    } // for (collIdx and msgIdx)
+
+    free(coll->data);
+    coll->data = (char*)newData;
+    // newComm points at the next element beyond the last one -- exactly what we need.
+    coll->nData = ((char*)newComm) - (char*)newData;
+  } // if (msgData->nComms > 0)
+
+  return ncclSuccess;
+}
+
+// qsort callback for the ncclComms array: orders by commHash, then by rank; nullptr entries sort last.
+static int ncclCommsCompare(const void* p1, const void* p2) {
+  const ncclComm* lhs = *(const ncclComm* const*)p1;
+  const ncclComm* rhs = *(const ncclComm* const*)p2;
+
+  // A nullptr entry compares greater than any communicator; two nullptr entries compare equal.
+  if (lhs == nullptr)
+    return (rhs == nullptr ? 0 : 1);
+  if (rhs == nullptr)
+    return -1;
+
+  if (lhs->commHash != rhs->commHash)
+    return (lhs->commHash < rhs->commHash ? -1 : 1);
+  return (lhs->rank < rhs->rank ? -1 : (lhs->rank > rhs->rank ? 1 : 0));
+}
@@ -0,0 +1,960 @@
+/*************************************************************************
+ * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NDEBUG // Comment out during development only!
+#include <cassert>
+
+#include "alloc.h"
+#include "checks.h"
+#include "comm.h"
+#include "nccl.h"
+#include "ras_internal.h"
+
+
+// All the known peer NCCL processes. The array is sorted by addr to ensure locality (within a node and hopefully
+// also within a DC).  The array may grow over time and it *includes* dead peers.
+struct rasPeerInfo* rasPeers;
+int nRasPeers; // The number of entries in rasPeers.
+// Hash of the rasPeers array, for figuring out when to sync with a remote peer.
+uint64_t rasPeersHash;
+// Index of this process within the rasPeers array (initially -1; may change over time as the array grows).
+static int myPeerIdx = -1;
+
+// Addresses of all the dead peers, sorted.  In principle we could instead have a flag in rasPeerInfo for this,
+// but we expect rasPeers to be largely static (and large at scale!) and rasDeadPeers to be fairly dynamic and
+// much smaller, so we prefer to keep the dead info separately so that we don't end up sending the possibly large
+// rasPeerInfo array around all the time.
+union ncclSocketAddress* rasDeadPeers;
+// The number of dead peers.
+int nRasDeadPeers;
+// The array size (may be larger than nRasDeadPeers).
+static int rasDeadPeersSize;
+// Hash of the rasDeadPeers array, for figuring out when to sync with a remote peer.
+uint64_t rasDeadPeersHash;
+
+static ncclResult_t rasRanksConvertToPeers(struct rasRankInit* ranks, int nranks,
+                                           struct rasPeerInfo** rankPeers, int *nRankPeers, int* newNRasPeers);
+static ncclResult_t rasPeersUpdate(struct rasPeerInfo* rankPeers, int* nRankPeers, int newNRasPeers = -1);
+
+static ncclResult_t rasNetUpdatePeers(const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers,
+                                      struct rasRankInit* ranks = nullptr, int nranks = 0, int fromConnIdx = -1);
+static ncclResult_t rasLinkPropagateUpdate(struct rasLink* link, const struct rasPeerInfo* newPeers, int nNewPeers,
+                                           bool updateDeadPeers, struct rasRankInit* ranks, int nranks,
+                                           int fromConnIdx);
+static ncclResult_t rasConnPropagateUpdate(struct rasConnection* conn, const struct rasPeerInfo* newPeers,
+                                           int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks);
+ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock);
+
+static ncclResult_t rasLinkReinitConns(struct rasLink* link);
+
+static ncclResult_t rasDeadPeersUpdate(union ncclSocketAddress* updatePeers, int* nUpdatePeers);
+static ncclResult_t getNewDeadEntry(union ncclSocketAddress** pAddr);
+
+static int rasAddrRankInitCompare(const void* k, const void* e); // NOTE(review): k/e presumably key/element -- confirm.
+static int rasAddrPeerInfoCompare(const void* k, const void* e);
+static int rasRanksCompare(const void* e1, const void* e2); // qsort callback for arrays of rasRankInit.
+
+static void rasPeersDump();
+static void rasDeadPeersDump();
+static char* rasPeerDump(const struct rasPeerInfo* peer, char* result, size_t nres);
+
+
+/////////////////////////////////////////////////////////////////////////////
+// Functions related to the handling of local RAS_ADD_RANKS notifications. //
+/////////////////////////////////////////////////////////////////////////////
+
+// Handles the local RAS_ADD_RANKS notification: adds the new ranks to the internal list of all RAS peers,
+// reconfigures the RAS network connections, and notifies the peers.  The ranks array is freed before returning,
+// whether the processing succeeded or not.
+ncclResult_t rasLocalHandleAddRanks(struct rasRankInit* ranks, int nranks) {
+  ncclResult_t ret = ncclSuccess;
+  // Filled in by rasRanksConvertToPeers below.
+  struct rasPeerInfo* addedPeers = nullptr;
+  int nAddedPeers;
+  int mergedNRasPeers;
+
+  INFO(NCCL_RAS, "RAS handling local addRanks request (old nRasPeers %d)", nRasPeers);
+
+  // Step 1: convert the input rasRankInit structures into our internal rasPeerInfo.
+  NCCLCHECKGOTO(rasRanksConvertToPeers(ranks, nranks, &addedPeers, &nAddedPeers, &mergedNRasPeers), ret, exit);
+
+  // Step 2: update the local rasPeers.
+  NCCLCHECKGOTO(rasPeersUpdate(addedPeers, &nAddedPeers, mergedNRasPeers), ret, exit);
+
+  INFO(NCCL_RAS, "RAS finished local processing of addRanks request (new nRasPeers %d, nRankPeers %d)",
+       nRasPeers, nAddedPeers);
+  // Dump the peers only if something actually changed and we're the "root".
+  if (nAddedPeers > 0 && memcmp(&ranks[0].addr, &rasNetListeningSocket.addr, sizeof(ranks[0].addr)) == 0)
+    rasPeersDump();
+
+  // Step 3: propagate the changes through our RAS network links.
+  NCCLCHECKGOTO(rasNetUpdatePeers(addedPeers, nAddedPeers, /*updateDeadPeers*/false, ranks, nranks), ret, exit);
+
+exit:
+  // free(nullptr) is a no-op, so addedPeers needs no guard.
+  free(addedPeers);
+  free(ranks);
+  return ret;
+}
+
+// Converts the rasRankInit array into a newly allocated rasPeerInfo array (*rankPeers, *nRankPeers entries).  Skips
+// empty elements (in case of errors), sorts ranks by address/cudaDev, and merges elements with duplicate addresses
+// (multiple CUDA devices per process).  *newNRasPeers is set to how large the merged rasPeers array will need to be.
+static ncclResult_t rasRanksConvertToPeers(struct rasRankInit* ranks, int nranks,
+                                           struct rasPeerInfo** rankPeers, int *nRankPeers, int* newNRasPeers) {
+  ncclResult_t ret = ncclSuccess;
+  int peerIdx, rankPeerIdx;
+
+  // Handy when checking for empty (in case of errors) addresses.
+  union ncclSocketAddress emptyAddr;
+  memset(&emptyAddr, '\0', sizeof(emptyAddr));
+
+  // Begin by sorting the array by address and cudaDev (to match the rasPeers order).
+  qsort(ranks, nranks, sizeof(*ranks), &rasRanksCompare);
+
+  // We over-allocate peers here because to get an accurate count we would need to loop over the ranks first...
+  // nRankPeers will hold the actual count of used elements.
+  *rankPeers = nullptr;
+  NCCLCHECKGOTO(ncclCalloc(rankPeers, nranks), ret, fail);
+
+  peerIdx = rankPeerIdx = 0;
+  *newNRasPeers = nRasPeers;
+  for (int rankIdx = 0; rankIdx < nranks; rankIdx++) {
+    const struct rasRankInit* rank = ranks+rankIdx;
+    struct rasPeerInfo* rankPeer = *rankPeers+rankPeerIdx; // The next unused output entry.
+
+    if (memcmp(&emptyAddr, &rank->addr, sizeof(emptyAddr)) == 0) {
+      // Skip empty rank entries.
+      continue;
+    }
+
+    // First check if the rank doesn't need to be merged into the previous entry in rankPeers
+    // (possible if there are multiple ranks with the same address).
+    if (rankPeerIdx > 0 && memcmp(&rank->addr, &rankPeer[-1].addr, sizeof(rank->addr)) == 0) {
+      // Merge into the previous entry in peers.
+      rankPeer[-1].cudaDevs |= (1UL << rank->cudaDev); // Bitmask with one bit per device index.
+      rankPeer[-1].nvmlDevs |= (1UL << rank->nvmlDev);
+      continue;
+    }
+
+    // Add a new entry to rankPeers.
+    assert(rankPeerIdx < nranks);
+    memcpy(&rankPeer->addr, &rank->addr, sizeof(rankPeer->addr));
+    rankPeer->pid = rank->pid;
+    rankPeer->cudaDevs = (1UL << rank->cudaDev);
+    rankPeer->nvmlDevs = (1UL << rank->nvmlDev);
+    rankPeerIdx++;
+
+    // Also check if there is already an entry with that address in the global rasPeers so that the caller can know how
+    // many more entries will be needed.
+    const struct rasPeerInfo* rasPeer = rasPeers+peerIdx;
+    int cmp = 0;
+    while (peerIdx < nRasPeers) { // peerIdx carries over between ranks: both arrays are scanned in address order.
+      cmp = ncclSocketsCompare(&rank->addr, &rasPeer->addr);
+      if (cmp <= 0)
+        break;
+      peerIdx++;
+      rasPeer++;
+    }
+    if (peerIdx == nRasPeers) {
+      // The current rank is "greater than" all existing peers, so it will need a new entry.  We stay in the loop so
+      // that we don't need to handle the remaining ranks separately.
+      (*newNRasPeers)++;
+      continue;
+    }
+    if (cmp < 0) { // The rank sorts before the current rasPeers entry, so it is not in rasPeers yet.
+      (*newNRasPeers)++;
+    } else {
+      // Duplicates (cmp == 0) between the rank array and the peers array will be merged.
+      assert(rank->pid == rasPeer->pid);
+    }
+  }
+  assert(peerIdx <= nRasPeers);
+  *nRankPeers = rankPeerIdx;
+
+exit:
+  return ret;
+fail:
+  if (*rankPeers) { // Don't hand a partially set up array back to the caller.
+    free(*rankPeers);
+    *rankPeers = nullptr;
+  }
+  goto exit;
+}
+
+// Updates the rasPeers array with the new data.  The new data gets updated in the process as well: any data that
+// wasn't actually new is purged, so as to minimize the amount of data we forward to our peers.
+// rankPeers/nRankPeers -- the incoming entries.  They are walked in lockstep with rasPeers using ncclSocketsCompare,
+// so both arrays are expected to be sorted by address (NOTE(review): confirm against the callers).
+// newNRasPeers -- the size of the merged array if the caller already knows it, or -1 to have it calculated here.
+// Side effects: may replace the global rasPeers array (freeing the old one), and updates nRasPeers, myPeerIdx and
+// rasPeersHash.
+// On a successful return, nRankPeers contains the number of entries that were updated.
+static ncclResult_t rasPeersUpdate(struct rasPeerInfo* rankPeers, int* nRankPeers, int newNRasPeers) {
+  ncclResult_t ret = ncclSuccess;
+  int rankPeerIdxDst;
+  int rankPeerIdx, peerIdx;
+
+  if (newNRasPeers == -1) {
+    // First calculate the new size of rasPeers.
+    newNRasPeers = nRasPeers;
+    for (rankPeerIdx = peerIdx = 0; rankPeerIdx < *nRankPeers; rankPeerIdx++) {
+      struct rasPeerInfo* rankPeer = rankPeers+rankPeerIdx;
+      struct rasPeerInfo* rasPeer = rasPeers+peerIdx;
+      // Starts at 1 so that an exhausted rasPeers array (loop below not entered) counts as "goes at the end".
+      int cmp = 1;
+
+      while (peerIdx < nRasPeers) {
+        cmp = ncclSocketsCompare(&rankPeer->addr, &rasPeer->addr);
+
+        if (cmp < 0) {
+          // rankPeer will go in front of rasPeer.
+          newNRasPeers++;
+          break;
+        }
+
+        peerIdx++;
+        rasPeer++;
+
+        // An exact address match merges into the existing entry, so it does not grow the array.
+        if (cmp == 0)
+          break;
+      }
+      if (cmp > 0) // No more rasPeer entries -- rankPeer will go at the end.
+        newNRasPeers++;
+    }
+  }
+
+  // If needed, allocate a new, larger rasPeers array.
+  struct rasPeerInfo* newRasPeers;
+  int myNewPeerIdx;
+  if (newNRasPeers > nRasPeers) {
+    NCCLCHECKGOTO(ncclCalloc(&newRasPeers, newNRasPeers), ret, fail);
+  } else {
+    // Nothing to insert: the merge below then runs in place on the existing array.
+    newRasPeers = rasPeers;
+  }
+
+  // Now merge the rankPeers into newRasPeers.  In the process, modify rankPeers to become a "diff" between
+  // the old rasPeers and newRasPeers -- this will be the data structure to broadcast on the RAS network.
+  myNewPeerIdx = -1;
+  int newPeerIdx;
+  for (newPeerIdx = rankPeerIdx = peerIdx = 0; rankPeerIdx < *nRankPeers || peerIdx < nRasPeers;) {
+    struct rasPeerInfo* rankPeer = rankPeers+rankPeerIdx;
+    struct rasPeerInfo* rasPeer = rasPeers+peerIdx;
+    struct rasPeerInfo* newRasPeer = newRasPeers+newPeerIdx;
+
+    if (rankPeerIdx < *nRankPeers) {
+      if (peerIdx < nRasPeers) {
+        int cmp = ncclSocketsCompare(&rankPeer->addr, &rasPeer->addr);
+
+        if (cmp < 0) {
+          // rankPeer needs to occur before rasPeer -- that's possible only if we are adding new entries.
+          assert(newRasPeers != rasPeers);
+          // Add new entry to newRasPeers.
+          assert(newPeerIdx < newNRasPeers);
+          memcpy(newRasPeer, rankPeer, sizeof(*newRasPeer));
+          newPeerIdx++;
+          rankPeerIdx++;
+        }
+        else {
+          // cmp >= 0 -- Start by copying peer to newRasPeer, if needed.
+          if (newRasPeers != rasPeers) {
+            assert(newPeerIdx < newNRasPeers);
+            memcpy(newRasPeer, rasPeer, sizeof(*newRasPeer));
+          }
+          else { // in-place
+            assert(newRasPeer == rasPeer);
+          }
+
+          if (cmp == 0) {
+            // The address of rankPeer is the same as that of newRasPeer -- merge into it.
+            // First though calculate what GPUs from rankPeer are actually new (if any).
+            uint64_t newDevs = rankPeer->cudaDevs & ~newRasPeer->cudaDevs;
+            newRasPeer->cudaDevs |= rankPeer->cudaDevs;
+            // Update rankPeer->devs with the newly added devs only -- we'll clean it up at the end.
+            rankPeer->cudaDevs = newDevs;
+            // Repeat for nvmlDevs...
+            newDevs = rankPeer->nvmlDevs & ~newRasPeer->nvmlDevs;
+            newRasPeer->nvmlDevs |= rankPeer->nvmlDevs;
+            rankPeer->nvmlDevs = newDevs;
+            rankPeerIdx++;
+          }
+          // Given that we might've added new entries, we need to update myPeerIdx as well.
+          if (myPeerIdx == peerIdx)
+            myNewPeerIdx = newPeerIdx;
+          peerIdx++;
+          newPeerIdx++;
+        }
+      } else { // peerIdx == nRasPeers
+        // No more rasPeers -- add a new entry based on rank.
+        assert(newPeerIdx < newNRasPeers);
+        memcpy(newRasPeer, rankPeer, sizeof(*newRasPeer));
+        // If this is the first time this function is run, myPeerIdx will need to be set.  It's more work in that
+        // case as we need to compare the addresses of each peer until we find one.
+        if (myPeerIdx == -1 && memcmp(&newRasPeer->addr, &rasNetListeningSocket.addr, sizeof(newRasPeer->addr)) == 0)
+          myNewPeerIdx = newPeerIdx;
+        newPeerIdx++;
+        rankPeerIdx++;
+      }
+    } else { // rankPeerIdx == *nRankPeers
+      // No more rankPeers -- copy the rasPeer over if needed.
+      if (newRasPeers != rasPeers) {
+        assert(newPeerIdx < newNRasPeers);
+        memcpy(newRasPeer, rasPeer, sizeof(*newRasPeer));
+      }
+      else { // in-place at the end.
+        assert(newRasPeer == rasPeer);
+      }
+      if (myPeerIdx == peerIdx)
+        myNewPeerIdx = newPeerIdx;
+      peerIdx++;
+      newPeerIdx++;
+    }
+  }
+  assert(newPeerIdx == newNRasPeers);
+
+  // Install the merged array (if a new one was allocated) and refresh the derived globals.
+  if (newRasPeers != rasPeers) {
+    if (rasPeers)
+      free(rasPeers);
+    rasPeers = newRasPeers;
+    nRasPeers = newNRasPeers;
+    assert(myNewPeerIdx != -1);
+    myPeerIdx = myNewPeerIdx;
+  } else {
+    assert(myNewPeerIdx == myPeerIdx);
+  }
+  rasPeersHash = getHash((const char*)rasPeers, nRasPeers*sizeof(*rasPeers));
+
+  // Purge from rankPeers all entries that didn't actually contribute any new GPUs.
+  // Entries merged above had cudaDevs reduced to just the newly added devices, so 0 means "nothing new".  Note that
+  // only cudaDevs is consulted here, not nvmlDevs.
+  for (rankPeerIdx = rankPeerIdxDst = 0; rankPeerIdx < *nRankPeers; rankPeerIdx++) {
+    struct rasPeerInfo* rankPeer = rankPeers+rankPeerIdx;
+    if (rankPeer->cudaDevs != 0) {
+      if (rankPeerIdxDst != rankPeerIdx) {
+        memcpy(rankPeers+rankPeerIdxDst, rankPeer, sizeof(*rankPeers));
+      }
+      rankPeerIdxDst++;
+    }
+  }
+  assert(rankPeerIdxDst <= *nRankPeers);
+  *nRankPeers = rankPeerIdxDst;
+
+exit:
+  return ret;
+fail:
+  goto exit;
+}
+
+// Searches through rasPeers given the peer address.  Returns the index of the found entry in the rasPeers
+// array or -1 if not found (which includes the case where rasPeers is still empty or unallocated).
+int rasPeerFind(const union ncclSocketAddress* addr) {
+  // Same precaution as in rasPeerIsDead: never hand a null base pointer to bsearch, even with a zero count.
+  if (rasPeers == nullptr || nRasPeers <= 0)
+    return -1;
+
+  const struct rasPeerInfo* peer = (const struct rasPeerInfo*)bsearch(addr, rasPeers, nRasPeers, sizeof(*rasPeers),
+                                                                      rasAddrPeerInfoCompare);
+  // The pointer difference is bounded by nRasPeers (an int), so the narrowing is safe.
+  return (peer ? (int)(peer-rasPeers) : -1);
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////
+// Functions related to the propagation of peers updates over the RAS network. //
+/////////////////////////////////////////////////////////////////////////////////
+
+// Spreads a peers update over the RAS network links and then reconfigures those links to account for it.
+// newPeers/nNewPeers -- the peer entries to forward.
+// updateDeadPeers -- when set, an update goes out even if nNewPeers is 0, and also to the peers listed in ranks,
+// so that at least rasDeadPeers reaches them.
+// ranks/nranks -- if provided, the members of the communicator being established; they already know about the new
+// peers and thus normally don't need to be notified.
+// fromConnIdx -- if provided, identifies the connection the update was received on; it is not sent back through it.
+// Both links (rasNextLink and rasPrevLink) are processed; connections are (re)calculated via rasLinkReinitConns.
+static ncclResult_t rasNetUpdatePeers(const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers,
+                                      struct rasRankInit* ranks, int nranks, int fromConnIdx) {
+  ncclResult_t ret = ncclSuccess;
+  const bool nothingToDo = (nNewPeers == 0 && !updateDeadPeers);
+
+  if (!nothingToDo) {
+    // Propagation failures are deliberately ignored: a later keep-alive exchange gives us another chance to sync.
+    (void)rasLinkPropagateUpdate(&rasNextLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConnIdx);
+    (void)rasLinkPropagateUpdate(&rasPrevLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConnIdx);
+
+    // With the peers possibly changed, recompute each link's peers, opening new connections where required.
+    NCCLCHECKGOTO(rasLinkReinitConns(&rasNextLink), ret, fail);
+    NCCLCHECKGOTO(rasLinkReinitConns(&rasPrevLink), ret, fail);
+  }
+
+exit:
+  return ret;
+fail:
+  goto exit;
+}
+
+// Forwards a peers update over every connection that belongs to the given link.  The arguments have the same
+// meaning as in rasNetUpdatePeers.  Always returns ncclSuccess: a failed send is not treated as fatal because it
+// will be retried after a keep-alive.
+static ncclResult_t rasLinkPropagateUpdate(struct rasLink* link, const struct rasPeerInfo* newPeers, int nNewPeers,
+                                           bool updateDeadPeers, struct rasRankInit* ranks, int nranks,
+                                           int fromConnIdx) {
+  for (int connNo = 0; connNo < link->nConns; connNo++) {
+    const int connIdx = link->conns[connNo].connIdx;
+
+    // Skip link entries without a connection, as well as the connection the notification arrived on (echoing it
+    // back wouldn't loop forever, but it would be a pointless extra exchange).
+    if (connIdx == -1 || connIdx == fromConnIdx)
+      continue;
+
+    (void)rasConnPropagateUpdate(rasConns+connIdx, newPeers, nNewPeers, updateDeadPeers, ranks, nranks);
+  }
+
+  return ncclSuccess;
+}
+
+// Forwards a peers update over a single connection.  The arguments have the same meaning as in rasNetUpdatePeers.
+// Nothing is sent if the connection has no socket in the RAS_SOCK_READY state, or if the peer at the other end is
+// listed in ranks (it took part in the new communicator) and no dead peers update was requested.
+static ncclResult_t rasConnPropagateUpdate(struct rasConnection* conn, const struct rasPeerInfo* newPeers,
+                                           int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks) {
+  if (conn->sockIdx == -1 || rasSockets[conn->sockIdx].status != RAS_SOCK_READY)
+    return ncclSuccess;
+
+  // With the rank info at hand (and no dead peers to push), a peer found among the ranks already knows.
+  if (ranks && !updateDeadPeers &&
+      bsearch(&conn->addr, ranks, nranks, sizeof(*ranks), rasAddrRankInitCompare) != nullptr)
+    return ncclSuccess;
+
+  // Either the peer did not participate or we can't tell -- send it the update.
+  NCCLCHECK(rasConnSendPeersUpdate(conn, newPeers, nNewPeers));
+  return ncclSuccess;
+}
+
+// Sends a RAS_MSG_PEERSUPDATE message, which can include both the rasPeers (preferably only the newly added peers
+// rather than the complete rasPeers array, to save on the network bandwidth) and rasDeadPeers (sent in its entirety
+// if at all, as it's assumed to be a lot smaller than rasPeers).
+// conn -- the connection to send on; its lastSent*Hash fields are updated for whatever part gets included.
+// peers/nPeers -- the peer entries to include; peers may be null when nPeers is 0.
+// Each part is suppressed if the current hash was already sent to, or received from, this connection; if both parts
+// end up empty, no message is sent at all.
+// Returns ncclSuccess, or the error from rasMsgAlloc.
+ncclResult_t rasConnSendPeersUpdate(struct rasConnection* conn, const struct rasPeerInfo* peers, int nPeers) {
+  struct rasMsg* msg = nullptr;
+  int msgLen;
+  int deadPeersOffset = 0;
+  int nDeadPeers;
+
+  if (conn->lastSentPeersHash == rasPeersHash || conn->lastRecvPeersHash == rasPeersHash) {
+    nPeers = 0;
+  }
+  if (conn->lastSentDeadPeersHash == rasDeadPeersHash || conn->lastRecvDeadPeersHash == rasDeadPeersHash) {
+    nDeadPeers = 0;
+  } else {
+    // We expect the rasDeadPeers array to be much smaller than rasPeers so if we send it, we send it in full.
+    nDeadPeers = nRasDeadPeers;
+  }
+
+  if (nPeers == 0 && nDeadPeers == 0)
+    goto exit;
+
+  // Layout: message header, then nPeers peer entries, then (suitably aligned) nDeadPeers dead peer addresses.
+  msgLen = rasMsgLength(RAS_MSG_PEERSUPDATE) + nPeers*sizeof(*peers);
+  if (nDeadPeers > 0) {
+    ALIGN_SIZE(msgLen, alignof(union ncclSocketAddress));
+    deadPeersOffset = msgLen;
+    msgLen += nDeadPeers*sizeof(*rasDeadPeers);
+  }
+
+  NCCLCHECK(rasMsgAlloc(&msg, msgLen));
+  msg->type = RAS_MSG_PEERSUPDATE;
+  msg->peersUpdate.peersHash = rasPeersHash;
+  msg->peersUpdate.nPeers = nPeers;
+  msg->peersUpdate.deadPeersHash = rasDeadPeersHash;
+  msg->peersUpdate.nDeadPeers = nDeadPeers;
+  // Only copy non-empty parts: memcpy must not be given a null source (peers or rasDeadPeers may legitimately be
+  // null when the corresponding count is 0), not even for a zero-length copy.
+  if (nPeers > 0)
+    memcpy(msg->peersUpdate.peers, peers, nPeers * sizeof(msg->peersUpdate.peers[0]));
+  if (nDeadPeers > 0)
+    memcpy(((char*)msg)+deadPeersOffset, rasDeadPeers, nDeadPeers * sizeof(*rasDeadPeers));
+
+  if (nPeers > 0)
+    conn->lastSentPeersHash = rasPeersHash;
+  if (nDeadPeers > 0)
+    conn->lastSentDeadPeersHash = rasDeadPeersHash;
+
+  INFO(NCCL_RAS, "RAS sending a peersUpdate to %s (nPeers %d, nDeadPeers %d)",
+       ncclSocketToString(&conn->addr, rasLine), nPeers, nDeadPeers);
+
+  rasConnEnqueueMsg(conn, msg, msgLen);
+exit:
+  return ncclSuccess;
+}
+
+// Handles the RAS_MSG_PEERSUPDATE message on the receiver side.  The received data is merged into the local
+// rasPeers and rasDeadPeers arrays.  If the checksums of the resulting arrays don't match those from the message,
+// sends its own RAS_MSG_PEERSUPDATE back to the source, to ensure a sync.
+// Subsequently propagates the update to its own peers.
+// msg -- the received message; its peers/dead peers payload and counts are modified in place by the merge.
+// sock -- the socket the message arrived on; it must be associated with a connection.
+// Returns ncclSuccess or the first error reported by the allocation, merge or propagation steps.
+ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) {
+  ncclResult_t ret = ncclSuccess;
+  struct rasMsg* newMsg = nullptr;
+  int newMsgLen = 0;
+  assert(sock->connIdx != -1);
+  struct rasConnection* conn = rasConns+sock->connIdx;
+  int nPeers, nDeadPeers;
+  int deadPeersOffset = 0;
+  bool updatePeers, updateDeadPeers;
+
+  INFO(NCCL_RAS, "RAS handling peersUpdate from %s (peersHash 0x%lx, deadPeersHash 0x%lx, nPeers %d, nDeadPeers %d)",
+       ncclSocketToString(&sock->sock.addr, rasLine), msg->peersUpdate.peersHash, msg->peersUpdate.deadPeersHash,
+       msg->peersUpdate.nPeers, msg->peersUpdate.nDeadPeers);
+  INFO(NCCL_RAS, "RAS my old rasPeersHash 0x%lx, rasDeadPeersHash 0x%lx, nRasPeers %d, nRasDeadPeers %d",
+       rasPeersHash, rasDeadPeersHash, nRasPeers, nRasDeadPeers);
+  conn->lastRecvPeersHash = msg->peersUpdate.peersHash;
+  conn->lastRecvDeadPeersHash = msg->peersUpdate.deadPeersHash;
+
+  // Prepare ours to send back.  We don't enqueue it right away because we want to make sure first that we need
+  // to send it.  We'll find out by comparing the hash values after the merge.
+  // We want to prepare the message pre-merge though because post-merge it will include the just received new peers,
+  // and it's pointless to send those back to where they just came from.
+  // nPeers and nDeadPeers are used primarily for message length calculations, so they have to assume the worst-case
+  // scenario (e.g., no overlap in case of nDeadPeers).
+  nPeers = (msg->peersUpdate.peersHash != rasPeersHash ? nRasPeers : 0);
+  nDeadPeers = (msg->peersUpdate.deadPeersHash != rasDeadPeersHash ? nRasDeadPeers+msg->peersUpdate.nDeadPeers : 0);
+  if (nPeers > 0 || nDeadPeers > 0) {
+    newMsgLen = rasMsgLength(RAS_MSG_PEERSUPDATE) + nPeers*sizeof(*rasPeers);
+    if (nDeadPeers > 0) {
+      ALIGN_SIZE(newMsgLen, alignof(union ncclSocketAddress));
+      newMsgLen += nDeadPeers*sizeof(*rasDeadPeers);
+    }
+    NCCLCHECKGOTO(rasMsgAlloc(&newMsg, newMsgLen), ret, fail);
+    newMsg->type = RAS_MSG_PEERSUPDATE;
+    // Note that after rasPeersUpdate below we may still decide not to send the peers.
+    // Skip the copy altogether when there is nothing to copy, so that memcpy never sees a zero-length request.
+    if (nPeers > 0)
+      memcpy(newMsg->peersUpdate.peers, rasPeers, nPeers * sizeof(newMsg->peersUpdate.peers[0]));
+    newMsg->peersUpdate.nPeers = nPeers;
+
+    if (nDeadPeers > 0) {
+      // Calculate the offset where dead peers are stored in the received message.  We do it before the peers
+      // update because it could modify msg->peersUpdate.nPeers...
+      deadPeersOffset = rasMsgLength(RAS_MSG_PEERSUPDATE) + msg->peersUpdate.nPeers * sizeof(msg->peersUpdate.peers[0]);
+      ALIGN_SIZE(deadPeersOffset, alignof(union ncclSocketAddress));
+    }
+
+    if (nPeers > 0)
+      NCCLCHECKGOTO(rasPeersUpdate(msg->peersUpdate.peers, &msg->peersUpdate.nPeers), ret, fail);
+    else
+      msg->peersUpdate.nPeers = 0;
+    if (nDeadPeers > 0)
+      NCCLCHECKGOTO(rasDeadPeersUpdate((union ncclSocketAddress*)(((char*)msg)+deadPeersOffset),
+                                       &msg->peersUpdate.nDeadPeers), ret, fail);
+    else
+      msg->peersUpdate.nDeadPeers = 0;
+
+    INFO(NCCL_RAS, "RAS finished local processing of peersUpdate "
+         "(new nRasPeers %d, nRasDeadPeers %d, nPeers %d, nDeadPeers %d)",
+         nRasPeers, nRasDeadPeers, msg->peersUpdate.nPeers, msg->peersUpdate.nDeadPeers);
+    if (msg->peersUpdate.nPeers > 0)
+      rasPeersDump();
+    if (msg->peersUpdate.nDeadPeers > 0)
+      rasDeadPeersDump();
+
+    // If post-merge the hashes are still different, send our (dead) peers back.
+    updatePeers = (conn->lastSentPeersHash != rasPeersHash && conn->lastRecvPeersHash != rasPeersHash);
+    updateDeadPeers = (conn->lastSentDeadPeersHash != rasDeadPeersHash &&
+                       conn->lastRecvDeadPeersHash != rasDeadPeersHash);
+    if (updatePeers || updateDeadPeers) {
+      newMsg->peersUpdate.peersHash = rasPeersHash;
+      newMsg->peersUpdate.deadPeersHash = rasDeadPeersHash;
+      if (updatePeers) {
+        assert(nPeers > 0);
+        conn->lastSentPeersHash = rasPeersHash;
+      } else {
+        // If hashes match, make sure that we don't send the rasPeers back.
+        newMsg->peersUpdate.nPeers = 0;
+      }
+
+      // We need to recalculate the message size from scratch now that both rasPeers and rasDeadPeers may have changed.
+      newMsgLen = rasMsgLength(RAS_MSG_PEERSUPDATE) + newMsg->peersUpdate.nPeers * sizeof(*rasPeers);
+
+      if (updateDeadPeers) {
+        assert(nRasDeadPeers > 0);
+
+        ALIGN_SIZE(newMsgLen, alignof(union ncclSocketAddress));
+        deadPeersOffset = newMsgLen;
+        newMsgLen += nRasDeadPeers*sizeof(*rasDeadPeers);
+
+        // Copy exactly the post-merge nRasDeadPeers entries that the message advertises and is sized for.  The
+        // pre-merge worst-case estimate (nDeadPeers) can be larger whenever the received list overlapped with ours,
+        // and would copy slots past the valid part of rasDeadPeers.
+        memcpy(((char*)newMsg)+deadPeersOffset, rasDeadPeers, nRasDeadPeers * sizeof(*rasDeadPeers));
+        conn->lastSentDeadPeersHash = rasDeadPeersHash;
+        newMsg->peersUpdate.nDeadPeers = nRasDeadPeers;
+      } else {
+        newMsg->peersUpdate.nDeadPeers = 0;
+      }
+
+      INFO(NCCL_RAS, "RAS sending back a peersUpdate (nPeers %d, nDeadPeers %d)",
+           newMsg->peersUpdate.nPeers, newMsg->peersUpdate.nDeadPeers);
+
+      rasConnEnqueueMsg(conn, newMsg, newMsgLen);
+      // The message has been handed over for sending; clear the pointer so that it is not freed at exit.
+      newMsg = nullptr;
+    } // if (updatePeers || updateDeadPeers)
+
+    // Propagate the changes through our RAS network links.
+    NCCLCHECKGOTO(rasNetUpdatePeers(msg->peersUpdate.peers, msg->peersUpdate.nPeers, updateDeadPeers, nullptr, 0,
+                                    sock->connIdx), ret, fail);
+  }
+
+exit:
+  rasMsgFree(newMsg);
+  return ret;
+fail:
+  goto exit;
+}
+
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// Functions related to the (re-)configuration of RAS connections after a peers update. //
+//////////////////////////////////////////////////////////////////////////////////////////
+
+// Reinitializes the connection(s) of a particular link, following a peers update.
+// Adding new peers can affect the calculation of the link's primary connection and also the fallbacks.
+// The newly added peers could also shift all the existing peerIdx values, invalidating the values in RasLinkConn
+// structures, so it's better to drop it all and recalculate from scratch.
+// We recalculate the primary peer; if an active connection to it already exists, then we're done.  If there
+// is no connection, we create one.  If a connection exists but is experiencing delays then we add a fallback and
+// the process repeats.
+// External conns are dropped from the links as well (they will be re-created via keepAlive messages as needed).
+// link -- the link to rebuild; its conns array is allocated/grown in RAS_INCREMENT steps and nConns is reset.
+// Returns ncclSuccess or the error from the allocation or connection creation calls.
+static ncclResult_t rasLinkReinitConns(struct rasLink* link) {
+  struct rasLinkConn* linkConn;
+  struct rasConnection* conn = nullptr;
+  // The search for the primary peer starts from our own position in rasPeers.
+  int newPeerIdx = myPeerIdx;
+
+  if (link->connsSize == 0) {
+    link->connsSize = RAS_INCREMENT;
+    NCCLCHECK(ncclCalloc(&link->conns, link->connsSize));
+  }
+  link->nConns = 0;
+
+  // Establish a connection for this link.  We iterate as long as the connections we find are experiencing delays.
+  while (newPeerIdx != -1) {
+    if (link->nConns == link->connsSize) {
+      NCCLCHECK(ncclRealloc(&link->conns, link->connsSize, link->connsSize+RAS_INCREMENT));
+      link->connsSize += RAS_INCREMENT;
+    }
+
+    // nConns is 0 when calculating the primary and 1 when calculating the first fallback (which starts from the
+    // primary), so the starting peer is itself a fallback only once nConns exceeds 1.
+    newPeerIdx = rasLinkCalculatePeer(link, newPeerIdx, /*isFallback*/link->nConns > 1);
+    if (newPeerIdx == -1) {
+      INFO(NCCL_RAS, "RAS link %d: no more fallbacks to add (nConns %d)", link->direction, link->nConns);
+      if (link->nConns > 0)
+        break;
+      // With nConns == 0 we fall through and still record one entry, with peerIdx and connIdx both set to -1.
+    }
+    linkConn = link->conns+link->nConns;
+    linkConn->peerIdx = newPeerIdx;
+    linkConn->connIdx = (newPeerIdx != -1 ? rasConnFind(&rasPeers[newPeerIdx].addr) : -1);
+    linkConn->external = false;
+
+    // If the calculated connection does not exist, then we are at the end of the chain and this is the last iteration.
+    // Depending on the circumstances, we may first need to create that connection.
+    if (linkConn->connIdx == - 1) {
+      if (link->nConns == 0) {
+        if (linkConn->peerIdx != -1) {
+          INFO(NCCL_RAS, "RAS link %d: %s primary connection with %s",
+               link->direction, (myPeerIdx < linkConn->peerIdx ? "opening new" : "calculated deferred"),
+               ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine));
+          // We try to initiate primary connections from the side with a lower address (and thus an earlier peer index)
+          // to avoid races and the creation of duplicate connections.
+          if (myPeerIdx < linkConn->peerIdx) {
+            NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->connIdx));
+          }
+          else { // If we didn't initiate the connection, start the timeout.
+            link->lastUpdatePeersTime = clockNano();
+          }
+        } // if (linkConn->peerIdx != -1)
+      } else { // link->nConns > 0
+        // Unlike primaries, fallback connections are always initiated from this side.
+        INFO(NCCL_RAS, "RAS link %d: opening new fallback connection %d with %s",
+             link->direction, link->nConns, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine));
+        NCCLCHECK(rasConnCreate(&rasPeers[newPeerIdx].addr, &linkConn->connIdx));
+      } // link->nConns > 0
+    } else { // linkConn->connIdx != -1
+      if (link->nConns == 0) {
+        INFO(NCCL_RAS, "RAS link %d: calculated existing primary connection with %s",
+             link->direction, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine));
+      } else {
+        INFO(NCCL_RAS, "RAS link %d: calculated existing fallback connection %d with %s",
+             link->direction, link->nConns, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine));
+      }
+    }
+    link->nConns++;
+    // Still no connection (no peer at all, or a deferred primary): nothing further to examine.
+    if (linkConn->connIdx == -1)
+      break;
+    conn = rasConns+linkConn->connIdx;
+
+    // We check if the connection already went through the fallback calculation; if so, we'll need to create a new
+    // fallback in the next iteration, to ensure that RAS will keep retrying.
+    if (!conn->experiencingDelays)
+      break;
+
+    INFO(NCCL_RAS, "RAS connection experiencingDelays %d, startRetryTime %.2fs, socket status %d",
+         conn->experiencingDelays, (clockNano()-conn->startRetryTime)/1e9,
+         (conn->sockIdx == -1 ? -1 : rasSockets[conn->sockIdx].status));
+  }
+
+  return ncclSuccess;
+}
+
+// Calculates the index of the peer on the RAS network.  Can also be used to calculate the index of the next fallback
+// peer.
+// In the simplest case we want to try the "next closest" fallback, although we still need to check for and skip
+// any dead peers.
+// For fallbacks to fallbacks, we also apply a more pessimistic policy.  We skip all the remaining RAS threads that
+// are on the same node as the previous fallback (unless it's the same node that we're running on or we have strong
+// indications that the node is up).  We do that to avoid having to excessively wait iterating through, say, 8
+// processes when a whole node might be down.
+// link -- supplies the direction, which is added to the index to step through rasPeers (with wrap-around).
+// peerIdx -- the index in rasPeers to start from (the result is a different index).
+// isFallback -- whether peerIdx is itself a fallback, which enables the node-skipping policy described above.
+// Returns the index of the calculated peer, or -1 if the search wrapped around to myPeerIdx.
+// The modulo arithmetic requires nRasPeers > 0.
+int rasLinkCalculatePeer(const struct rasLink* link, int peerIdx, bool isFallback) {
+  // nRasPeers is added before taking the modulo so that a negative step doesn't yield a negative index.
+  int newPeerIdx = (peerIdx + link->direction + nRasPeers) % nRasPeers;
+  do {
+    if (isFallback && !ncclSocketsSameNode(&rasPeers[peerIdx].addr, &rasNetListeningSocket.addr)) {
+      // peerIdx is a fallback and it is not running on the same node as us.
+      int tryPeerIdx = newPeerIdx;
+      int tryConnIdx = -1;
+
+      // Try to skip the remaining peers on the same node as peerIdx.  We may end up skipping over some peers that
+      // are alive, which is fine -- they will still have connectivity with the rest of the RAS network, just a
+      // little suboptimal one.
+      while (ncclSocketsSameNode(&rasPeers[tryPeerIdx].addr, &rasPeers[peerIdx].addr)) {
+        if (!rasPeerIsDead(&rasPeers[tryPeerIdx].addr)) {
+          tryConnIdx = rasConnFind(&rasPeers[tryPeerIdx].addr);
+          if (tryConnIdx != -1) {
+            struct rasConnection* tryConn = rasConns+tryConnIdx;
+            // Check if the connection is fully established and operational, i.e., if the underlying socket
+            // is ready and there's been recent communication on it.
+            if (tryConn->sockIdx != -1 && rasSockets[tryConn->sockIdx].status == RAS_SOCK_READY &&
+                !tryConn->experiencingDelays) {
+              // We convinced ourselves that the node is not down.  We don't adjust newPeerIdx in
+              // this case.  This is the only case when tryConnIdx != -1 after this loop.
+              break;
+            }
+          } // if (tryConnIdx != -1)
+        } // if (!rasPeerIsDead(&rasPeers[tryPeerIdx].addr))
+
+        // Not a usable connection: reset the marker and step to the next peer in the link's direction.
+        tryConnIdx = -1;
+        tryPeerIdx = (tryPeerIdx + nRasPeers + link->direction) % nRasPeers;
+        if (tryPeerIdx == myPeerIdx)
+          break;
+      }
+
+      // No operational connection found on that node: adopt the first peer beyond it as the candidate.
+      if (tryConnIdx == -1)
+        newPeerIdx = tryPeerIdx;
+      // Wrapped all the way around to ourselves -- stop searching.
+      if (tryPeerIdx == myPeerIdx)
+        break;
+    } // if (isFallback && !ncclSocketsSameNode(&rasPeers[peerIdx].addr, &rasNetListeningSocket.addr))
+    
+    // Dead peers are never returned; keep stepping until a live one is found or we wrap around.
+    if (rasPeerIsDead(&rasPeers[newPeerIdx].addr)) {
+      newPeerIdx = (newPeerIdx + nRasPeers + link->direction) % nRasPeers;
+    }
+    else
+      break;
+  } while (newPeerIdx != myPeerIdx);
+
+  return (newPeerIdx != myPeerIdx ? newPeerIdx : -1);
+}
+
+
+//////////////////////////////////////////////////////
+// Functions related to the handling of dead peers. //
+//////////////////////////////////////////////////////
+
+// Records the given address in the local rasDeadPeers array, keeping the array sorted and rasDeadPeersHash current.
+// Does nothing if the peer is already marked dead.  Propagating the news, reconfiguring links, etc., is the
+// caller's job.
+// Returns ncclSuccess, or the error from getNewDeadEntry if the array could not be enlarged.
+ncclResult_t rasPeerDeclareDead(const union ncclSocketAddress* addr) {
+  if (rasPeerIsDead(addr))
+    return ncclSuccess;
+
+  union ncclSocketAddress* slot;
+  NCCLCHECK(getNewDeadEntry(&slot));
+  memcpy(slot, addr, sizeof(*slot));
+
+  // The new entry was appended at the end; re-sort so that the binary search in rasPeerIsDead keeps working.
+  qsort(rasDeadPeers, nRasDeadPeers, sizeof(*rasDeadPeers), &ncclSocketsCompare);
+  rasDeadPeersHash = getHash((const char*)rasDeadPeers, nRasDeadPeers*sizeof(*rasDeadPeers));
+
+  INFO(NCCL_RAS, "RAS declaring peer %s as DEAD; rasDeadPeersHash 0x%lx",
+       ncclSocketToString(addr, rasLine), rasDeadPeersHash);
+  return ncclSuccess;
+}
+
+// Invoked when an incoming RAS_MSG_PEERSUPDATE includes info on dead peers.  Updates the rasDeadPeers array.
+// Any propagation needs to be handled outside of this function, though it *does* disconnect any connections
+// with the newly dead peers.
+// updatePeers/nUpdatePeers -- the received dead peer addresses.  They are merged in lockstep with rasDeadPeers using
+// ncclSocketsCompare, so they are expected to be sorted the same way (NOTE(review): confirm against the sender).
+// The updatePeers array itself is not modified.
+// Side effects: may replace the rasDeadPeers array (freeing the old one); updates nRasDeadPeers, rasDeadPeersSize
+// and rasDeadPeersHash.
+// On return, nUpdatePeers contains the number of newly added dead entries.
+static ncclResult_t rasDeadPeersUpdate(union ncclSocketAddress* updatePeers, int* nUpdatePeers) {
+  // Plain locals: both are assigned below before being used, so no state needs to survive between calls.
+  union ncclSocketAddress* newPeers = nullptr;
+  union ncclSocketAddress* oldPeers = nullptr;
+
+  if (*nUpdatePeers == 0)
+    return ncclSuccess;
+
+  // Pessimistically estimate the new size of rasDeadPeers.
+  int nNewPeers = nRasDeadPeers + *nUpdatePeers;
+  if (nNewPeers > rasDeadPeersSize) {
+    nNewPeers = ROUNDUP(nNewPeers, RAS_INCREMENT);
+
+    NCCLCHECK(ncclCalloc(&newPeers, nNewPeers));
+    oldPeers = rasDeadPeers;
+  } else {
+    // We don't need to allocate a new array in this case.  We just shift the existing content to the end of the
+    // array to make room in the front for merging.
+    oldPeers = rasDeadPeers+(rasDeadPeersSize-nRasDeadPeers);
+    memmove(oldPeers, rasDeadPeers, nRasDeadPeers*sizeof(*rasDeadPeers));
+    newPeers = rasDeadPeers;
+  }
+
+  // Merge updatePeers with oldPeers into newPeers.
+  int oldPeersIdx, updatePeersIdx, newPeersIdx;
+  for (oldPeersIdx = updatePeersIdx = newPeersIdx = 0; oldPeersIdx < nRasDeadPeers || updatePeersIdx < *nUpdatePeers;) {
+    // cmp < 0: take the old entry; cmp > 0: take the (new) update entry; cmp == 0: duplicate, keep the old entry
+    // and consume both.
+    int cmp;
+    if (oldPeersIdx < nRasDeadPeers && updatePeersIdx < *nUpdatePeers) {
+      cmp = ncclSocketsCompare(oldPeers+oldPeersIdx, updatePeers+updatePeersIdx);
+    } else {
+      cmp = (oldPeersIdx < nRasDeadPeers ? -1 : 1);
+    }
+
+    // memmove rather than memcpy: in the in-place case, the source and destination can be the very same slot.
+    memmove(newPeers+newPeersIdx++, (cmp <= 0 ? oldPeers+oldPeersIdx : updatePeers+updatePeersIdx), sizeof(*newPeers));
+    if (cmp <= 0)
+      oldPeersIdx++;
+    if (cmp > 0) {
+      // A peer we didn't know to be dead -- drop any connection we may have with it.
+      rasConnDisconnect(updatePeers+updatePeersIdx);
+    }
+    if (cmp >= 0)
+      updatePeersIdx++;
+  }
+  *nUpdatePeers = newPeersIdx - nRasDeadPeers;
+  nRasDeadPeers = newPeersIdx;
+
+  if (newPeers != rasDeadPeers) {
+    free(rasDeadPeers);
+    rasDeadPeers = newPeers;
+    rasDeadPeersSize = nNewPeers;
+  }
+
+  rasDeadPeersHash = getHash((const char*)rasDeadPeers, nRasDeadPeers*sizeof(*rasDeadPeers));
+
+  return ncclSuccess;
+}
+
+// Hands out (via pAddr) a pointer to the first unused element of the rasDeadPeers array and counts it as used by
+// incrementing nRasDeadPeers.  A full array is first grown by RAS_INCREMENT elements.
+// Returns ncclSuccess, or the error from ncclRealloc.
+static ncclResult_t getNewDeadEntry(union ncclSocketAddress** pAddr) {
+  const bool arrayFull = (nRasDeadPeers == rasDeadPeersSize);
+
+  if (arrayFull) {
+    const int grownSize = rasDeadPeersSize+RAS_INCREMENT;
+    NCCLCHECK(ncclRealloc(&rasDeadPeers, rasDeadPeersSize, grownSize));
+    rasDeadPeersSize = grownSize;
+  }
+
+  *pAddr = &rasDeadPeers[nRasDeadPeers];
+  nRasDeadPeers++;
+  return ncclSuccess;
+}
+
+// Tells whether the given address is listed in the (sorted) rasDeadPeers array.  An unallocated array means that
+// no peer is dead.
+bool rasPeerIsDead(const union ncclSocketAddress* addr) {
+  if (rasDeadPeers == nullptr)
+    return false;
+
+  const void* match = bsearch(addr, rasDeadPeers, nRasDeadPeers, sizeof(*rasDeadPeers), ncclSocketsCompare);
+  return (match != nullptr);
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Auxiliary functions -- primarily sorting/searching callbacks, plus some debug output support. //
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// bsearch callback for arrays of struct rasRankInit: k is the ncclSocketAddress being looked up, e is the array
+// element whose addr it is compared against.
+static int rasAddrRankInitCompare(const void* k, const void* e) {
+  return ncclSocketsCompare(k, &((const struct rasRankInit*)e)->addr);
+}
+
+// bsearch callback for arrays of struct rasPeerInfo: k is the ncclSocketAddress being looked up, e is the array
+// element whose addr it is compared against.
+static int rasAddrPeerInfoCompare(const void* k, const void* e) {
+  return ncclSocketsCompare(k, &((const struct rasPeerInfo*)e)->addr);
+}
+
+// qsort callback for struct rasRankInit.  Elements are ordered by addr first and, for equal addresses, by cudaDev.
+// Elements with empty addresses compare as equal to each other.
+static int rasRanksCompare(const void* e1, const void* e2) {
+  const struct rasRankInit* lhs = (const struct rasRankInit*)e1;
+  const struct rasRankInit* rhs = (const struct rasRankInit*)e2;
+
+  const int addrOrder = ncclSocketsCompare(&lhs->addr, &rhs->addr);
+  if (addrOrder != 0)
+    return addrOrder;
+
+  // Same address.  For empty addresses there's nothing more to compare...
+  if (lhs->addr.sa.sa_family == 0)
+    return 0;
+
+  // Identical addresses are expected to come from one and the same process.
+  assert(lhs->pid == rhs->pid);
+
+  int devOrder;
+  if (lhs->cudaDev < rhs->cudaDev)
+    devOrder = -1;
+  else if (lhs->cudaDev > rhs->cudaDev)
+    devOrder = 1;
+  else
+    devOrder = 0;
+  assert(devOrder != 0); // The rank array is not supposed to contain complete duplicates.
+  return devOrder;
+}
+
+// qsort/bsearch callback for union ncclSocketAddress.  The sort keys are, in order: the address family (IPv4 before
+// IPv6, empty addresses last), the address and the port.  The fields aren't laid out in memory in that order, so a
+// single memcmp over the whole structure can't be used; the individual fields are in the network byte order though,
+// so each of them is compared with memcmp.
+int ncclSocketsCompare(const void* p1, const void* p2) {
+  const union ncclSocketAddress* lhs = (const union ncclSocketAddress*)p1;
+  const union ncclSocketAddress* rhs = (const union ncclSocketAddress*)p2;
+  const int lhsFamily = lhs->sa.sa_family;
+  const int rhsFamily = rhs->sa.sa_family;
+
+  if (lhsFamily != rhsFamily) {
+    // One of the two being empty means that it goes to the end (not that it matters...).
+    if (lhsFamily <= 0 || rhsFamily <= 0)
+      return (lhsFamily > 0 ? -1 : 1);
+    // AF_INET (2) is less than AF_INET6 (10).
+    return (lhsFamily < rhsFamily ? -1 : 1);
+  }
+
+  switch (lhsFamily) {
+    case AF_INET: {
+      int order = memcmp(&lhs->sin.sin_addr, &rhs->sin.sin_addr, sizeof(lhs->sin.sin_addr));
+      if (order == 0)
+        order = memcmp(&lhs->sin.sin_port, &rhs->sin.sin_port, sizeof(lhs->sin.sin_port));
+      return order;
+    }
+    case AF_INET6: {
+      int order = memcmp(&lhs->sin6.sin6_addr, &rhs->sin6.sin6_addr, sizeof(lhs->sin6.sin6_addr));
+      if (order == 0)
+        order = memcmp(&lhs->sin6.sin6_port, &rhs->sin6.sin6_port, sizeof(lhs->sin6.sin6_port));
+      return order;
+    }
+    default:
+      // Only empty addresses should be able to get here, and two of those are equal.
+      assert(lhsFamily == 0);
+      return 0;
+  }
+}
+
+// Tells whether two socket addresses belong to the same node (strictly speaking, to the same network interface of
+// a node): the families and the addresses need to match, whereas the ports are ignored.
+bool ncclSocketsSameNode(const union ncclSocketAddress* a1, const union ncclSocketAddress* a2) {
+  const int family = a1->sa.sa_family;
+
+  if (family != a2->sa.sa_family)
+    return false;
+
+  switch (family) {
+    case AF_INET:
+      return (memcmp(&a1->sin.sin_addr, &a2->sin.sin_addr, sizeof(a1->sin.sin_addr)) == 0);
+    case AF_INET6:
+      return (memcmp(&a1->sin6.sin6_addr, &a2->sin6.sin6_addr, sizeof(a1->sin6.sin6_addr)) == 0);
+    default:
+      return true; // Two empty addresses are considered equal.
+  }
+}
+
+// Debug output routine: logs every element of the rasPeers array (flagging the one at myPeerIdx), followed by
+// rasPeersHash.  Logs nothing if the array is empty.
+static void rasPeersDump() {
+  if (nRasPeers <= 0)
+    return;
+
+  int idx = 0;
+  while (idx < nRasPeers) {
+    INFO(NCCL_RAS, "RAS peer %d: %s%s", idx, rasPeerDump(rasPeers+idx, rasLine, sizeof(rasLine)), (idx == myPeerIdx ? " [this process]" : ""));
+    idx++;
+  }
+  INFO(NCCL_RAS, "RAS peersHash 0x%lx", rasPeersHash);
+}
+
+// Debug output routine: logs every element of the rasDeadPeers array, followed by rasDeadPeersHash.  A dead peer
+// that is also found in rasPeers is printed with its full peer info, otherwise just the address is printed.
+// Logs nothing if the array is empty.
+static void rasDeadPeersDump() {
+  if (nRasDeadPeers <= 0)
+    return;
+
+  int idx = 0;
+  while (idx < nRasDeadPeers) {
+    const int knownPeerIdx = rasPeerFind(rasDeadPeers+idx);
+    INFO(NCCL_RAS, "RAS dead peer %d: %s", idx,
+         (knownPeerIdx >= 0 ? rasPeerDump(rasPeers+knownPeerIdx, rasLine, sizeof(rasLine)) :
+          ncclSocketToString(rasDeadPeers+idx, rasLine)));
+    idx++;
+  }
+  INFO(NCCL_RAS, "RAS deadPeersHash 0x%lx", rasDeadPeersHash);
+}
+
+// Debug output routine: formats the socket address, pid and GPU list of a single rasPeers element.
+// The text is written to result (at most nres bytes, as enforced by snprintf); result is also the return value.
+static char* rasPeerDump(const struct rasPeerInfo* peer, char* result, size_t nres) {
+  char addrText[SOCKET_NAME_MAXLEN+1];
+  char gpusText[1024];
+  // "GPU" turns into "GPUs" if more than one bit is set in cudaDevs.
+  const char* pluralSuffix = (__builtin_popcountll(peer->cudaDevs) > 1 ? "s" : "");
+  const char* addrStr = ncclSocketToString(&peer->addr, addrText);
+  const char* gpusStr = rasGpuDevsToString(peer->cudaDevs, peer->nvmlDevs, gpusText, sizeof(gpusText));
+
+  snprintf(result, nres, "socket %s, pid %d, GPU%s %s", addrStr, peer->pid, pluralSuffix, gpusStr);
+  return result;
+}
@@ -0,0 +1,668 @@
+/*************************************************************************
+ * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NDEBUG // Comment out during development only!
+#include <cassert>
+#include <cstddef>
+#include <mutex>
+#include <poll.h>
+#include <unistd.h>
+
+#include "alloc.h"
+#include "checks.h"
+#include "comm.h"
+#include "nccl.h"
+#include "utils.h"
+#include "ras_internal.h"
+
+// Type of a notification from a local NCCL thread.
+typedef enum {
+  RAS_ADD_RANKS = 0, // Sent by ncclRasAddRanks; carries the ranks of a communicator.
+  RAS_TERMINATE = 1 // Sent by ncclRasCommFini once the RAS reference count drops to zero.
+} rasNotificationType;
+
+// Used for communication from local NCCL threads to the RAS thread.  Whole instances are written to the
+// notification pipe by rasLocalNotify and read back by rasLocalHandle.
+struct rasNotification {
+  rasNotificationType type; // Selects which member of the union below (if any) is meaningful.
+  union {
+    struct {
+      struct rasRankInit* ranks; // Passed by address; handed to rasLocalHandleAddRanks on the RAS thread.
+      int nranks; // Count supplied by the caller of ncclRasAddRanks alongside ranks.
+    } addRanks; // Payload of RAS_ADD_RANKS.
+  };
+};
+// Compile-time check that a notification is no larger than PIPE_BUF bytes.
+static_assert(sizeof(struct rasNotification) <= PIPE_BUF, "The rasNotification structure is too large");
+
+// These ensure that we get only one RAS port/thread per process.
+static std::mutex rasInitMutex; // Held during the one-time initialization in ncclRasCommInit.
+static bool rasInitialized = false; // Set by ncclRasCommInit once the RAS thread exists; cleared by rasThreadMain on failure.
+static int rasInitRefCount = 0; // Incremented by ncclRasCommInit, decremented by ncclRasCommFini.
+
+// The RAS network listening socket of this RAS thread (random port).
+struct ncclSocket rasNetListeningSocket;
+
+// Handle of the RAS thread (created and detached in ncclRasCommInit).
+static pthread_t rasThread;
+
+// Used for communication from regular NCCL threads to the RAS thread.
+static std::mutex rasNotificationMutex; // Serializes writers in rasLocalNotify.
+static int rasNotificationPipe[2] = {-1, -1}; // [0]: read end (polled by the RAS thread); [1]: write end.
+
+// Data for the main poll() in the RAS thread.
+struct pollfd* rasPfds; // Grown by rasGetNewPollEntry; entries with fd == -1 are free.
+static int nRasPfds; // Number of allocated entries in rasPfds.
+
+// We use it all over the place; no point in wasting the stack...
+char rasLine[SOCKET_NAME_MAXLEN+1];
+
+// An array holding the addresses of all NCCL communicators.  Modified by the NCCL threads (hence the mutex), read by
+// the RAS thread.
+std::mutex ncclCommsMutex;
+struct ncclComm** ncclComms = nullptr; // Entries set to nullptr are free and get reused by ncclRasCommInit.
+int nNcclComms = 0; // Number of allocated entries in ncclComms (not the number of live communicators).
+bool ncclCommsSorted = false; // Whether the array is currently sorted. We sort by the comms' commHash and rank.
+
+// Local notifications (NCCL threads -> RAS thread).
+static ncclResult_t rasLocalNotify(const struct rasNotification* msg);
+static ncclResult_t rasLocalHandle();
+static void rasLocalHandleTerminate();
+
+// Handlers for the handshake messages exchanged over RAS sockets.
+static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSocket* sock);
+static ncclResult_t rasMsgHandleConnInitAck(const struct rasMsg* msg, struct rasSocket* sock);
+static ncclResult_t rasNetSendNack(struct rasSocket* sock);
+
+// Entry point of the RAS thread.
+static void* rasThreadMain(void*);
+
+// Multiplier (default 1) applied to the RAS_* timeouts defined in ras_internal.h.
+NCCL_PARAM(RasTimeoutFactor, "RAS_TIMEOUT_FACTOR", 1);
+
+//////////////////////////////////////////////////
+// Functions invoked from regular NCCL threads. //
+//////////////////////////////////////////////////
+
+// Invoked by regular NCCL threads on every comm initialization.  This is the first function to call.
+// The myRank structure should be passed with the addr element initialized to the IP address of the bootstrap
+// network interface to use.  On a successful return, the address will be updated with the port number of the
+// RAS network listening socket.
+// The first call in a process additionally creates the RAS network listening socket, the client socket,
+// the notification pipe and the (detached) RAS thread; every call registers comm in the ncclComms array.
+// Returns ncclSuccess, or the error of the step that failed (the resources created by the one-time
+// initialization are released again in that case).
+ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank) {
+  ncclResult_t ret = ncclSuccess;
+  if (!rasInitialized) {
+    std::lock_guard<std::mutex> lock(rasInitMutex);
+    if (!rasInitialized) {
+      union ncclSocketAddress addr;
+
+      // Listen on the same interface as the bootstrap, but let the system pick the port (port 0).
+      memcpy(&addr, &myRank->addr, sizeof(addr));
+      (addr.sa.sa_family == AF_INET ? addr.sin.sin_port : addr.sin6.sin6_port) = htons(0);
+      NCCLCHECKGOTO(ncclSocketInit(&rasNetListeningSocket, &addr, NCCL_SOCKET_MAGIC, ncclSocketTypeRasNetwork,
+                                   /*abortFlag*/nullptr, /*asyncFlag*/1), ret, fail);
+      NCCLCHECKGOTO(ncclSocketListen(&rasNetListeningSocket), ret, fail);
+      INFO(NCCL_RAS, "RAS network listening socket at %s",
+           ncclSocketToString(&rasNetListeningSocket.addr, rasLine));
+
+      // Best effort: a failure to set up the client socket does not prevent RAS from starting.
+      (void)rasClientInitSocket();
+
+      SYSCHECKGOTO(pipe(rasNotificationPipe), "pipe", ret, fail);
+
+      PTHREADCHECKGOTO(pthread_create(&rasThread, nullptr, &rasThreadMain, nullptr), "pthread_create", ret, fail);
+      ncclSetThreadName(rasThread, "NCCL RAS");
+      (void)pthread_detach(rasThread);
+
+      rasInitialized = true;
+    }
+  }
+  ncclAtomicRefCountIncrement(&rasInitRefCount);
+
+  {
+    std::lock_guard<std::mutex> lock(ncclCommsMutex);
+
+    // Reuse the first free (nullptr) slot, enlarging the array if there is none.
+    int i;
+    for (i = 0; i < nNcclComms; i++) {
+      if (ncclComms[i] == nullptr)
+        break;
+    }
+    if (i == nNcclComms) {
+      NCCLCHECK(ncclRealloc(&ncclComms, nNcclComms, nNcclComms+RAS_INCREMENT*8));
+      nNcclComms += RAS_INCREMENT*8;
+    }
+    ncclComms[i] = comm;
+    ncclCommsSorted = false;
+  }
+
+  if (myRank != nullptr)
+    memcpy(&myRank->addr, &rasNetListeningSocket.addr, sizeof(myRank->addr));
+
+exit:
+  return ret;
+fail:
+  // The pipe descriptors use -1 as their "not open" value, so that is what we need to test against (0 is
+  // a valid descriptor).  We also restore -1 after closing: otherwise a subsequent failed initialization
+  // attempt would close the same descriptor numbers again, and by then they may belong to somebody else.
+  if (rasNotificationPipe[1] != -1) {
+    (void)close(rasNotificationPipe[1]);
+    rasNotificationPipe[1] = -1;
+  }
+  if (rasNotificationPipe[0] != -1) {
+    (void)close(rasNotificationPipe[0]);
+    rasNotificationPipe[0] = -1;
+  }
+  (void)close(rasClientListeningSocket);
+  (void)ncclSocketClose(&rasNetListeningSocket);
+  goto exit;
+}
+
+// Invoked by regular NCCL threads on every comm termination.  Removes comm from the ncclComms array and,
+// once the RAS reference count reaches zero, sends a RAS_TERMINATE notification to the RAS thread.
+// Does nothing if RAS is not initialized.
+ncclResult_t ncclRasCommFini(const struct ncclComm* comm) {
+  if (rasInitialized) {
+    {
+      std::lock_guard<std::mutex> guard(ncclCommsMutex);
+      int slot = 0;
+      while (slot < nNcclComms && ncclComms[slot] != comm)
+        slot++;
+      if (slot < nNcclComms) {
+        // Mark the slot as free; it will be reused by a later ncclRasCommInit.
+        ncclComms[slot] = nullptr;
+        ncclCommsSorted = false;
+      }
+    }
+    if (ncclAtomicRefCountDecrement(&rasInitRefCount) == 0) {
+      struct rasNotification terminateMsg;
+      terminateMsg.type = RAS_TERMINATE;
+      NCCLCHECK(rasLocalNotify(&terminateMsg));
+    }
+  }
+  return ncclSuccess;
+}
+
+// Invoked by regular NCCL threads on every (non-split) comm initialization.  Provides info on all the ranks
+// within the communicator by forwarding the array (by address) to the RAS thread as a RAS_ADD_RANKS
+// notification.
+ncclResult_t ncclRasAddRanks(struct rasRankInit* ranks, int nranks) {
+  struct rasNotification note;
+  note.addRanks.nranks = nranks;
+  note.addRanks.ranks = ranks;
+  note.type = RAS_ADD_RANKS;
+  NCCLCHECK(rasLocalNotify(&note));
+  return ncclSuccess;
+}
+
+// Internal function running on regular NCCL threads -- asynchronously notifies the RAS thread by writing
+// the whole notification to the write end of the notification pipe.  A no-op if RAS is not initialized.
+static ncclResult_t rasLocalNotify(const struct rasNotification* msg) {
+  if (rasInitialized) {
+    // Writers are serialized so that notifications from multiple user threads cannot get interleaved
+    // (possibly not strictly required, but harmless).
+    std::lock_guard<std::mutex> guard(rasNotificationMutex);
+    const char* cursor = (const char*)msg;
+    size_t remaining = sizeof(*msg);
+    while (remaining > 0) {
+      ssize_t written;
+      SYSCHECK(written = write(rasNotificationPipe[1], cursor, remaining), "write");
+      cursor += written;
+      remaining -= written;
+    }
+  }
+  return ncclSuccess;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////
+// Functions related to the handling of local notifications from NCCL threads. //
+/////////////////////////////////////////////////////////////////////////////////
+
+// Handles asynchronous local notifications arriving from regular NCCL threads: reads one complete
+// rasNotification from the read end of the notification pipe and dispatches it by type.
+// Returns ncclSystemError on EOF, ncclInternalError on an unknown notification type.
+static ncclResult_t rasLocalHandle() {
+  struct rasNotification note;
+  char* dst = (char*)&note;
+  size_t pending = sizeof(note);
+
+  while (pending > 0) {
+    ssize_t nread;
+    SYSCHECK(nread = read(rasNotificationPipe[0], dst, pending), "read");
+    if (nread == 0) // EOF
+      return ncclSystemError;
+    dst += nread;
+    pending -= nread;
+  }
+
+  switch (note.type) {
+    case RAS_ADD_RANKS:
+      NCCLCHECK(rasLocalHandleAddRanks(note.addRanks.ranks, note.addRanks.nranks));
+      break;
+    case RAS_TERMINATE:
+      rasLocalHandleTerminate();
+      break;
+    default:
+      WARN("RAS received unknown notification type %d", note.type);
+      return ncclInternalError;
+  }
+
+  return ncclSuccess;
+}
+
+// Handles local RAS_TERMINATE notification (sent by ncclRasCommFini when the RAS reference count drops
+// to zero).  Currently it only logs the request; the RAS thread keeps running.
+static void rasLocalHandleTerminate() {
+  INFO(NCCL_RAS, "RAS handling local termination request");
+  // For now we don't do anything.
+}
+
+
+////////////////////////////////////////////////
+// Generic functions related to RAS messages. //
+////////////////////////////////////////////////
+
+// Allocates a RAS message of the desired length for sending.
+// Behind the scenes allocates encapsulating rasMsgMeta structure, which includes local metadata stored in front
+// of the message.
+// Must use rasMsgFree to free.
+// msgLen is the length of the message itself (the metadata in front of it is accounted for here).
+// On success, *msg points at the message embedded in the new allocation; on failure, *msg is left untouched
+// and the error from ncclCalloc is returned.
+ncclResult_t rasMsgAlloc(struct rasMsg** msg, size_t msgLen) {
+  struct rasMsgMeta* meta = nullptr;
+  // Allocate as a byte array: the metadata up to the msg member, followed by msgLen bytes of message.
+  NCCLCHECK(ncclCalloc((char**)&meta, offsetof(struct rasMsgMeta, msg) + msgLen));
+  *msg = &meta->msg;
+  // coverity[leaked_storage:FALSE] => rasMsgFree is used to free it
+  return ncclSuccess;
+}
+
+// Releases a message obtained from rasMsgAlloc (i.e., a message meant for sending; do not use it for
+// received messages).  Passing a null pointer is allowed and does nothing.
+void rasMsgFree(struct rasMsg* msg) {
+  if (msg == nullptr)
+    return;
+  // Step back from the embedded message to the start of the enclosing rasMsgMeta allocation.
+  char* base = (char*)msg - offsetof(struct rasMsgMeta, msg);
+  free((struct rasMsgMeta*)base);
+}
+
+// Enqueues a message for sending down a RAS connection.
+// msg must have been allocated with rasMsgAlloc (its rasMsgMeta header is located by pointer arithmetic);
+// msgLen is the length of the message; front selects the head (true) or the tail (false) of the send queue.
+// If the connection has a socket that can currently send this message, POLLOUT is requested for it;
+// otherwise the message just stays queued and the situation is logged.
+void rasConnEnqueueMsg(struct rasConnection* conn, struct rasMsg* msg, size_t msgLen, bool front) {
+  // Get to the metadata of this message.
+  struct rasMsgMeta* meta = (struct rasMsgMeta*)((char*)msg - offsetof(struct rasMsgMeta, msg));
+  bool ready = false;
+
+  meta->enqueueTime = clockNano();
+  meta->offset = 0; // Nothing sent yet (rasConnSendMsg advances it).
+  meta->length = msgLen;
+
+  if (front)
+    ncclIntruQueueEnqueueFront(&conn->sendQ, meta);
+  else
+    ncclIntruQueueEnqueue(&conn->sendQ, meta);
+
+  // A sockIdx of -1 means that the connection currently has no socket associated with it.
+  if (conn->sockIdx != -1) {
+    struct rasSocket* sock = rasSockets+conn->sockIdx;
+    // During the handshake only the connInit message may be sent (see also rasConnSendMsg).
+    if (sock->status == RAS_SOCK_READY || (sock->status == RAS_SOCK_HANDSHAKE && msg->type == RAS_MSG_CONNINIT)) {
+      rasPfds[sock->pfd].events |= POLLOUT;
+      ready = true;
+    }
+  }
+  if (!ready) {
+    // It's not a bug, unless it's for things like keep-alive messages...
+    INFO(NCCL_RAS, "RAS enqueued message type %d on a non-ready connection with %s "
+         "(experiencingDelays %d, startRetryTime %.2fs, socket status %d)",
+         msg->type, ncclSocketToString(&conn->addr, rasLine),
+         conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0),
+         (conn->sockIdx == -1 ? -1 : rasSockets[conn->sockIdx].status));
+  }
+}
+
+// Attempts to send the queued RAS messages to another RAS thread.
+// Messages are taken from the head of conn->sendQ; each one is sent as its length followed by its body,
+// and is dequeued and freed once sent completely.
+// *closed is set to non-zero if ncclSocketProgress reports the socket as closed (the function then returns
+// ncclSuccess right away).  *allSent is set to true if the queue was drained and to false if sending stopped
+// early; note that it is NOT written on the early returns (closed socket or error).
+// The connection is expected to have a socket (conn->sockIdx is used as an index without being checked).
+ncclResult_t rasConnSendMsg(struct rasConnection* conn, int* closed, bool* allSent) {
+  struct ncclSocket* sock = &rasSockets[conn->sockIdx].sock;
+  struct rasMsgMeta* meta;
+  *closed = 0;
+  while ((meta = ncclIntruQueueHead(&conn->sendQ)) != nullptr) {
+    if (rasSockets[conn->sockIdx].status == RAS_SOCK_HANDSHAKE && meta->msg.type != RAS_MSG_CONNINIT) {
+      // We don't send anything beyond the handshake at this point.
+      // Resetting meta makes the code below report "all sent", even though the message stays in the queue.
+      meta = nullptr;
+      break;
+    }
+    // meta->offset counts the bytes sent so far for this message, including the length prefix.
+    if (meta->offset < sizeof(meta->length)) {
+      // Send the length of the message.
+      NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, &meta->length, sizeof(meta->length), &meta->offset, closed));
+      if (*closed)
+        return ncclSuccess;
+      if (meta->offset < sizeof(meta->length))
+        break;
+    }
+    // Send the body of the message.
+    // The base pointer is moved back by the size of the length prefix to compensate for meta->offset
+    // already including it (presumably ncclSocketProgress continues at base + offset -- TODO confirm).
+    NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, ((char*)&meta->msg)-sizeof(meta->length),
+                                 meta->length+sizeof(meta->length), &meta->offset, closed));
+    if (*closed)
+      return ncclSuccess;
+    if (meta->offset < meta->length+sizeof(meta->length))
+      break;
+    ncclIntruQueueDequeue(&conn->sendQ);
+    free(meta);
+  }
+
+  // meta is null here if the loop ran out of messages (or hit the handshake restriction above).
+  *allSent = !meta;
+
+  return ncclSuccess;
+}
+
+// Attempts to receive a message through a RAS socket.
+// The receive state (recvOffset, recvLength, recvMsg) is kept in the rasSocket, so the function can be
+// called repeatedly until a whole message has arrived.  sock->recvOffset counts the bytes received so far
+// for the current message, including the length prefix.
+// *closed is set to non-zero if ncclSocketProgress reports the socket as closed.  *msg is written only once
+// a complete message has been received (ownership of the buffer passes to the caller); otherwise it is left
+// untouched, so the caller should initialize it.
+ncclResult_t rasMsgRecv(struct rasSocket* sock, struct rasMsg** msg, int* closed) {
+  *closed = 0;
+  if (sock->recvOffset < sizeof(sock->recvLength)) {
+    // Receive the length of the message.
+    NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &sock->sock, &sock->recvLength, sizeof(sock->recvLength),
+                                 &sock->recvOffset, closed));
+    if (*closed || sock->recvOffset < sizeof(sock->recvLength))
+      return ncclSuccess;
+    // NOTE(review): recvLength comes straight from the remote side and is used as the allocation size
+    // without any sanity check here -- confirm whether it is validated elsewhere.
+    NCCLCHECK(ncclCalloc((char**)&sock->recvMsg, sock->recvLength));
+  }
+  // Receive the body of the message.
+  // The base pointer is moved back by the size of the length prefix to compensate for recvOffset
+  // already including it (presumably ncclSocketProgress continues at base + offset -- TODO confirm).
+  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &sock->sock, ((char*)sock->recvMsg)-sizeof(sock->recvLength),
+                               sock->recvLength+sizeof(sock->recvLength), &sock->recvOffset, closed));
+  if (*closed || sock->recvOffset < sock->recvLength+sizeof(sock->recvLength))
+    return ncclSuccess;
+
+  // Hand the complete message over to the caller and reset the state for the next message.
+  *msg = sock->recvMsg;
+  sock->recvMsg = nullptr;
+  sock->recvOffset = sock->recvLength = 0;
+
+  return ncclSuccess;
+}
+
+
+//////////////////////////////////////////////////////////////////
+// Functions related to the handling of specific message types. //
+//////////////////////////////////////////////////////////////////
+
+// Invoked from the main RAS thread to dispatch an incoming message to the handler matching its type.
+// Returns the handler's error, or ncclInternalError if the message type is not recognized.
+ncclResult_t rasMsgHandle(struct rasMsg* msg, struct rasSocket* sock) {
+  switch (msg->type) {
+    case RAS_MSG_CONNINIT:
+      NCCLCHECK(rasMsgHandleConnInit(msg, sock));
+      break;
+    case RAS_MSG_CONNINITACK:
+      NCCLCHECK(rasMsgHandleConnInitAck(msg, sock));
+      break;
+    case RAS_MSG_KEEPALIVE:
+      NCCLCHECK(rasMsgHandleKeepAlive(msg, sock));
+      break;
+    case RAS_MSG_PEERSUPDATE:
+      NCCLCHECK(rasMsgHandlePeersUpdate(msg, sock));
+      break;
+    case RAS_MSG_COLLREQ:
+      NCCLCHECK(rasMsgHandleCollReq(msg, sock));
+      break;
+    case RAS_MSG_COLLRESP:
+      NCCLCHECK(rasMsgHandleCollResp(msg, sock));
+      break;
+    default:
+      WARN("RAS received unknown message type (%d) from %s", msg->type, ncclSocketToString(&sock->sock.addr, rasLine));
+      return ncclInternalError;
+  }
+
+  return ncclSuccess;
+}
+
+// Handles the first message sent over a RAS socket as part of the handshake.
+// In order:
+// - rejects (NACK + socket termination) peers with a different NCCL version (returning ncclInvalidUsage)
+//   and peers that are considered dead (returning ncclSuccess);
+// - if a connection to the peer's listening address already exists and has a socket, keeps either the old
+//   or the new socket, depending on how the two addresses compare;
+// - otherwise binds the socket to the (existing or newly created) connection, marks it ready, queues
+//   a connInitAck and, if the peers hashes differ, a peers update.
+static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSocket* sock) {
+  ncclResult_t ret = ncclSuccess;
+  struct rasConnection* conn = nullptr;
+  int connIdx, peerIdx;
+  struct rasMsg* newMsg = nullptr;
+  int newMsgLen;
+  char line[SOCKET_NAME_MAXLEN+1]; // Second buffer, as rasLine is already in use within the same INFO call.
+
+  INFO(NCCL_RAS, "RAS handling connInit from %s (version %d, listeningAddr %s, peersHash 0x%lx, deadPeersHash 0x%lx)",
+       ncclSocketToString(&sock->sock.addr, rasLine), msg->connInit.ncclVersion,
+       ncclSocketToString(&msg->connInit.listeningAddr, line), msg->connInit.peersHash, msg->connInit.deadPeersHash);
+
+  if (msg->connInit.ncclVersion != NCCL_VERSION_CODE) {
+    // Close any such sockets immediately!  This is basically unrecoverable...
+    WARN("NCCL version mismatch with remote peer %s (local: %d, remote %d)",
+         ncclSocketToString(&sock->sock.addr, rasLine), NCCL_VERSION_CODE, msg->connInit.ncclVersion);
+    // The result of the NACK is deliberately ignored: the socket is terminated either way.
+    rasNetSendNack(sock);
+    rasSocketTerminate(sock, /*finalize*/true);
+    ret = ncclInvalidUsage;
+    goto exit;
+  }
+
+  if (rasPeerIsDead(&msg->connInit.listeningAddr)) {
+    // A peer long declared dead is suddenly alive again?!
+    INFO(NCCL_RAS, "RAS connection from peer %s that is considered dead!",
+         ncclSocketToString(&msg->connInit.listeningAddr, rasLine));
+    rasNetSendNack(sock);
+    rasSocketTerminate(sock, /*finalize*/true);
+    goto exit;
+  }
+
+  // Check for any existing connection with that RAS thread (could happen due to a network issue, or possibly a race).
+  connIdx = rasConnFind(&msg->connInit.listeningAddr);
+  if (connIdx != -1) {
+    conn = rasConns+connIdx;
+
+    INFO(NCCL_RAS,
+         "RAS found a matching existing connection (sendQ %sempty, experiencingDelays %d, startRetryTime %.2fs)",
+         (ncclIntruQueueEmpty(&conn->sendQ) ? "" : "not "),
+         conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0));
+
+    if (conn->sockIdx != -1) {
+      struct rasSocket* connSock = rasSockets+conn->sockIdx;
+      INFO(NCCL_RAS, "RAS found an alternative existing socket (status %d, createTime %.2fs)",
+           connSock->status, (clockNano()-connSock->createTime)/1e9);
+      // In general we prefer to keep the newer connection, but "newer" can be a relative term: we may have
+      // a race where both sides attempt to establish a connection at roughly the same time, so the other side's
+      // incoming connection ends up looking newer than the locally-initiated one -- for *both* of them.
+      // If each side closed the "old" one, both would end up being closed.
+      // As we normally try to initiate connections from the side with a lower address (precisely to avoid such
+      // situations), we'll follow the same logic here: the "lower" side will reject the new connection (as it
+      // came from the "wrong" side), whereas the "higher" side will keep the new one (as it came from the correct
+      // side) and terminate the old one (that it presumably just opened).
+      if (ncclSocketsCompare(&rasNetListeningSocket.addr, &conn->addr) < 0) {
+        INFO(NCCL_RAS, "RAS terminating the new socket");
+        rasSocketTerminate(sock, /*finalize*/true);
+        goto exit;
+      } else {
+        INFO(NCCL_RAS, "RAS keeping the new socket and terminating the existing one");
+        rasSocketTerminate(connSock);
+      }
+    }
+  }
+  if (!conn) {
+    // No connection with that peer yet: create one, keyed by the peer's listening address.
+    NCCLCHECK(getNewConnEntry(&conn));
+    memcpy(&conn->addr, &msg->connInit.listeningAddr, sizeof(conn->addr));
+    connIdx = conn - rasConns;
+  }
+
+  sock->status = RAS_SOCK_READY;
+  // rasConnResume will reset any experiencingDelays, startRetryTime, etc.
+
+  // Cross-link the connection and the socket, and replace the socket's address with the peer's listening
+  // address.
+  conn->sockIdx = sock-rasSockets;
+  sock->connIdx = connIdx;
+  memcpy(&sock->sock.addr, &msg->connInit.listeningAddr, sizeof(sock->sock.addr));
+
+  // Make sure that the connection is part of the right links forming the RAS network.  At this point we only
+  // update the expected (non-external) connections; external ones will be added during keep-alive handling.
+  peerIdx = rasPeerFind(&conn->addr);
+  // Note: it's possible for peerIdx to be -1 at this point if, due to races, the connInit arrives before
+  // the peers update.
+  if (peerIdx != -1) {
+    (void)rasLinkUpdateConn(&rasNextLink, connIdx, peerIdx);
+    (void)rasLinkUpdateConn(&rasPrevLink, connIdx, peerIdx);
+  }
+
+  // Send a confirmation to the server that requested the connection (so that the resilience code can mark
+  // the connection as live).
+  newMsgLen = rasMsgLength(RAS_MSG_CONNINITACK);
+  NCCLCHECK(rasMsgAlloc(&newMsg, newMsgLen));
+  newMsg->type = RAS_MSG_CONNINITACK;
+  newMsg->connInitAck.nack = 0;
+  // Enqueued at the front so that the ack goes out ahead of anything already queued on the connection.
+  rasConnEnqueueMsg(conn, newMsg, newMsgLen, /*front*/true);
+
+  conn->lastRecvPeersHash = msg->connInit.peersHash;
+  conn->lastRecvDeadPeersHash = msg->connInit.deadPeersHash;
+
+  if (msg->connInit.peersHash != rasPeersHash || msg->connInit.deadPeersHash != rasDeadPeersHash) {
+    // Send my rasPeers and request the same in return.
+    INFO(NCCL_RAS, "RAS connInit hash mismatch (my peersHash 0x%lx, deadPeersHash 0x%lx); sending my (dead) peers",
+         rasPeersHash, rasDeadPeersHash);
+    NCCLCHECK(rasConnSendPeersUpdate(conn, rasPeers, nRasPeers));
+  }
+exit:
+  return ret;
+}
+
+// Handles the second message sent over a RAS socket as part of the handshake.
+// A positive acknowledgment marks the socket as ready.  A negative one (nack) makes us disconnect from
+// the peer and declare it dead.  Always returns ncclSuccess.
+static ncclResult_t rasMsgHandleConnInitAck(const struct rasMsg* msg, struct rasSocket* sock) {
+  INFO(NCCL_RAS, "RAS handling connInitAck from %s (nack %d)",
+       ncclSocketToString(&sock->sock.addr, rasLine), msg->connInitAck.nack);
+
+  if (msg->connInitAck.nack) {
+    // The remote peer doesn't want to talk to us.  The easiest way to prevent it is by declaring it dead.
+    // We make a copy of the address because rasConnDisconnect will terminate the rasSocket.
+    union ncclSocketAddress addr;
+    memcpy(&addr, &sock->sock.addr, sizeof(addr));
+    rasConnDisconnect(&addr);
+    // Best effort: the result of declaring the peer dead is ignored.
+    (void)rasPeerDeclareDead(&addr);
+
+    return ncclSuccess;
+  }
+
+  sock->status = RAS_SOCK_READY;
+  // rasConnResume will reset any experiencingDelays, startRetryTime, etc.
+
+  return ncclSuccess;
+}
+
+// Handles the deadPeer broadcast.  If the peer was not yet known to be dead, disconnects from it and
+// declares it dead, setting *pDone to false; if it was already known, sets *pDone to true (there is no
+// point in re-broadcasting what's already known).
+void rasMsgHandleBCDeadPeer(const struct rasCollRequest* req, bool* pDone) {
+  const union ncclSocketAddress* deadAddr = &req->deadPeer.addr;
+
+  INFO(NCCL_RAS, "RAS handling deadPeer (addr %s)", ncclSocketToString(&req->deadPeer.addr, rasLine));
+
+  if (rasPeerIsDead(deadAddr)) {
+    INFO(NCCL_RAS, "RAS already knew it was dead");
+    *pDone = true;
+    return;
+  }
+
+  rasConnDisconnect(deadAddr);
+  (void)rasPeerDeclareDead(deadAddr);
+  *pDone = false;
+}
+
+// Attempts to immediately send a fatal NACK connInitAck response to a socket.  A bit of a hack (as it doesn't
+// follow our usual message queuing and polling convention) but, since this can be invoked only for newly opened
+// connections, and the message is tiny, it should be OK.  We can't use the regular path because the socket is
+// about to be terminated.
+// The message is sent as its length followed by its body, each with a single ncclSocketProgress call;
+// nothing is retried if the socket cannot take all the data right away.  Returns an error only if
+// ncclSocketProgress fails.
+static ncclResult_t rasNetSendNack(struct rasSocket* sock) {
+  struct rasMsg msg;
+  int length = rasMsgLength(RAS_MSG_CONNINITACK);
+  int closed = 0;
+  int offset;
+
+  INFO(NCCL_RAS, "RAS sending NACK to %s", ncclSocketToString(&sock->sock.addr, rasLine));
+
+  // Zero the whole message first: only two members are assigned below, so any other byte within the first
+  // "length" bytes (e.g., padding between them) would otherwise go out as uninitialized stack contents.
+  memset(&msg, 0, sizeof(msg));
+  msg.type = RAS_MSG_CONNINITACK;
+  msg.connInitAck.nack = 1;
+  offset = 0;
+  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &sock->sock, &length, sizeof(length), &offset, &closed));
+  // Don't bother with the body if we could not even send the complete length.
+  if (closed || offset < (int)sizeof(length))
+    return ncclSuccess;
+  offset = 0;
+  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &sock->sock, &msg, length, &offset, &closed));
+  // We are closing this socket anyway -- it doesn't matter to us if we succeeded or not.
+
+  return ncclSuccess;
+}
+
+
+/////////////////////////////////////////////////////////////////
+// Functions related to the main event loop of the RAS thread. //
+/////////////////////////////////////////////////////////////////
+
+// Main function of the RAS thread.
+// Registers the notification pipe, the RAS network listening socket and the client listening socket with
+// poll(), then loops forever: waits for events (for at most about a second), dispatches them to the
+// matching handler and runs the timeout handlers.  Errors from the individual handlers are ignored.
+// The function returns (nullptr) only if the initial setup fails, in which case it closes the descriptors
+// created by ncclRasCommInit and clears rasInitialized.
+static void* rasThreadMain(void*) {
+  ncclResult_t ret = ncclSuccess; // Unused.
+  int pfd;
+  int rasNetListeningSocketFd;
+
+  INFO(NCCL_RAS, "RAS thread started");
+
+  // Initialize the global pollfd with the file descriptors we already have (the pipe and the listening socket).
+  NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail);
+  rasPfds[pfd].fd = rasNotificationPipe[0];
+  rasPfds[pfd].events = POLLIN;
+
+  NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail);
+  NCCLCHECKGOTO(ncclSocketGetFd(&rasNetListeningSocket, &rasNetListeningSocketFd), ret, fail);
+  rasPfds[pfd].fd = rasNetListeningSocketFd;
+  rasPfds[pfd].events = POLLIN;
+
+  NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail);
+  rasPfds[pfd].fd = rasClientListeningSocket;
+  rasPfds[pfd].events = POLLIN;
+
+  // Main event loop of the RAS thread.
+  for (int64_t nextWakeup=0;;) {
+    int timeout, nEvents;
+    int64_t now = clockNano();
+    // Convert the requested wakeup time (in clock units) into a poll() timeout (in milliseconds).
+    if (nextWakeup > 0) {
+      // The "1" below helps avoid round-downs and especially zeroes.
+      if (nextWakeup > now)
+        timeout = (nextWakeup - now) / (CLOCK_UNITS_PER_SEC / 1000) + 1;
+      else
+        timeout = 1;
+    } else {
+      timeout = 1000; // 1 second.
+    }
+
+    nEvents = poll(rasPfds, nRasPfds, timeout);
+
+    // By default wake up again in a second; the timeout handlers below may adjust it.
+    nextWakeup = clockNano()+CLOCK_UNITS_PER_SEC;
+    if (nEvents == -1 && errno != EINTR)
+      INFO(NCCL_RAS, "RAS continuing in spite of an unexpected error from poll: %s", strerror(errno));
+
+    // Handle any poll-related events.
+    for (int pollIdx = 0; pollIdx < nRasPfds && nEvents > 0; pollIdx++) {
+      if (rasPfds[pollIdx].revents) {
+        nEvents--;
+        if (rasPfds[pollIdx].fd == rasNotificationPipe[0]) {
+          (void)rasLocalHandle();
+        } else if (rasPfds[pollIdx].fd == rasNetListeningSocketFd) {
+          (void)rasNetAcceptNewSocket();
+        } else if (rasPfds[pollIdx].fd == rasClientListeningSocket) {
+          (void)rasClientAcceptNewSocket();
+        } else {
+          // Check if it's one of the RAS sockets.
+          int sockIdx;
+          for (sockIdx = 0; sockIdx < nRasSockets; sockIdx++) {
+            struct rasSocket* sock = rasSockets+sockIdx;
+            if (sock->status != RAS_SOCK_CLOSED && rasPfds[pollIdx].fd == sock->sock.fd) {
+              rasSockEventLoop(sockIdx, pollIdx);
+              break;
+            }
+          } // for (sockIdx)
+
+          if (sockIdx == nRasSockets) {
+            // Try a client socket instead.
+            for (int clientIdx = 0; clientIdx < nRasClients; clientIdx++) {
+              struct rasClient* client = rasClients+clientIdx;
+              if (client->status != RAS_CLIENT_CLOSED && rasPfds[pollIdx].fd == client->sock) {
+                rasClientEventLoop(clientIdx, pollIdx);
+                break;
+              }
+            } // for (clientIdx)
+          } // if (sockIdx == nRasSockets)
+        } // dynamic fds
+      } // if (revents)
+    } // for (pollIdx)
+
+    now = clockNano();
+
+    rasSocksHandleTimeouts(now, &nextWakeup);
+
+    rasConnsHandleTimeouts(now, &nextWakeup);
+
+    rasNetHandleTimeouts(now, &nextWakeup);
+
+    rasCollsHandleTimeouts(now, &nextWakeup);
+  } // for (;;)
+
+fail:
+  WARN("fatal error - RAS thread terminating");
+  std::lock_guard<std::mutex> lock(rasInitMutex);
+  // Restore the "not open" value (-1) after closing, so that nobody ends up closing or writing to these
+  // descriptor numbers again once the system has handed them out to somebody else.
+  (void)close(rasNotificationPipe[1]);
+  rasNotificationPipe[1] = -1;
+  (void)close(rasNotificationPipe[0]);
+  rasNotificationPipe[0] = -1;
+  (void)close(rasClientListeningSocket);
+  (void)ncclSocketClose(&rasNetListeningSocket);
+  rasInitialized = false;
+  return nullptr;
+}
+
+// Finds the first unused entry (fd == -1) in the rasPfds array, enlarging the array by RAS_INCREMENT
+// entries if there is none.  The entry is zeroed, its fd is set to -1 again, and its index is stored
+// in *index.
+ncclResult_t rasGetNewPollEntry(int* index) {
+  int slot = 0;
+  while (slot < nRasPfds && rasPfds[slot].fd != -1)
+    slot++;
+
+  if (slot == nRasPfds) {
+    // No free entry: grow the array and mark all the new entries as unused.
+    NCCLCHECK(ncclRealloc(&rasPfds, nRasPfds, nRasPfds+RAS_INCREMENT));
+    nRasPfds += RAS_INCREMENT;
+    for (int fresh = slot; fresh < nRasPfds; fresh++)
+      rasPfds[fresh].fd = -1;
+  }
+
+  struct pollfd* entry = &rasPfds[slot];
+  memset(entry, '\0', sizeof(*entry));
+  entry->fd = -1;
+
+  *index = slot;
+  return ncclSuccess;
+}
@@ -0,0 +1,512 @@
+/*************************************************************************
+ * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_RAS_INTERNAL_H_
+#define NCCL_RAS_INTERNAL_H_
+
+#define NCCL_RAS_CLIENT_PORT 28028
+#define NCCL_RAS_CLIENT_PROTOCOL 2
+
+#define RAS_COLLECTIVE_LEG_TIMEOUT_SEC 5
+#define RAS_COLLECTIVE_EXTRA_TIMEOUT_SEC RAS_COLLECTIVE_LEG_TIMEOUT_SEC
+
+// End of the client section; everything below is meant for the NCCL threads only.
+#ifndef NCCL_RAS_CLIENT
+
+#include <mutex>
+
+#include "nccl.h"
+#include "ras.h"
+#include "socket.h"
+#include "utils.h"
+
+// Type of a RAS network or client message.  Selects the member of the union in struct rasMsg.
+typedef enum {
+  RAS_MSG_CONNINIT = 1, // First handshake message, sent by the connecting side (rasMsg::connInit).
+  RAS_MSG_CONNINITACK = 2, // Response from the accepting side to the above (rasMsg::connInitAck).
+  RAS_MSG_KEEPALIVE = 3, // Keep-alive message (rasMsg::keepAlive).
+  RAS_MSG_PEERSUPDATE = 4, // Carries the arrays of peers and dead peers (rasMsg::peersUpdate).
+  RAS_MSG_COLLREQ = 5, // Collective request (rasMsg::collReq).
+  RAS_MSG_COLLRESP = 6, // Collective response (rasMsg::collResp).
+} rasMsgType;
+
+// Type of a RAS network collective message.
+typedef enum {
+  RAS_MSG_NONE = 0, // No collective; also the default collType argument of rasMsgLength.
+  RAS_BC_DEADPEER = 1, // Broadcasts the address of a peer that is to be considered dead.
+  // Broadcast operations above this line; collective operations below (1000 is the demarcation line).
+  RAS_COLL_CONNS = 1001, // Collect data about all RAS connections.
+  RAS_COLL_COMMS = 1002, // Collect data about all communicators.
+} rasCollectiveType;
+
+// Payload of a collective request message (RAS_MSG_COLLREQ).
+// Only the part up to and including the union member matching "type" is meaningful (see rasCollDataLength).
+struct rasCollRequest {
+  union ncclSocketAddress rootAddr; // Presumably the address of the peer that started the collective -- TODO confirm.
+  uint64_t rootId; // Presumably distinguishes collectives started by the same root -- TODO confirm.
+
+  int64_t timeout; // NOTE(review): units and reference point are not visible from here -- confirm.
+  rasCollectiveType type; // Selects the member of the union below.
+  union {
+    struct {
+      union ncclSocketAddress addr; // Address of the peer being declared dead.
+    } deadPeer; // For RAS_BC_DEADPEER.
+    struct {
+    } conns; // For RAS_COLL_CONNS (no extra request data).
+    struct {
+    } comms; // For RAS_COLL_COMMS (no extra request data).
+  };
+};
+
+// Payload of a collective response message (RAS_MSG_COLLRESP).
+struct rasCollResponse {
+  union ncclSocketAddress rootAddr; // Presumably copied from the matching request -- TODO confirm.
+  uint64_t rootId; // Presumably copied from the matching request -- TODO confirm.
+
+  int nLegTimeouts; // If >0, indicates incomplete data.
+  int nPeers; // Presumably the number of elements in the peers array below -- TODO confirm.
+  int nData; // Size of data in bytes.
+  union ncclSocketAddress peers[0]; // Variable length.
+  // The peers array is followed by:
+  //alignas(int64_t) char data[0]; // Variable length, collective-dependent.
+};
+
+// Describes a peer NCCL process.  Every RAS thread keeps an (identical) array of them, one entry for each
+// NCCL process.
+struct rasPeerInfo {
+  union ncclSocketAddress addr; // Socket address identifying the peer.
+  pid_t pid; // Process id of the peer.
+  uint64_t cudaDevs; // Bitmask.  Conveniently, NCCL_MAX_LOCAL_RANKS == 64.
+  uint64_t nvmlDevs; // Same, but not affected by CUDA_VISIBLE_DEVICES.
+};
+
+// Describes a RAS message.  Every message is preceded by a (32-bit) message length.  All data in the host
+// byte order.  Depending on the message type, the length of the message will vary (see rasMsgLength).
+struct rasMsg {
+  rasMsgType type; // Selects the member of the union below.
+  union {
+    struct {
+      int ncclVersion; // Compared by the receiver against its own NCCL_VERSION_CODE.
+      union ncclSocketAddress listeningAddr; // Address of the sender's RAS network listening socket.
+      uint64_t peersHash;
+      uint64_t deadPeersHash;
+    } connInit; // Sent by the connecting side as the first message.
+    struct {
+      int nack; // If non-0, we should stop trying to reconnect.
+    } connInitAck; // Response from the accepting side to the above.
+    struct {
+      uint64_t peersHash;
+      uint64_t deadPeersHash;
+      int linkMask; // What links at the destination peer should the connection be part of
+                    // (bit 0: nextLink; bit 1: prevLink).
+      struct timespec realTime; // Wallclock time at the source, for statistical purposes (in principle there's
+                                // no guarantee that the nodes have synchronized clocks so we can't really rely
+                                // on it for anything important).
+      int nack; // If non-0, it means that this message is a response to an unexpected keepAlive message.
+    } keepAlive;
+    struct {
+      uint64_t peersHash;
+      uint64_t deadPeersHash;
+      int nPeers;
+      int nDeadPeers;
+      struct rasPeerInfo peers[0]; // Variable length.
+      // The peers array is followed by the following:
+      //union ncclSocketAddress deadPeers[0]; // Variable length.
+    } peersUpdate;
+    struct {
+      int protocol; // Protocol version, sent to the client.
+    } clientInit;
+    struct {
+      int nData; // Presumably the size of data in bytes -- TODO confirm.
+      char data[0]; // Variable length.
+    } clientDump;
+    struct rasCollRequest collReq; // Variable length.
+    struct rasCollResponse collResp; // Variable length.
+  };
+};
+
+// Returns the length of a collective request (struct rasCollRequest) of the given type, i.e., the offset
+// of the type-specific member of its union plus the size of that member.  Returns 0 for RAS_MSG_NONE
+// and for unrecognized values.
+static inline size_t rasCollDataLength(rasCollectiveType type) {
+  struct rasCollRequest* req; // Never dereferenced; only used as a sizeof operand.
+  size_t len = 0;
+  switch (type) {
+    case RAS_BC_DEADPEER:
+      len = offsetof(struct rasCollRequest, deadPeer) + sizeof(req->deadPeer);
+      break;
+    case RAS_COLL_CONNS:
+      len = offsetof(struct rasCollRequest, conns) + sizeof(req->conns);
+      break;
+    case RAS_COLL_COMMS:
+      len = offsetof(struct rasCollRequest, comms) + sizeof(req->comms);
+      break;
+    case RAS_MSG_NONE:
+      break;
+  }
+  return len;
+}
+
+// Returns the length of a message of a particular type: the offset of the type-specific member of the
+// rasMsg union plus the size of that member (for collective requests, plus the collType-dependent length
+// given by rasCollDataLength instead).  Variable-length trailing data is not included.  Returns 0 for
+// unrecognized types.
+static inline size_t rasMsgLength(rasMsgType type, rasCollectiveType collType = RAS_MSG_NONE) {
+  struct rasMsg* m; // Never dereferenced; only used as a sizeof operand.
+  size_t len = 0;
+  switch (type) {
+    case RAS_MSG_CONNINIT:
+      len = offsetof(struct rasMsg, connInit) + sizeof(m->connInit);
+      break;
+    case RAS_MSG_CONNINITACK:
+      len = offsetof(struct rasMsg, connInitAck) + sizeof(m->connInitAck);
+      break;
+    case RAS_MSG_KEEPALIVE:
+      len = offsetof(struct rasMsg, keepAlive) + sizeof(m->keepAlive);
+      break;
+    case RAS_MSG_PEERSUPDATE:
+      len = offsetof(struct rasMsg, peersUpdate) + sizeof(m->peersUpdate);
+      break;
+    case RAS_MSG_COLLREQ:
+      len = offsetof(struct rasMsg, collReq) + rasCollDataLength(collType);
+      break;
+    case RAS_MSG_COLLRESP:
+      len = offsetof(struct rasMsg, collResp) + sizeof(m->collResp);
+      break;
+  }
+  return len;
+}
+
+// How much to enlarge any RAS array by if we run out of space.
+#define RAS_INCREMENT 4
+
+// Our clock has nanosecond resolution.  Durations quoted below are nominal; all are scaled by ncclParamRasTimeoutFactor().
+#define CLOCK_UNITS_PER_SEC 1000000000L
+
+// Keep-alive messages are sent no sooner than a second after the last message was sent down a particular connection.
+#define RAS_KEEPALIVE_INTERVAL (1*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// If no message arrives in 5 seconds via a particular connection that uses keep-alive messages, generate a warning
+// and try alternative connections.
+#define RAS_KEEPALIVE_TIMEOUT_WARN (5*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// Abort a socket that uses keep-alive messages if no message arrives in 20 seconds (same value as RAS_STUCK_TIMEOUT).
+// We will try to re-establish communication via that connection (until RAS_PEER_DEAD_TIMEOUT).
+#define RAS_KEEPALIVE_TIMEOUT_ERROR RAS_STUCK_TIMEOUT
+
+// Retry connecting on failing sockets (ECONNREFUSED, etc.) once a second.
+#define RAS_CONNECT_RETRY (1*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// If we can't connect in 5 seconds, we generate a warning and try alternative connections.
+#define RAS_CONNECT_WARN RAS_KEEPALIVE_TIMEOUT_WARN
+
+// Abort a busy socket (one we are trying to send on, or one that was being established) if there's been
+// no sign of progress in 20 seconds.  We will try to re-establish communication (up to RAS_PEER_DEAD_TIMEOUT).
+#define RAS_STUCK_TIMEOUT (20*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// Terminate ad-hoc connections that have not been used in 60 seconds.
+#define RAS_IDLE_TIMEOUT (60*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// If the socket is closed by peer within 5 seconds from the idle timeout, do not attempt to re-establish.
+#define RAS_IDLE_GRACE_PERIOD (5*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// Declare a peer as dead and don't retry communicating with it if we couldn't reach it for 60 seconds.
+#define RAS_PEER_DEAD_TIMEOUT (60*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// Abort a leg of a collective operation if the response takes more than 5 seconds to arrive *and* one of the
+// connections experiences delays.
+#define RAS_COLLECTIVE_LEG_TIMEOUT (RAS_COLLECTIVE_LEG_TIMEOUT_SEC*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// Abort a whole collective operation after at most RAS_COLLECTIVE_LEG_TIMEOUT+RAS_COLLECTIVE_EXTRA_TIMEOUT (10s).
+#define RAS_COLLECTIVE_EXTRA_TIMEOUT (RAS_COLLECTIVE_EXTRA_TIMEOUT_SEC*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// Structure used for tracking the progress of sending a RAS message; the element type of the sendQ queues.
+struct rasMsgMeta {
+  struct rasMsgMeta* next; // Link member that the ncclIntruQueue send queues are instantiated with.
+  int64_t enqueueTime; // Presumably the time the message was queued -- confirm against ras.cc.
+  int offset; // Progress sending the message (including the message size itself (an int, which is sent first)).
+  int length; // Length of the message (*excluding* the message size).
+  struct rasMsg msg; // Variable length, hence placed last.
+};
+
+// Describes an ongoing collective RAS operation (apart from broadcasts, which don't need a response).
+// For every collective operation, each participating RAS thread will create its own.
+struct rasCollective {
+  union ncclSocketAddress rootAddr; // With rootId, presumably identifies the operation's initiator -- confirm.
+  uint64_t rootId;
+
+  rasCollectiveType type;
+
+  int64_t timeout;
+  bool timeoutWarned;
+
+  int64_t startTime; // For timeout calculations.
+  int fromConnIdx; // The connection we received the request from.
+
+  int* fwdConns; // Indices of the connections we forwarded the request to; replaced by -1 as the responses arrive.
+  int nFwdSent; // Count of the above (local process only).
+  int nFwdRecv; // Count of the responses received or timeouts (local process only).
+
+  int nLegTimeouts; // Collective (from this process and the responses we received).
+
+  union ncclSocketAddress* peers; // Collective (from this process and the responses we received).
+  int nPeers; // Presumably the number of elements in peers.
+
+  char* data; // Collective (from this process and the responses we received).
+  int nData; // Presumably the size of data -- confirm the unit against collectives.cc.
+};
+
+// Collective data in RAS_COLL_CONNS responses.
+struct rasCollConns {
+  int64_t travelTimeMin; // The four travelTime* fields share their names with the statistics kept in rasConnection.
+  int64_t travelTimeMax;
+  int64_t travelTimeSum;
+  int64_t travelTimeCount;
+  int nConns;
+  int nNegativeMins; // Presumably the number of elements in negativeMins below.
+  struct negativeMin {
+    union ncclSocketAddress source;
+    union ncclSocketAddress dest;
+    int64_t travelTimeMin;
+  } negativeMins[0]; // Variable length.
+};
+
+// Collective data in RAS_COLL_COMMS responses.
+struct rasCollComms {
+  int nComms; // Presumably the number of elements in comms below.
+  struct comm {
+    uint64_t commHash;
+    int commNRanks;
+    int nRanks; // number of elements in the array below, *not* in the communicator.
+    struct rank {
+      int commRank;
+      int peerIdx; // Index within rasCollective->peers, *not* rasPeers.
+      uint64_t collOpCount;
+      struct { // Bit-fields: 4 bits for each of the two ncclResult_t values, 1 bit for each flag.
+        ncclResult_t initState:4;
+        ncclResult_t asyncError:4;
+        bool finalizeCalled:1;
+        bool destroyFlag:1;
+        bool abortFlag:1;
+      } status;
+      char cudaDev;
+      char nvmlDev;
+    } ranks[0]; // Variable length. Sorted by commRank.  Optimized for 1 GPU/process.
+  } comms[0]; // Variable length. Sorted by commHash.  Elements themselves vary in size (each ends with ranks[]).
+};
+
+// Holds data needed to keep track of a connection belonging to a RAS network link (either the primary one
+// or one of the fallbacks).  This is the element type of the rasLink::conns array.
+struct rasLinkConn {
+  int peerIdx; // Index in the rasPeers array of the peer this entry describes.  Could be -1 (an entry initiated
+               // by an as of yet unknown peer -- should be a temporary situation that resolves via peer updates).
+  int connIdx; // Index in the rasConns array of the connection to the above peer.  Could be -1 (a placeholder
+               // for a connection to be started by the remote peer).
+  bool external; // true if the entry exists only due to an external request (requested by a remote peer, most
+                 // likely as part of fault recovery).  Such connections are kept as fallbacks even if there's a
+                 // valid primary connection, in order to ensure that keep-alive messages are sent.
+};
+
+// Describes a link that forms the backbone of the RAS network.  Links focus on direction (previous/next in
+// case of 1-D topology) rather than a particular destination.  They are implemented using rasConnections, but
+// they are persistent through the life of the RAS threads, whereas rasConnections can be terminated if the RAS
+// network is reconfigured or a peer dies.
+struct rasLink {
+  int direction; // 1 for nextLink, -1 for prevLink.
+
+  // Index 0 is the primary connection; any additional ones are fallbacks (that get created if we are having
+  // problems with the primary connection).  The elements are de-facto ordered (highest-preference ones have
+  // the lowest indices).
+  struct rasLinkConn* conns;
+  int nConns; // Number of entries of conns currently in use.
+  int connsSize; // Array size; could be larger than nConns.
+
+  // Keep track of a timeout in case we did not create a connection during the last peers update (because we expect
+  // the peer on the other side to do so) but that peer failed to initiate.
+  int64_t lastUpdatePeersTime;
+};
+
+// Describes a connection to another peer on the RAS network.  It is meant to be more persistent than a volatile
+// socket (described by the rasSocket structure), which can be affected by transient network issues.
+struct rasConnection {
+  bool inUse; // Presumably marks whether this entry of the rasConns array is occupied -- confirm in rasnet.cc.
+
+  union ncclSocketAddress addr; // Presumably the address of the remote peer (cf. rasConnFind) -- confirm.
+
+  // Index of the current rasSocket in the rasSockets array.  Note that multiple rasSocket entries may point back
+  // to a single entry here, for sockets that are in the process of being terminated and re-established.
+  // We use indices, not pointers, because the arrays holding these structures can be re-alloced at run time.
+  // -1 if there is no such socket.
+  int sockIdx;
+
+  // We keep the rasPeersHash of remote connections to minimize the number of needless exchanges.
+  // There is a subtle difference in the meaning of lastSentPeersHash and lastRecvPeersHash.
+  // lastSentPeersHash stores *our* rasPeersHash from the time we last sent a peers *update* through this connection
+  // (which is different than sending just the hash, like we do in KEEPALIVE, etc.).
+  // lastRecvPeersHash stores the latest known rasPeersHash of the peer (received via KEEPALIVE, etc.).
+  uint64_t lastSentPeersHash;
+  uint64_t lastRecvPeersHash;
+
+  // Same but for rasDeadPeersHash.
+  uint64_t lastSentDeadPeersHash;
+  uint64_t lastRecvDeadPeersHash;
+
+  // Queue of messages to send (rasMsgMeta elements chained through their next member).
+  struct ncclIntruQueue<struct rasMsgMeta, &rasMsgMeta::next> sendQ;
+
+  // Used for keeping track of timeouts that may extend beyond the lifetime of a socket.
+  // The timeout starts when the connection is being created (and is turned off when the initialization is completed
+  // successfully) or when we detect a problem, such as a socket timeout (in the latter case, we may need to
+  // retroactively calculate the start time).
+  // A value of 0 indicates that they are not currently in use.
+  int64_t startRetryTime;
+  int64_t lastRetryTime;
+
+  bool experiencingDelays; // A flag indicating that the connection is currently subject to RAS_KEEPALIVE_TIMEOUT_WARN
+                           // or RAS_CONNECT_WARN timeout.  If set, the warnings have been issued and the fallbacks
+                           // have been initiated if needed.
+  bool linkFlag; // Used within rasNet* calls to mark whether this connection was already handled when iterating over
+                 // multiple links (since a connection can belong to more than one link).
+  // The below four fields are for statistical purposes only.
+  int64_t travelTimeMin;
+  int64_t travelTimeMax;
+  int64_t travelTimeSum;
+  int64_t travelTimeCount;
+};
+
+// Status of a RAS socket (the type of rasSocket::status).
+typedef enum {
+  RAS_SOCK_CLOSED = 0,
+  RAS_SOCK_CONNECTING = 1,
+  RAS_SOCK_HANDSHAKE = 2,
+  RAS_SOCK_READY = 3,
+  RAS_SOCK_TERMINATING = 4
+} rasSocketStatus;
+
+// Describes a socket implementing communication between two peers (an element of the rasSockets array).
+struct rasSocket {
+  struct ncclSocket sock;
+
+  rasSocketStatus status;
+
+  int pfd; // Index in the rasPfds array.
+
+  // Index of the corresponding entry in the rasConns array.
+  // We use indices, not pointers, because the arrays holding these structures can be re-alloced at run time.
+  // -1 if there is no connection (normal condition on the accept side before the connInit message).
+  int connIdx;
+
+  int64_t createTime;
+  int64_t lastSendTime;
+  int64_t lastRecvTime;
+
+  // Data on the message currently being received.
+  int recvOffset;
+  int recvLength;
+  struct rasMsg* recvMsg;
+};
+
+// Status of a RAS client (the type of rasClient::status).
+typedef enum {
+  RAS_CLIENT_CLOSED = 0,
+  RAS_CLIENT_CONNECTED = 1,
+  RAS_CLIENT_INIT = 2,
+  RAS_CLIENT_CONNS = 3,
+  RAS_CLIENT_COMMS = 4,
+  RAS_CLIENT_FINISHED = 99
+} rasClientStatus;
+
+// Describes a RAS client.
+struct rasClient {
+  int sock; // A plain int rather than a struct ncclSocket; presumably a raw socket descriptor -- confirm.
+
+  rasClientStatus status;
+
+  int pfd; // Index in the rasPfds array.
+
+  char recvBuffer[1024];
+  int recvOffset;
+
+  // Queue of messages to send (rasMsgMeta elements chained through their next member).
+  struct ncclIntruQueue<struct rasMsgMeta, &rasMsgMeta::next> sendQ;
+
+  int verbose;
+  int64_t timeout;
+
+  // State stored during asynchronous operations such as collectives.
+  int collIdx; // Index to the ongoing rasCollective.
+};
+
+
+// Globals and functions provided by ras.cc.
+extern struct pollfd* rasPfds;
+extern struct ncclSocket rasNetListeningSocket;
+extern std::mutex ncclCommsMutex;
+extern struct ncclComm** ncclComms;
+extern int nNcclComms;
+extern  bool ncclCommsSorted;
+extern char rasLine[SOCKET_NAME_MAXLEN+1];
+
+int64_t ncclParamRasTimeoutFactor();
+ncclResult_t rasMsgAlloc(struct rasMsg** msg, size_t msgLen);
+void rasMsgFree(struct rasMsg* msg);
+void rasConnEnqueueMsg(struct rasConnection* conn, struct rasMsg* msg, size_t msgLen, bool front = false);
+ncclResult_t rasConnSendMsg(struct rasConnection* conn, int* closed, bool* allSent);
+ncclResult_t rasMsgRecv(struct rasSocket* sock, struct rasMsg** msg, int* closed);
+ncclResult_t rasMsgHandle(struct rasMsg* msg, struct rasSocket* sock);
+void rasMsgHandleBCDeadPeer(const struct rasCollRequest* req, bool* pDone);
+ncclResult_t rasGetNewPollEntry(int* index);
+
+
+// Globals and functions provided by rasnet.cc.
+extern struct rasLink rasNextLink, rasPrevLink;
+extern struct rasConnection* rasConns;
+extern int nRasConns;
+extern struct rasSocket *rasSockets;
+extern int nRasSockets;
+
+ncclResult_t getNewConnEntry(struct rasConnection** pConn);
+ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, int* pConnIdx);
+int rasConnFind(const union ncclSocketAddress* addr);
+void rasConnsHandleTimeouts(int64_t now, int64_t* nextWakeup);
+void rasConnDisconnect(const union ncclSocketAddress* addr);
+ncclResult_t rasNetAcceptNewSocket();
+void rasSocksHandleTimeouts(int64_t now, int64_t* nextWakeup);
+void rasSocketTerminate(struct rasSocket* sock, bool finalize = false, uint64_t startRetryOffset = 0,
+                        bool retry = true);
+void rasSockEventLoop(int sockIdx, int pollIdx);
+void rasNetHandleTimeouts(int64_t now, int64_t* nextWakeup);
+ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* sock);
+ncclResult_t rasLinkUpdateConn(struct rasLink* link, int connIdx, int peerIdx, bool external = false,
+                               bool insert = false, bool pretend = false, int* pLinkIdx = nullptr);
+
+// Globals and functions provided by peers.cc.
+extern struct rasPeerInfo* rasPeers;
+extern int nRasPeers;
+extern uint64_t rasPeersHash;
+extern union ncclSocketAddress* rasDeadPeers;
+extern int nRasDeadPeers;
+extern uint64_t rasDeadPeersHash;
+
+ncclResult_t rasLocalHandleAddRanks(struct rasRankInit* ranks, int nranks);
+int rasPeerFind(const union ncclSocketAddress* addr);
+ncclResult_t rasConnSendPeersUpdate(struct rasConnection* conn, const struct rasPeerInfo* peers, int nPeers);
+ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock);
+int rasLinkCalculatePeer(const struct rasLink* link, int peerIdx, bool isFallback = false);
+ncclResult_t rasPeerDeclareDead(const union ncclSocketAddress* addr);
+bool rasPeerIsDead(const union ncclSocketAddress* addr);
+int ncclSocketsCompare(const void* p1, const void* p2);
+bool ncclSocketsSameNode(const union ncclSocketAddress* a1, const union ncclSocketAddress* a2);
+
+
+// Globals and functions provided by collectives.cc.
+extern struct rasCollective* rasCollectives;
+
+void rasCollReqInit(struct rasCollRequest* req);
+ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen, bool* pAllDone = nullptr,
+                               int* pCollIdx = nullptr, int fromConnIdx = -1);
+ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock);
+ncclResult_t rasMsgHandleCollResp(struct rasMsg* msg, struct rasSocket* sock);
+void rasCollsPurgeConn(int connIdx);
+void rasCollFree(struct rasCollective* coll);
+void rasCollsHandleTimeouts(int64_t now, int64_t* nextWakeup);
+
+// Globals and functions provided by client_support.cc.
+extern int rasClientListeningSocket;
+extern struct rasClient* rasClients;
+extern int nRasClients;
+ncclResult_t rasClientInitSocket();
+ncclResult_t rasClientAcceptNewSocket();
+ncclResult_t rasClientResume(struct rasCollective* coll);
+void rasClientEventLoop(int clientIdx, int pollIdx);
+const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, size_t size);
+
+#endif // !NCCL_RAS_CLIENT
+
+#endif // !NCCL_RAS_INTERNAL_H_
@@ -1,204 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "argcheck.h" // Need some checks here since we access comm
-#include "nccl.h"
-#include "comm.h"
-#include "net.h"
-#include "register.h"
-#include "transport.h"
-
-ncclResult_t ncclNetDeregister(struct ncclComm* comm, struct ncclReg* reg) {
-  struct ncclRegCache* cache = &comm->regCache;
-  ncclDebugNoWarn = NCCL_NET;
-  for (int d=0; d<reg->nDevs; d++) {
-    if (reg->handles[d] != NULL) NCCLCHECK(comm->ncclNet->deregMr(cache->sComms[reg->devs[d]], reg->handles[d]));
-  }
-  reg->nDevs = 0;
-  free(reg->handles);
-  reg->handles = NULL;
-  ncclDebugNoWarn = 0;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclNetRegister(struct ncclComm* comm, void* addr, size_t size, struct ncclReg* reg) {
-  struct ncclRegCache* cache = &comm->regCache;
-  int netCount = 0;
-  if (comm->topo != NULL) NCCLCHECK(ncclTopoGetNetCount(comm->topo, &netCount));
-  if (netCount == 0) return ncclSuccess;
-
-  ncclResult_t ret = ncclSuccess;
-
-  // Find local devices for p2p operations
-  for (int c=0; c<comm->p2pnChannels; c++) {
-    int dev;
-    if (ncclTopoGetLocalNet(comm->topo, comm->rank, c, NULL, &dev) != ncclSuccess) goto end; // No local net
-    ncclNetProperties_t props;
-    NCCLCHECKGOTO(comm->ncclNet->getProperties(dev, &props), ret, end);
-    if (props.regIsGlobal == 0) { // We need to be sure all NICs support global registration.
-      reg->nDevs = 0;
-      break;
-    }
-    int found = 0;
-    for (int d=0; d<reg->nDevs; d++) if (reg->devs[d] == dev) found = 1;
-    if (!found) reg->devs[reg->nDevs++] = dev;
-  }
-
-  NCCLCHECKGOTO(ncclCalloc(&reg->handles, reg->nDevs), ret, end);
-
-  ncclDebugNoWarn = NCCL_NET;
-  for (int d=0; d<reg->nDevs; d++) {
-    int dev = reg->devs[d];
-    reg->handles[d] = NULL;
-
-    if (cache->sComms[dev] == NULL) {
-      // Create a loopback network comm object for that device to register the buffers.
-      void *lComm = NULL;
-      ncclNetHandle_t netHandle;
-      bool connected = false;
-      NCCLCHECKGOTO(comm->ncclNet->listen(dev, &netHandle, &lComm), ret, end);
-      while (!connected) {
-        if (*comm->abortFlag) {
-          goto end;
-        }
-        if (cache->sComms[dev] == NULL)
-          NCCLCHECKGOTO(comm->ncclNet->connect(dev, &netHandle, cache->sComms+dev, NULL), ret, end);
-        if (cache->rComms[dev] == NULL)
-          NCCLCHECKGOTO(comm->ncclNet->accept(lComm, cache->rComms+dev, NULL), ret, end);
-        connected = (cache->rComms[dev] != NULL) && (cache->sComms[dev] != NULL);
-      }
-      NCCLCHECK(comm->ncclNet->closeListen(lComm));
-    }
-    if (comm->ncclNet->regMr(cache->sComms[dev], addr, size, NCCL_PTR_CUDA, reg->handles+d) != ncclSuccess) {
-      reg->handles[d] = NULL;
-      NCCLCHECK(ncclNetDeregister(comm, reg));
-      reg->nDevs = 0;
-      goto end;
-    }
-  }
-end:
-  INFO(NCCL_INIT, "Register ptr %p size %ld on %d net devices", addr, size, reg->nDevs);
-  ncclDebugNoWarn = 0;
-  if (ret != ncclSuccess) NCCLCHECK(ncclNetDeregister(comm, reg));
-  return ret;
-}
-
-ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg) {
-  struct ncclRegCache* cache = &comm->regCache;
-  uintptr_t pageSize = cache->pageSize;
-  uintptr_t addr = (uintptr_t)data & -pageSize;
-  size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
-
-  *reg = NULL;
-  for (int slot=0; /*true*/; slot++) {
-    if (slot == cache->population || addr < cache->slots[slot]->addr) return ncclSuccess;
-    if ((addr >= cache->slots[slot]->addr) &&
-        ((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) {
-      *reg = cache->slots[slot];
-      return ncclSuccess;
-    }
-  }
-}
-NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1);
-
-ncclResult_t ncclRegister(struct ncclComm* comm, void* data, size_t size, void** handle) {
-  if (!ncclParamLocalRegister()) {
-    *handle = NULL;
-    return ncclSuccess;
-  }
-  INFO(NCCL_REG, "register comm %p buffer %p size %zi", comm, data, size);
-  struct ncclRegCache* cache = &comm->regCache;
-  uintptr_t pageSize = cache->pageSize;
-  uintptr_t addr = (uintptr_t)data & -pageSize;
-  size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
-  for (int slot=0; /*true*/; slot++) {
-    if ((slot == cache->population) || (addr < cache->slots[slot]->addr)) {
-      if (cache->population == cache->capacity) { // must grow cache
-        cache->capacity = cache->capacity < 32 ? 32 : 2*cache->capacity;
-        NCCLCHECK(ncclRealloc(&cache->slots, cache->population, cache->capacity));
-      }
-      memmove(cache->slots+slot+1, cache->slots+slot, (cache->population-slot)*sizeof(struct ncclReg*));
-      NCCLCHECK(ncclCalloc(cache->slots+slot, 1));
-      struct ncclReg* regSlot = cache->slots[slot];
-      regSlot->addr = addr;
-      regSlot->pages = pages;
-      regSlot->refs = 1;
-      NCCLCHECK(ncclNetRegister(comm, (void*)addr, pages*pageSize, regSlot));
-      regSlot->state |= NET_REG_COMPLETE;
-      cache->population += 1;
-      *handle = regSlot;
-      return ncclSuccess;
-    } else if ((addr >= cache->slots[slot]->addr) &&
-        ((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) {
-      cache->slots[slot]->refs++;
-      *handle = cache->slots[slot];
-      return ncclSuccess;
-    }
-  }
-}
-
-ncclResult_t ncclRegCleanup(struct ncclComm* comm) {
-  struct ncclRegCache* cache = &comm->regCache;
-  for (int i=0; i<cache->population; i++) {
-    INFO(NCCL_INIT, "Cleanup buffer %p pages %lx", (void*)cache->slots[i]->addr, cache->slots[i]->pages);
-    NCCLCHECK(ncclNetDeregister(comm, cache->slots[i]));
-    if (cache->slots[i]->state & NVLS_REG_COMPLETE) NCCLCHECK(ncclNvlsDeregBuffer(&cache->slots[i]->mcHandle, cache->slots[i]->regAddr, cache->slots[i]->dev, cache->slots[i]->regSize));
-    free(cache->slots[i]);
-  }
-  free(cache->slots);
-  for (int d=0; d<MAXCHANNELS; d++) {
-    if (cache->sComms[d]) NCCLCHECK(comm->ncclNet->closeSend(cache->sComms[d]));
-    if (cache->rComms[d]) NCCLCHECK(comm->ncclNet->closeRecv(cache->rComms[d]));
-  }
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle);
-ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) {
-  NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm"));
-  if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(buff, comm, "buff", "ncclCommRegister"));
-  NCCLCHECK(ncclRegister(comm, buff, size, handle));
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle);
-ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) {
-  NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm"));
-  struct ncclReg* reg = (struct ncclReg*)handle;
-  struct ncclRegCache* cache = &comm->regCache;
-  int slot;
-  int saveDev;
-  if (handle == NULL) goto exit;
-  CUDACHECK(cudaGetDevice(&saveDev));
-  CUDACHECK(cudaSetDevice(comm->cudaDev));
-  for (slot=0; slot<cache->population && cache->slots[slot] != reg; slot++);
-  if (slot == cache->population) {
-    WARN("Deregister: Could not find handle");
-    return ncclInvalidUsage;
-  }
-  if (--reg->refs) return ncclSuccess;
-  NCCLCHECK(ncclNetDeregister(comm, reg));
-  if (reg->state & NVLS_REG_COMPLETE) {
-    NCCLCHECK(ncclNvlsDeregBuffer(&reg->mcHandle, reg->regAddr, reg->dev, reg->regSize));
-    reg->regAddr = (CUdeviceptr)NULL;
-  }
-  if (reg->state & COLLNET_REG_COMPLETE) {
-    NCCLCHECK(ncclCollnetDeregBuffer(comm, reg->collnetProxyconn, reg->collnetHandle));
-  }
-  if (reg->state & IPC_REG_COMPLETE) {
-    for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; ++i)
-      if (reg->ipcInfos[i])
-        NCCLCHECK(ncclIpcDeregBuffer(comm, reg->ipcInfos[i]));
-    if (reg->regIpcAddrs.hostPeerRmtAddrs) free(reg->regIpcAddrs.hostPeerRmtAddrs);
-    if (reg->regIpcAddrs.devPeerRmtAddrs) NCCLCHECK(ncclCudaFree(reg->regIpcAddrs.devPeerRmtAddrs));
-  }
-  free(reg);
-  memmove(cache->slots+slot, cache->slots+slot+1, (cache->population-slot-1)*sizeof(struct ncclReg*));
-  cache->population -= 1;
-  CUDACHECK(cudaSetDevice(saveDev));
-exit:
-  return ncclSuccess;
-}
@@ -0,0 +1,446 @@
+#include "register.h"
+#include "transport.h"
+#include "enqueue.h"
+
+// Decides whether a buffer needs to be registered for the connection to the given peer and stores the
+// answer in *needReg.  Returns ncclSuccess; a failing canConnect query is handed to NCCLCHECK, in which
+// case *needReg is not written here.
+static ncclResult_t registerCheckP2PConnection(struct ncclComm* comm, struct ncclConnector* conn, struct ncclTopoGraph* graph, int peer, bool* needReg) {
+  if (!conn->connected) {
+    // Not connected yet, so there are no connection flags to inspect: ask transport 0 (presumably the P2P
+    // transport -- confirm against the ncclTransports table) whether this rank can connect to the peer.
+    struct ncclPeerInfo* self = &comm->peerInfo[comm->rank];
+    struct ncclPeerInfo* remote = &comm->peerInfo[peer];
+    int connectable = 0;
+
+    NCCLCHECK(ncclTransports[0]->canConnect(&connectable, comm, graph, self, remote));
+    *needReg = (connectable != 0);
+    return ncclSuccess;
+  }
+
+  // Established connection: registration is needed only when it is flagged for P2P reads or writes;
+  // anything else is treated as a network connection.
+  *needReg = (conn->conn.flags & (NCCL_P2P_READ | NCCL_P2P_WRITE)) != 0;
+  return ncclSuccess;
+}
+
+// Attempts to register the user buffers of an NVLS/NVLS_TREE collective (graph registration first, then local
+// registration) and, for multi-node NVLS, to register the receive buffer with CollNet as well.  The outcome is
+// reported through info->regBufType (plus info->sendMhandle/recvMhandle for CollNet); *regNeedConnect is
+// cleared once the NVLS registration succeeded.  Registration is best effort: ncclSuccess is always returned.
+ncclResult_t ncclRegisterCollNvlsBuffers(
+    struct ncclComm* comm, struct ncclTaskColl* info,
+    void* outRegBufSend[NCCL_MAX_LOCAL_RANKS],
+    void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS],
+    struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue,
+    bool* regNeedConnect) {
+  ncclResult_t result = ncclSuccess;
+
+  info->regBufType = NCCL_REGULAR_BUFFER;
+  *regNeedConnect = true;
+  if (!(ncclParamLocalRegister() || (comm->planner.persistent && ncclParamGraphRegister()))) goto exit;
+#if CUDART_VERSION >= 11030
+  if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) {
+    if (!comm->nvlsRegSupport || info->opDev.op == ncclDevPreMulSum) goto exit;
+    int nvlsReged = 0;
+    int collnetReged = 0;
+    const void *sendbuff = info->sendbuff;
+    void *recvbuff = info->recvbuff;
+    void *recvHandle = NULL, *sendHandle = NULL;
+    if (info->func == ncclFuncAllGather) sendbuff = NULL;
+    if (info->func == ncclFuncReduceScatter) recvbuff = NULL;
+    size_t elementSize = ncclTypeSize(info->datatype);
+    size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count);
+    size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count);
+
+    /* first try graph registration. */
+    if (comm->planner.persistent && ncclParamGraphRegister()) {
+      ncclNvlsGraphRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, &nvlsReged, outRegBufSend, outRegBufRecv, cleanupQueue, &info->nCleanupQueueElts);
+    }
+
+    /* fall back to local registration when the graph path did not register the buffers. */
+    if (nvlsReged == 0 && ncclParamLocalRegister()) {
+      ncclNvlsLocalRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, &nvlsReged, outRegBufSend, outRegBufRecv);
+    }
+
+    /* multi-node NVLS: additionally register the receive buffer with CollNet, for the send and then the recv side. */
+    if (nvlsReged && comm->nNodes > 1 && info->algorithm == NCCL_ALGO_NVLS) {
+      if (comm->planner.persistent && ncclParamGraphRegister()) {
+        ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle, cleanupQueue, &info->nCleanupQueueElts);
+        if (collnetReged) ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle, cleanupQueue, &info->nCleanupQueueElts);
+      }
+
+      if (collnetReged == 0 && ncclParamLocalRegister()) {
+        ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle);
+        if (collnetReged) ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle);
+      }
+    }
+
+    if (nvlsReged) {
+      *regNeedConnect = false;
+      /* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to
+       * saturate bandwidth on a single node (5 for ReduceScatter), and 6 across nodes. */
+      int nvlsChannels = 6;
+      if (comm->nNodes == 1) nvlsChannels = (info->func == ncclFuncReduceScatter) ? 5 : 4;
+      info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, nvlsChannels));
+      info->regBufType |= NCCL_NVLS_REG_BUFFER;
+    }
+
+    if (collnetReged) {
+      info->regBufType |= NCCL_NET_REG_BUFFER;
+      info->sendMhandle = sendHandle;
+      info->recvMhandle = recvHandle;
+    }
+  }
+#endif
+exit: // Kept outside the #if: the first goto above is compiled for every CUDA runtime version.
+  return result;
+}
+
+ncclResult_t ncclRegisterCollBuffers(
+    struct ncclComm* comm, struct ncclTaskColl* info,
+    void* outRegBufSend[NCCL_MAX_LOCAL_RANKS],
+    void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS],
+    struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue,
+    bool* regNeedConnect
+  ) {
+  ncclResult_t result = ncclSuccess;
+
+  info->regBufType = NCCL_REGULAR_BUFFER;
+  *regNeedConnect = true;
+  if (!(ncclParamLocalRegister() || (comm->planner.persistent && ncclParamGraphRegister()))) goto exit;
+#if CUDART_VERSION >= 11030
+  if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) {
+    /* this part of nvls reg code is temporarily not used and obsolete. */
+    if (!comm->nvlsRegSupport || info->opDev.op == ncclDevPreMulSum) goto exit;
+    int nvlsReged = 0;
+    int collnetReged = 0;
+    const void *sendbuff = info->sendbuff;
+    void *recvbuff = info->recvbuff;
+    void *recvHandle = NULL, *sendHandle = NULL;
+    if (info->func == ncclFuncAllGather) sendbuff = NULL;
+    if (info->func == ncclFuncReduceScatter) recvbuff = NULL;
+    size_t elementSize = ncclTypeSize(info->datatype);
+    size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count);
+    size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count);
+
+    /* first try local registration. */
+    if (ncclParamLocalRegister()) {
+      ncclNvlsLocalRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, &nvlsReged, outRegBufSend, outRegBufRecv);
+    }
+
+    if (nvlsReged == 0 && comm->planner.persistent && ncclParamGraphRegister()) {
+      ncclNvlsGraphRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, &nvlsReged, outRegBufSend, outRegBufRecv, cleanupQueue, &info->nCleanupQueueElts);
+    }
+
+    if (comm->nNodes > 1 && info->algorithm == NCCL_ALGO_NVLS) {
+      if (ncclParamLocalRegister()) {
+        ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle);
+        if (collnetReged) ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle);
+      }
+
+      if (collnetReged == 0 && comm->planner.persistent && ncclParamGraphRegister()) {
+        ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle, cleanupQueue, &info->nCleanupQueueElts);
+        if (collnetReged) ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle, cleanupQueue, &info->nCleanupQueueElts);
+      }
+    }
+
+    if (nvlsReged) {
+      *regNeedConnect = 0;
+      /* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to
+       * saturate bandwidth. */
+      if (comm->nNodes == 1) {
+        if (info->func == ncclFuncReduceScatter)
+          info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5));
+        else
+          info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4));
+      } else {
+        info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 6));
+      }
+      info->regBufType |= NCCL_NVLS_REG_BUFFER;
+    }
+
+    if (collnetReged) {
+      info->regBufType |= NCCL_NET_REG_BUFFER;
+      info->sendMhandle = sendHandle;
+      info->recvMhandle = recvHandle;
+    }
+  } else if (info->protocol == NCCL_PROTO_SIMPLE) {
+    // IPC buffer registration
+    if (info->func == ncclFuncReduceScatter && info->algorithm != NCCL_ALGO_COLLNET_DIRECT) goto exit;
+    if (info->algorithm == NCCL_ALGO_RING && ((info->func == ncclFuncAllReduce && info->sendbuff == info->recvbuff) || info->func == ncclFuncReduce)) goto exit;
+    if ((info->algorithm == NCCL_ALGO_TREE || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && info->sendbuff == info->recvbuff) goto exit;
+    if (info->func == ncclFuncAllGather && info->algorithm == NCCL_ALGO_PAT) goto exit;
+
+    int peerRanks[NCCL_MAX_LOCAL_RANKS];
+    int nPeers = 0;
+    size_t elementSize = ncclTypeSize(info->datatype);
+    size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count);
+    size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count);
+    int regBufFlag = 0;
+    memset(peerRanks, 0xff, sizeof(int) * NCCL_MAX_LOCAL_RANKS);
+
+    if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) {
+      struct ncclChannel* channel = comm->channels;
+      int ipcRegFlag = 0, netSendRegFlag = 0, netRecvRegFlag = 0;
+      void *sendHandle, *recvHandle;
+      if (info->func != ncclFuncReduceScatter && comm->intraNodeP2pSupport) {
+        for (int r = 0; r < NCCL_MAX_DIRECT_ARITY; ++r) {
+          for (int down = 0; down < 2; ++down) {
+            int peer = down ? channel->collnetDirect.down[r] : channel->collnetDirect.up[r];
+            if (peer != -1) {
+              struct ncclConnector* peerConn = &channel->peers[peer]->recv[0];
+              bool needReg = false;
+
+              NCCLCHECK(registerCheckP2PConnection(comm, peerConn, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], peer, &needReg));
+              if (needReg) {
+                bool found = false;
+                for (int p = 0; p < nPeers; ++p) {
+                  if (peerRanks[p] == peer) {
+                    found = true;
+                    break;
+                  }
+                }
+                if (!found) peerRanks[nPeers++] = peer;
+              }
+            }
+          }
+        }
+
+        if (nPeers > 0) {
+          if (comm->planner.persistent && ncclParamGraphRegister()) {
+            ncclIpcGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &ipcRegFlag, &info->sendbuffOffset, &info->sendbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
+            if (ipcRegFlag) ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &ipcRegFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
+          }
+          if (!ipcRegFlag && ncclParamLocalRegister()) {
+            ncclIpcLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &ipcRegFlag, &info->sendbuffOffset, &info->sendbuffRmtAddrs);
+            if (ipcRegFlag) ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &ipcRegFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs);
+          }
+        }
+        if (ipcRegFlag) {
+          info->regBufType |= NCCL_IPC_REG_BUFFER;
+        }
+      }
+
+      // register collnet buffer
+      if (info->opDev.op != ncclDevPreMulSum && info->opDev.op != ncclDevSumPostDiv && !(info->func == ncclFuncAllReduce && !comm->isOneRPN)) {
+        if (comm->planner.persistent && ncclParamGraphRegister()) {
+          ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &netSendRegFlag, &sendHandle, cleanupQueue, &info->nCleanupQueueElts);
+          info->sendMhandle = sendHandle;
+          if (netSendRegFlag) {
+            ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &netRecvRegFlag, &recvHandle, cleanupQueue, &info->nCleanupQueueElts);
+            info->recvMhandle = recvHandle;
+          }
+        }
+
+        if ((netSendRegFlag == 0 || netRecvRegFlag == 0) && ncclParamLocalRegister()) {
+          if (!netSendRegFlag) {
+            ncclCollnetLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &netSendRegFlag, &sendHandle);
+            info->sendMhandle = sendHandle;
+          }
+          if (netSendRegFlag && !netRecvRegFlag) {
+            ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &netRecvRegFlag, &recvHandle);
+            info->recvMhandle = recvHandle;
+          }
+        }
+      }
+
+      if (netSendRegFlag && netRecvRegFlag) {
+        if (comm->isOneRPN) info->nMaxChannels = 1;
+        info->regBufType |= NCCL_NET_REG_BUFFER;
+      }
+    } else if (info->algorithm == NCCL_ALGO_RING) {
+      struct ncclReg* recvRegRecord = NULL;
+      struct ncclReg* sendRegRecord = NULL;
+      int sendNetPeers = comm->nChannels;
+      int recvNetPeers = comm->nChannels;
+      struct ncclConnector** sendNetConns = NULL;
+      struct ncclConnector** recvNetConns = NULL;
+      void** sendNetHandles = NULL;
+      void** recvNetHandles = NULL;
+      void** srecvNetHandles = NULL;
+      bool hasRecvNetPeer = false;
+      bool hasSendNetPeer = false;
+
+      NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord));
+      if (recvRegRecord == NULL && !(comm->planner.persistent && ncclParamGraphRegister())) goto exit;
+      NCCLCHECK(ncclRegFind(comm, info->sendbuff, sendbuffSize, &sendRegRecord));
+      if (sendRegRecord == NULL && !(comm->planner.persistent && ncclParamGraphRegister())) goto exit;
+      NCCLCHECK(ncclCalloc(&sendNetConns, comm->nChannels));
+      NCCLCHECK(ncclCalloc(&sendNetHandles, comm->nChannels));
+      NCCLCHECK(ncclCalloc(&recvNetConns, comm->nChannels));
+      NCCLCHECK(ncclCalloc(&recvNetHandles, comm->nChannels));
+      NCCLCHECK(ncclCalloc(&srecvNetHandles, comm->nChannels));
+
+      for (int c = 0; c < comm->nChannels; ++c) {
+        struct ncclChannel* channel = comm->channels + c;
+        for (int r = 0; r < 2; ++r) {
+          int peer;
+          struct ncclConnector* peerConn;
+          if (r == 0) {
+            peer = channel->ring.prev;
+            peerConn = &channel->peers[peer]->recv[0];
+            if (peerConn->conn.flags & NCCL_DIRECT_NIC) {
+              recvNetConns[c] = peerConn;
+              hasRecvNetPeer = true;
+            }
+          } else {
+            peer = channel->ring.next;
+            peerConn = &channel->peers[peer]->send[0];
+            if (peerConn->conn.flags & NCCL_DIRECT_NIC) {
+              sendNetConns[c] = peerConn;
+              hasSendNetPeer = true;
+            }
+          }
+          if (peerConn->conn.flags & (NCCL_P2P_READ | NCCL_P2P_WRITE)) {
+            bool found = false;
+            for (int p = 0; p < nPeers; ++p) {
+              if (peerRanks[p] == peer) {
+                found = true;
+                break;
+              }
+            }
+            if (!found) peerRanks[nPeers++] = peer;
+          }
+        }
+      }
+      if (nPeers > 0 && comm->intraNodeP2pSupport) {
+        if (comm->planner.persistent && ncclParamGraphRegister()) {
+          ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
+        }
+        if (!regBufFlag && ncclParamLocalRegister()) {
+          ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs);
+        }
+      }
+      if (regBufFlag) {
+        info->regBufType = NCCL_IPC_REG_BUFFER;
+      }
+
+      // start net registration
+      regBufFlag = 0;
+      if (!comm->useNetPXN && comm->useGdr && comm->netDeviceType != NCCL_NET_DEVICE_UNPACK) {
+        if (comm->planner.persistent && ncclParamGraphRegister()) {
+          if (hasSendNetPeer) {
+            ncclNetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, sendNetConns, sendNetPeers, &regBufFlag, sendNetHandles, cleanupQueue, &info->nCleanupQueueElts);
+            if (regBufFlag)
+              ncclNetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, sendNetConns, sendNetPeers, &regBufFlag, srecvNetHandles, cleanupQueue, &info->nCleanupQueueElts);
+          }
+          if ((regBufFlag || !hasSendNetPeer) && hasRecvNetPeer)
+            ncclNetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, recvNetConns, recvNetPeers, &regBufFlag, recvNetHandles, cleanupQueue, &info->nCleanupQueueElts);
+        }
+        if (!regBufFlag && ncclParamLocalRegister()) {
+          if (hasSendNetPeer) {
+            ncclNetLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, sendNetConns, sendNetPeers, &regBufFlag, sendNetHandles);
+            if (regBufFlag)
+              ncclNetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, sendNetConns, sendNetPeers, &regBufFlag, srecvNetHandles);
+          }
+          if ((regBufFlag || !hasSendNetPeer) && hasRecvNetPeer)
+            ncclNetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, recvNetConns, recvNetPeers, &regBufFlag, recvNetHandles);
+        }
+      }
+
+      if (regBufFlag) {
+        info->regBufType |= NCCL_NET_REG_BUFFER;
+        info->sendNetHandles = sendNetHandles;
+        info->recvNetHandles = recvNetHandles;
+        info->srecvNetHandles = srecvNetHandles;
+        if (comm->isOneRPN && (info->func == ncclFuncAllGather || info->func == ncclFuncBroadcast)) {
+          info->nMaxChannels = 1;
+        }
+      } else {
+        free(sendNetHandles);
+        free(recvNetHandles);
+        free(srecvNetHandles);
+      }
+
+      free(sendNetConns);
+      free(recvNetConns);
+    } else if (info->algorithm == NCCL_ALGO_TREE || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) {
+      struct ncclReg* recvRegRecord;
+      int netSendRegFlag = 0, netRecvRegFlag = 0;
+      void *sendHandle, *recvHandle;
+      NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord));
+      if (recvRegRecord == NULL && !(comm->planner.persistent && ncclParamGraphRegister())) goto exit;
+      if (comm->intraNodeP2pSupport) {
+        for (int c = 0; c < comm->nChannels; ++c) {
+          struct ncclChannel* channel = comm->channels + c;
+          struct ncclTree* tree = NULL;
+          int peers[NCCL_MAX_TREE_ARITY + 1];
+
+          if (info->algorithm == NCCL_ALGO_TREE)
+            tree = &channel->tree;
+          else
+            tree = &channel->collnetChain;
+          for (int p = 0; p < NCCL_MAX_TREE_ARITY; ++p) peers[p] = tree->down[p];
+          peers[NCCL_MAX_TREE_ARITY] = tree->up;
+          for (int p = 0; p < NCCL_MAX_TREE_ARITY + 1; ++p) {
+            int peer = peers[p];
+            bool peerNeedReg = false;
+            struct ncclConnector* recvConn = NULL;
+            // P2P transport
+            if (peer == -1 || peer == comm->nRanks) continue;
+            recvConn = &channel->peers[peer]->recv[0];
+            NCCLCHECK(registerCheckP2PConnection(comm, recvConn, &comm->graphs[info->algorithm], peer, &peerNeedReg));
+
+            if (peerNeedReg) {
+              bool found = false;
+              for (int pindex = 0; pindex < nPeers; ++pindex) {
+                if (peerRanks[pindex] == peer) {
+                  found = true;
+                  break;
+                }
+              }
+              if (!found) peerRanks[nPeers++] = peer;
+            }
+          }
+        }
+        if (nPeers > 0) {
+          if (comm->planner.persistent && ncclParamGraphRegister()) {
+            ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
+          }
+          if (!regBufFlag && ncclParamLocalRegister()) {
+            ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs);
+          }
+        }
+        if (regBufFlag) {
+          info->regBufType = NCCL_IPC_REG_BUFFER;
+        }
+      }
+
+      // register collnet chain 1RPN buffer
+      if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN && info->opDev.op != ncclDevPreMulSum && info->opDev.op != ncclDevSumPostDiv && comm->isOneRPN) {
+        if (comm->planner.persistent && ncclParamGraphRegister()) {
+          ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &netSendRegFlag, &sendHandle, cleanupQueue, &info->nCleanupQueueElts);
+          info->sendMhandle = sendHandle;
+          if (netSendRegFlag) {
+            ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &netRecvRegFlag, &recvHandle, cleanupQueue, &info->nCleanupQueueElts);
+            info->recvMhandle = recvHandle;
+          }
+        }
+
+        if ((netSendRegFlag == 0 || netRecvRegFlag == 0) && ncclParamLocalRegister()) {
+          if (!netSendRegFlag) {
+            ncclCollnetLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &netSendRegFlag, &sendHandle);
+            info->sendMhandle = sendHandle;
+          }
+          if (netSendRegFlag && !netRecvRegFlag) {
+            ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &netRecvRegFlag, &recvHandle);
+            info->recvMhandle = recvHandle;
+          }
+        }
+      }
+
+      if (netSendRegFlag && netRecvRegFlag) {
+        if (comm->isOneRPN) info->nMaxChannels = 1;
+        info->regBufType |= NCCL_NET_REG_BUFFER;
+      }
+    }
+
+    if (info->regBufType == NCCL_IPC_REG_BUFFER && comm->nNodes == 1 && 16 < info->nMaxChannels && info->nMaxChannels <= 24) {
+      info->nMaxChannels = 16;
+    }
+  }
+exit:
+#endif
+  return result;
+}
@@ -0,0 +1,179 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "argcheck.h" // Need some checks here since we access comm
+#include "nccl.h"
+#include "comm.h"
+#include "net.h"
+#include "register.h"
+#include "transport.h"
+
+ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg) {
+  // Looks up a cached record whose page range fully contains [data, data+size); *reg is NULL if none.
+  struct ncclRegCache* cache = &comm->regCache;
+  const uintptr_t pageSize = cache->pageSize;
+  const uintptr_t addr = (uintptr_t)data & -pageSize;  // mask down to the page start (assumes a power-of-two page size)
+  const size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
+
+  *reg = NULL;
+  for (int slot = 0; slot != cache->population; slot++) {
+    struct ncclReg* candidate = cache->slots[slot];
+    // ncclRegister keeps slots ordered by address, so a larger start address ends the search.
+    if (addr < candidate->addr) break;
+    if ((addr - candidate->addr)/pageSize + pages <= candidate->pages) { *reg = candidate; break; }
+  }
+  return ncclSuccess;
+}
+NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1);
+
+ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid) {
+  // Sets *isValid to whether `reg` currently holds local (non-graph) references.
+  // Nothing is written when either pointer is NULL; the call always succeeds.
+  if (reg == NULL || isValid == NULL) {
+    return ncclSuccess;
+  }
+  *isValid = (reg->localRefs != 0);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclRegister(struct ncclComm* comm, void* data, size_t size, bool isGraph, void** handle) {
+  // Returns in *handle the cache record covering [data, data+size), creating it or adding a graph/local ref.
+  NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm"));
+  struct ncclRegCache* cache = &comm->regCache;
+  uintptr_t pageSize = cache->pageSize;
+  uintptr_t addr = (uintptr_t)data & -pageSize;
+  size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
+
+  if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(data, comm, "buff", "ncclCommRegister"));
+  INFO(NCCL_REG, "register comm %p buffer %p size %zi", comm, data, size);
+
+  for (int slot=0; /*true*/; slot++) {
+    if ((slot == cache->population) || (addr < cache->slots[slot]->addr)) {
+      struct ncclReg* regSlot = NULL;
+      if (cache->population == cache->capacity) { // must grow cache
+        const auto newCapacity = cache->capacity < 32 ? 32 : 2*cache->capacity;
+        NCCLCHECK(ncclRealloc(&cache->slots, cache->population, newCapacity));
+        cache->capacity = newCapacity; // committed only once the slot array really grew
+      }
+      NCCLCHECK(ncclCalloc(&regSlot, 1)); // allocate before shifting: a failure leaves the cache intact
+      memmove(cache->slots+slot+1, cache->slots+slot, (cache->population-slot)*sizeof(struct ncclReg*));
+      cache->slots[slot] = regSlot;
+      regSlot->addr = addr;
+      regSlot->pages = pages;
+      if (isGraph) regSlot->graphRefs = 1; else regSlot->localRefs = 1;
+      cache->population += 1;
+      *handle = regSlot;
+      goto exit;
+    } else if ((addr-cache->slots[slot]->addr)/pageSize+pages <= cache->slots[slot]->pages) {
+      // Reaching here implies addr >= slot address: the range is already covered, only add a reference.
+      if (isGraph) cache->slots[slot]->graphRefs++; else cache->slots[slot]->localRefs++;
+      *handle = cache->slots[slot];
+      goto exit;
+    }
+  }
+
+exit:
+  return ncclSuccess;
+}
+
+static ncclResult_t regCleanup(struct ncclComm* comm, struct ncclReg* reg) {
+  // Undoes each registration flagged in reg->state; deregistration failures are only logged with WARN.
+  if (reg->state & NET_REG_COMPLETE) {
+    struct ncclRegNetHandles* cur = reg->netHandleHead;
+    while (cur != NULL) {
+      if (ncclNetDeregBuffer(comm, cur->proxyConn, cur->handle) != ncclSuccess) {
+        WARN("rank %d deregister NET buffer handle %p proxy rank %d failed\n", comm->rank, cur->handle, cur->proxyConn->rank);
+      }
+      struct ncclRegNetHandles* done = cur;
+      cur = cur->next;
+      free(done);
+    }
+  }
+  if (reg->state & NVLS_REG_COMPLETE) {
+    const ncclResult_t nvlsRes = ncclNvlsDeregBuffer(comm, &reg->mcHandle, reg->regAddr, reg->dev, reg->regSize);
+    if (nvlsRes != ncclSuccess) { WARN("rank %d deregister NVLS buffer %p dev %d size %ld failed", comm->rank, (void*)reg->regAddr, reg->dev, reg->regSize); }
+    // The registered address is reset whether or not deregistration succeeded.
+    reg->regAddr = (CUdeviceptr)NULL;
+  }
+  if (reg->state & COLLNET_REG_COMPLETE) {
+    // A failure here is logged and otherwise ignored.
+    const ncclResult_t collnetRes = ncclCollnetDeregBuffer(comm, reg->collnetProxyconn, reg->collnetHandle);
+    if (collnetRes != ncclSuccess) { WARN("rank %d deregister COLLNET buffer handle %p proxy rank %d failed", comm->rank, reg->collnetHandle, reg->collnetProxyconn->rank); }
+  }
+  if (reg->state & IPC_REG_COMPLETE) {
+    for (int peer = 0; peer < NCCL_MAX_LOCAL_RANKS; ++peer) {
+      if (reg->ipcInfos[peer] == NULL) continue;
+      if (ncclIpcDeregBuffer(comm, reg->ipcInfos[peer]) != ncclSuccess) {
+        WARN("rank %d deregister IPC buffer %p peerRank %d failed", comm->rank, reg->ipcInfos[peer]->baseAddr, reg->ipcInfos[peer]->peerRank);
+      }
+      free(reg->ipcInfos[peer]);
+    }
+    free(reg->regIpcAddrs.hostPeerRmtAddrs);  // free(NULL) is a no-op
+    if (reg->regIpcAddrs.devPeerRmtAddrs != NULL) NCCLCHECK(ncclCudaFree(reg->regIpcAddrs.devPeerRmtAddrs));
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclRegCleanup(struct ncclComm* comm) {
+  struct ncclRegCache* regCache = &comm->regCache;  // every record in it is cleaned up and freed, then the slot array
+  for (int idx = 0; idx < regCache->population; ++idx) {
+    struct ncclReg* record = regCache->slots[idx];
+    INFO(NCCL_INIT, "Cleanup buffer %p pages %lx", (void*)record->addr, record->pages);
+    NCCLCHECK(regCleanup(comm, record));  // an error stops the sweep at this record
+    free(record);
+  }
+  free(regCache->slots);
+  return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle);
+ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) {
+  // Public entry point: hands back a NULL handle when the LocalRegister parameter is off.
+  if (!ncclParamLocalRegister()) { *handle = NULL; return ncclSuccess; }
+  // Otherwise take a local (non-graph) registration reference.
+  NCCLCHECK(ncclRegister(comm, buff, size, false, handle));
+  return ncclSuccess;
+}
+
+ncclResult_t ncclCommGraphRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) { // graph-side registration; no ncclParamLocalRegister() gate here
+  NCCLCHECK(ncclRegister(comm, buff, size, true, handle)); // isGraph=true: the record is counted in graphRefs
+  return ncclSuccess;
+}
+
+static ncclResult_t commDeregister(struct ncclComm *comm, bool isGraph, struct ncclReg* reg) {
+  // Drops one graph/local reference on reg and frees the record at zero refs; the caller's CUDA device is restored on every path.
+  NCCLCHECK(CommCheck(comm, "ncclCommDeregister", "comm"));
+  struct ncclRegCache* cache = &comm->regCache;
+  ncclResult_t ret = ncclSuccess;
+  int slot, saveDev;
+  if (reg == NULL) return ncclSuccess;  // NULL handle: nothing to release, device untouched
+  CUDACHECK(cudaGetDevice(&saveDev));
+  CUDACHECK(cudaSetDevice(comm->cudaDev));
+  for (slot = 0; slot < cache->population && cache->slots[slot] != reg; slot++);
+  if (slot == cache->population) {
+    WARN("Deregister: Could not find handle");
+    ret = ncclInvalidUsage; goto restore;
+  }
+  if (isGraph) --reg->graphRefs; else --reg->localRefs;
+  if (reg->localRefs || reg->graphRefs) goto restore;  // still referenced: keep the record
+  NCCLCHECKGOTO(regCleanup(comm, reg), ret, restore);
+  free(reg);
+  memmove(cache->slots + slot, cache->slots + slot + 1, (cache->population - slot - 1) * sizeof(struct ncclReg*));
+  cache->population -= 1;
+restore:
+  CUDACHECK(cudaSetDevice(saveDev));
+  return ret;
+}
+
+NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle);
+ncclResult_t ncclCommDeregister(const ncclComm_t comm, void *handle) { // public counterpart of ncclCommRegister
+  NCCLCHECK(commDeregister(comm, false, (struct ncclReg*)handle)); // isGraph=false: drops a local reference; the handle is used as a struct ncclReg*
+  return ncclSuccess;
+}
+
+ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *handle) { // graph-side counterpart of ncclCommGraphRegister
+  NCCLCHECK(commDeregister(comm, true, handle)); // isGraph=true: drops a graph reference
+  return ncclSuccess;
+}
@@ -0,0 +1,35 @@
+#include "register.h"
+#include "transport.h"
+
+ncclResult_t ncclRegisterP2pNetBuffer(struct ncclComm* comm, void* userbuff, size_t size, struct ncclConnector* conn, int* regFlag, void** handle, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue) {
+  // Best-effort NET registration of a p2p user buffer: the helpers' results are not checked
+  // and ncclSuccess is always returned. *regFlag starts at 0 and is handed to the helpers.
+  *regFlag = 0;
+  if (comm->netDeviceType == NCCL_NET_DEVICE_UNPACK) return ncclSuccess;
+
+  if (comm->planner.persistent && ncclParamGraphRegister()) {
+    ncclNetGraphRegisterBuffer(comm, userbuff, size, &conn, 1, regFlag, handle, cleanupQueue, NULL);
+  }
+  if (*regFlag == 0 && ncclParamLocalRegister()) {
+    ncclNetLocalRegisterBuffer(comm, userbuff, size, &conn, 1, regFlag, handle);
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclRegisterP2pIpcBuffer(struct ncclComm* comm, void* userbuff, size_t size, int peerRank, int* regFlag, void** regAddr, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue) {
+  // Best-effort IPC registration toward peerRank; *regAddr is written only when *regFlag ends up set.
+  uintptr_t* rmtAddrs = NULL;
+  uintptr_t rmtOffset = 0;
+
+  *regFlag = 0;
+  if (comm->planner.persistent && ncclParamGraphRegister()) {
+    ncclIpcGraphRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &rmtOffset, &rmtAddrs, reinterpret_cast<void*>(cleanupQueue), NULL);
+  }
+  if (*regFlag == 0 && ncclParamLocalRegister()) {
+    ncclIpcLocalRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &rmtOffset, &rmtAddrs);
+  }
+  if (*regFlag != 0) {
+    *regAddr = (void*)((uintptr_t)rmtAddrs + rmtOffset);
+  }
+  return ncclSuccess;
+}
@@ -94,13 +94,13 @@ ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2p
  }
  *intraNodeP2pSupport = supportFlag;
  *directMode = directFlag;
+  if (comm->rank == 0) INFO(NCCL_INIT, "Check P2P Type intraNodeP2pSupport %d directMode %d", supportFlag, directFlag);
  return ncclSuccess;
 }

-ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) {
+ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex) {
  // Stream used during transport setup; need for P2P pre-connect + CUDA Graph
  ncclResult_t ret = ncclSuccess;
-  int highestType = TRANSPORT_UNDEFINED;  // track highest transport type
  struct ncclConnect** data; // Store intermediate send/recvData structs for connect
  struct ncclConnect** recvData = NULL; // Points to entries inside data for given recv connection within a channel
  struct ncclConnect** sendData = NULL; // Points to entries inside data for given send connection within a channel
@@ -131,7 +131,10 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
    // The next M entries contain sendData, connection information for send connections
    // It's not guaranteed that each entry of data has the same number of total or send/recv specific connections
    int p = i-(done+1);
-    if (recvMask || sendMask) NCCLCHECKGOTO(ncclCalloc(data+p, 2*MAXCHANNELS), ret, fail);
+    if (recvMask || sendMask) {
+      if (data[p] == NULL) NCCLCHECKGOTO(ncclCalloc(data + p, 2 * MAXCHANNELS), ret, fail);
+      else memset(data[p], 0, 2 * MAXCHANNELS * sizeof(struct ncclConnect));
+    }
    recvData[p] = data[p];
    int sendChannels = 0, recvChannels = 0;
    int type;
@@ -139,7 +142,6 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
    for (int c=0; c<MAXCHANNELS; c++) {
      if (recvMask & (1UL<<c)) {
        NCCLCHECKGOTO(selectTransport<0>(comm, graph, recvData[p]+recvChannels++, c, recvPeer, connIndex, &type), ret, fail);
-        if (type > highestType) highestType = type;
      }
    }
    TIME_STOP(0);
@@ -148,7 +150,6 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
    for (int c=0; c<MAXCHANNELS; c++) {
      if (sendMask & (1UL<<c)) {
        NCCLCHECKGOTO(selectTransport<1>(comm, graph, sendData[p]+sendChannels++, c, sendPeer, connIndex, &type), ret, fail);
-        if (type > highestType) highestType = type;
      }
    }
    TIME_STOP(1);
@@ -222,22 +223,18 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
            }
            TIME_STOP(4);
          }
-          if (sendMask || recvMask) {
-            free(data[p]);
-            data[p] = NULL;
-          }
        }
-	if (ncclParamReportConnectProgress() && comm->rank == 0 && done > 0) {
+        if (ncclParamReportConnectProgress() && comm->rank == 0 && done > 0) {
          struct timeval now;
          gettimeofday(&now, NULL);
-          if (((now.tv_sec - timeLast.tv_sec)*1.0 + (now.tv_usec-timeLast.tv_usec)*1e-6) > 1) {
-            float elapsed = (now.tv_sec - timeStart.tv_sec)*1.0 + (now.tv_usec-timeStart.tv_usec)*1e-6;
-	    float remaining = elapsed*(comm->nRanks-done)/done;
+          if (((now.tv_sec - timeLast.tv_sec) * 1.0 + (now.tv_usec - timeLast.tv_usec) * 1e-6) > 1) {
+            float elapsed = (now.tv_sec - timeStart.tv_sec) * 1.0 + (now.tv_usec - timeStart.tv_usec) * 1e-6;
+            float remaining = elapsed * (comm->nRanks - done) / done;
            printf("%sP2p connect: %g%% Elapsed %d:%02d Remaining %d:%02d                                       ",
-                timeReported ? "\r" : "", done*100.0/comm->nRanks, ((int)elapsed)/60, ((int)elapsed)%60, ((int)remaining)/60, ((int)remaining)%60);
+              timeReported ? "\r" : "", done * 100.0 / comm->nRanks, ((int)elapsed) / 60, ((int)elapsed) % 60, ((int)remaining) / 60, ((int)remaining) % 60);
            fflush(stdout);
            timeReported = true;
-	    timeLast = now; // struct copy;
+            timeLast = now; // struct copy;
          }
        }
      }
@@ -280,7 +277,6 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
    comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0UL;
  }

-  if (highestTransportType != NULL) *highestTransportType = highestType;
  TIME_PRINT("P2P Setup/Connect");
 exit:
  for(int i=0; i<maxPeers; ++i){
@@ -112,6 +112,7 @@ struct sendResources {
  uint64_t step;
  struct reqSlot (*reqFifo)[NCCL_STEPS];
  int collNetRank;
+  size_t maxCollBytes;
 };

 struct recvResources {
@@ -133,6 +134,7 @@ struct recvResources {
  uint64_t step;
  struct reqSlot reqFifo[COLLNET_MAX_GROUPS][NCCL_STEPS];
  int collNetRank;
+  size_t maxCollBytes;
 };

 static ncclResult_t canConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
@@ -157,7 +159,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
  int proxyRank;
  int64_t netId;
  NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank));
-  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr));
+  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 1, &req.useGdr));
  send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;

  send->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
@@ -177,10 +179,10 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
  int proxyRank;
  int64_t netId;
  NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank));
-  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 0, &req.useGdr));
+  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 0, &req.useGdr));
  recv->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
  // Determine whether we need to flush the GDR buffer on recv or not
-  if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
+  if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm, req.netDev, myInfo->rank, &req.needFlush));

  recv->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn));
@@ -319,6 +321,13 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
  connection->collNet = req->collNet;
  /* DMA-BUF support */
  resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
+  /* collective size limits*/
+  resources->maxCollBytes = props.maxCollBytes;
+  if((resources->maxCollBytes <= 0) || (resources->maxCollBytes > NCCL_MAX_NET_SIZE_BYTES)) {
+    WARN("sendProxySetup: collnet plugin returned invalid value for maxCollBytes %ld \
+      [allowed range: %ld - %ld] \n", resources->maxCollBytes, 0L, NCCL_MAX_NET_SIZE_BYTES);
+    return ncclInternalError;
+  }
  return ncclSuccess;
 }

@@ -430,6 +439,12 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
  connection->collNet = req->collNet;
  /* DMA-BUF support */
  resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
+  resources->maxCollBytes = props.maxCollBytes; /* collective size limit reported by the collnet plugin */
+  if((resources->maxCollBytes <= 0) || (resources->maxCollBytes > NCCL_MAX_NET_SIZE_BYTES)) {
+    WARN("recvProxySetup: collnet plugin returned invalid value for maxCollBytes %ld \
+      [allowed range: %ld - %ld] \n", resources->maxCollBytes, 0L, NCCL_MAX_NET_SIZE_BYTES);
+    return ncclInternalError;
+  }

  collNetHandle_t* netHandle = (collNetHandle_t*) respBuff;
  if (respSize != sizeof(collNetHandle_t)) return ncclInternalError;
@@ -645,14 +660,14 @@ static size_t calcAlgoOffset(struct ncclProxyArgs* args, int isAllNotOne, int su
  return offset;
 }

-static int calcRegionOffset(
+static ssize_t calcRegionOffset(
    struct ncclProxyArgs* args, int isRecvNotSend, int sub, uint64_t step,
    int side // 0=begin, 1=end
  ) {
  struct ncclCollNetSharedRes* collNet = args->subs[0].connection->collNet;
-  int slotSize = collNet->buffSize/NCCL_STEPS;
-  int chunkSize = args->chunkSize;
-  int base = isRecvNotSend*NCCL_STEPS + (step%NCCL_STEPS);
+  ssize_t slotSize = collNet->buffSize/NCCL_STEPS;
+  ssize_t chunkSize = args->chunkSize;
+  ssize_t base = isRecvNotSend*NCCL_STEPS + (step%NCCL_STEPS);
  base *= collNet->nChannels*slotSize;
  if (args->coll == ncclFuncAllReduce) {
    return base + (sub+side)*chunkSize;
@@ -674,6 +689,165 @@ static constexpr int calcStepsPerGroup(int nGroups) {
  return NCCL_STEPS;
 }

+static ncclResult_t collNetRegIallreduce(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, int groupStart, ssize_t *nBytesInOut, void **request) {
+  ssize_t loopSize, winOffset, nBytes;
+  ssize_t eltSize = ncclTypeSize((ncclDataType_t)args->dtype);
+  // Post an iallreduce directly on the registered user buffers (UB), using the mem handles stored in sub.
+  // 1RPN: send and recv buffers are both accessed by the collnet network, so each call issues up to
+  //   resources->maxCollBytes and the buffers advance by exactly the amount issued.
+  // multi-RPN: the operation is pipelined, so each call issues at most *nBytesInOut (groupSize * chunkSize).
+  //   sub->loopOffset is this head rank's data offset within a loop, winOffset the resulting offset into
+  //   the send/recv buffers, and sub->loopSize the bytes sent by all channels and head ranks per loop.
+  // On return *nBytesInOut holds the byte count computed for this call; a NULL *request means nothing was posted.
+  if (sub->isOneRPN) {
+    winOffset = 0;
+    nBytes = std::min((size_t)sub->nbytes, resources->maxCollBytes);
+    loopSize = nBytes;
+  } else {
+    winOffset = sub->loopOffset + groupStart * args->chunkSize;
+    nBytes = std::min(sub->nbytes - winOffset, *nBytesInOut);
+    loopSize = sub->loopSize;
+  }
+
+  if (nBytes > 0) {
+    NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, sub->sendbuff + winOffset, sub->recvbuff + winOffset, nBytes / eltSize, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sub->sendMhandle, sub->recvMhandle, request));
+    if (*request) {
+      TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] registered Iallreduce posted sendbuff %p recvbuff %p size %ld loopSize %ld winOffset %ld isOneRPN %d req %p", (long)sub->transmitted, sub->nsteps, groupStart, sub->sendbuff + winOffset, sub->recvbuff + winOffset, nBytes, loopSize, winOffset, sub->isOneRPN, *request);
+      // Issued successfully: the addresses actually posted are traced above, before the window moves forward by loopSize.
+      sub->nbytes -= loopSize;
+      sub->sendbuff += loopSize;
+      sub->recvbuff += loopSize;
+    }
+  }
+  *nBytesInOut = nBytes;
+  return ncclSuccess;
+}
+
+static ncclResult_t collNetIallreduce(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytes, ssize_t sendBeg, ssize_t recvBeg, void **request) {
+  // Non-UB allreduce: data goes through the intermediate SIMPLE-protocol buffer; sendBeg/recvBeg are the
+  // send and recv offsets into it, and the mem handles come from resources rather than from sub.
+  const ncclDataType_t dtype = (ncclDataType_t)args->dtype;
+  char *staging = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]);
+  const ssize_t count = nBytes / (ssize_t)ncclTypeSize(dtype);
+  NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, staging + sendBeg, staging + recvBeg, count, dtype, (ncclRedOp_t)args->redOp, resources->sendMhandles[NCCL_PROTO_SIMPLE], resources->recvMhandles[NCCL_PROTO_SIMPLE], request));
+  if (*request) {
+    TRACE(NCCL_NET, "sendProxy [%ld/%d] Iallreduce posted size %ld sendBeg %ld recvBeg %ld req %p", (long)sub->transmitted, sub->nsteps, nBytes, sendBeg, recvBeg, *request);
+  }
+  return ncclSuccess;
+}
+
+static ncclResult_t collNetRegIallgather(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytesIn, ssize_t allBeg, ssize_t recvBeg, void *recvMhandle, void **request) {
+  ncclNetSGE_v9_t recvParts;
+  ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank;
+  char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]);
+  ssize_t nBytes;
+  ssize_t winOffset;
+  void *sendbuff;
+  // UB iallgather. The 1RPN path follows the same scheme as UB iallreduce: user buffers are used on both sides.
+  // When not 1RPN, the collnet network can access sendbuff directly but not recvbuff, because the data
+  // received from the network is non-contiguous; it is received into the intermediate buffer "region"
+  // and copied into recvbuff from there.
+  // allBeg (global window offset of the recv buffer), recvBeg and recvMhandle (mem handle for region)
+  // are therefore only used in the multi-RPN case.
+  if (sub->isOneRPN) {
+    nBytes = std::min((size_t)sub->nbytes, resources->maxCollBytes);
+    winOffset = sub->offset;
+    recvParts.mhandle = sub->recvMhandle;
+    recvParts.address = sub->recvbuff;
+  } else {
+    nBytes = nBytesIn;
+    winOffset = allBeg;
+    recvParts.mhandle = recvMhandle;
+    recvParts.address = region + recvBeg;
+  }
+  recvParts.size = nBytes;
+  if (winOffset / sizePerRank == args->specifics.collnetDirect.node) {
+    sendbuff = sub->sendbuff + winOffset % sizePerRank;
+  } else {
+    sendbuff = sub->sendbuff;
+  }
+  NCCLCHECK(proxyState->ncclCollNet->iallgather(resources->collNetComm, sendbuff, 1, &recvParts, sizePerRank, winOffset, nBytes, sub->sendMhandle, request));
+  if (*request) {
+    if (sub->isOneRPN) {
+      sub->recvbuff += nBytes;
+      sub->nbytes -= nBytes;
+      sub->offset += nBytes;
+    }
+    TRACE(NCCL_NET, "sendProxy [%ld/%d] registered Iallgather posted sizePerRank %ld winOffset %ld recvSize %ld isOneRPN %d request %p", (long)sub->transmitted, sub->nsteps, sizePerRank, winOffset, nBytes, sub->isOneRPN, *request);
+  }
+  return ncclSuccess;
+}
+
+static ncclResult_t collNetIallgather(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytes, ssize_t allBeg, ssize_t sendBeg, ssize_t recvBeg, void *sendMhandle, void *recvMhandle, void **request) {
+  ncclNetSGE_v9_t recvParts;
+  ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank;
+  char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]);
+  recvParts.mhandle = recvMhandle;
+  recvParts.address = region + recvBeg;
+  recvParts.size = nBytes;
+  // Non-UB iallgather: the intermediate region buffer is used for both send and recv data.
+  // sendMhandle and recvMhandle are the send and recv mem handles for region, and allBeg is
+  // the global window offset of the recv buffer. sendBeg and recvBeg are offsets into region
+  // for the intermediate data.
+  NCCLCHECK(proxyState->ncclCollNet->iallgather(resources->collNetComm, region + sendBeg, 1, &recvParts, sizePerRank, allBeg, nBytes, sendMhandle, request));
+  if (*request)
+    TRACE(NCCL_NET, "sendProxy [%ld/%d] Iallgather posted sizePerRank %ld winOffset %ld recvSize %ld request %p", (long)sub->transmitted, sub->nsteps, sizePerRank, allBeg, nBytes, *request);
+  return ncclSuccess;
+}
+
+static ncclResult_t collNetRegIreducescatter(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytesIn, ssize_t allBeg, ssize_t sendBeg, void *sendMhandle, void **request) {
+  ncclNetSGE_v9_t sendParts;
+  ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank;
+  char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]);
+  ssize_t nBytes;
+  ssize_t winOffset; // signed like sizePerRank and allBeg, and printed with %ld below
+  void *recvbuff;
+  // UB ireducescatter, the mirror image of UB iallgather: when not 1RPN, the collnet network
+  // can access recvbuff directly but not sendbuff, so data is sent from the intermediate buffer
+  // "region" (sendBeg/sendMhandle) and received straight into recvbuff.
+  if (sub->isOneRPN) {
+    nBytes = std::min((size_t)sub->nbytes, resources->maxCollBytes);
+    winOffset = sub->offset;
+    sendParts.mhandle = sub->sendMhandle;
+    sendParts.address = sub->sendbuff;
+  } else {
+    nBytes = nBytesIn;
+    winOffset = allBeg;
+    sendParts.mhandle = sendMhandle;
+    sendParts.address = region + sendBeg;
+  }
+  sendParts.size = nBytes;
+  if (winOffset / sizePerRank == args->specifics.collnetDirect.node) {
+    recvbuff = sub->recvbuff + winOffset % sizePerRank;
+  } else {
+    recvbuff = sub->recvbuff;
+  }
+  NCCLCHECK(proxyState->ncclCollNet->ireducescatter(resources->collNetComm, 1, &sendParts, recvbuff, sizePerRank, winOffset, nBytes, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sub->recvMhandle, request));
+  if (*request) {
+    if (sub->isOneRPN) {
+      sub->sendbuff += nBytes;
+      sub->nbytes -= nBytes;
+      sub->offset += nBytes;
+    }
+    TRACE(NCCL_NET, "sendProxy [%ld/%d] registered Ireducescatter posted sizePerRank %ld winOffset %ld sendSize %ld isOneRPN %d request %p", (long)sub->transmitted, sub->nsteps, sizePerRank, winOffset, nBytes, sub->isOneRPN, *request);
+  }
+  return ncclSuccess;
+}
+
+static ncclResult_t collNetIreducescatter(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytes, ssize_t allBeg, ssize_t sendBeg, ssize_t recvBeg, void *sendMhandle, void *recvMhandle, void **request) {
+  ncclNetSGE_v9_t sendParts;
+  ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank;
+  char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]);
+  sendParts.mhandle = sendMhandle;
+  sendParts.address = region + sendBeg;
+  sendParts.size = nBytes;
+  // Non-UB ireducescatter: same as non-UB iallgather in the reverse direction; region holds both send (sendBeg) and recv (recvBeg) data.
+  NCCLCHECK(proxyState->ncclCollNet->ireducescatter(resources->collNetComm, 1, &sendParts, region + recvBeg, sizePerRank, allBeg, nBytes, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, recvMhandle, request));
+  if (*request)
+    TRACE(NCCL_NET, "sendProxy [%ld/%d] Ireducescatter posted sizePerRank %ld winOffset %ld sendSize %ld request %p", (long)sub->transmitted, sub->nsteps, sizePerRank, allBeg, nBytes, *request);
+  return ncclSuccess;
+}
+
 static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
  if (args->state == ncclProxyOpReady) {
    for (int s=0; s<args->nsubs; s++) {
@@ -683,6 +857,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
      sub->base = ROUNDUP(resources->step, args->chunkSteps);
      sub->posted = sub->received = sub->transmitted = sub->done = 0;
      resources->step = sub->base + sub->nsteps;
+      // adjust nsteps for registered buffers, as the device signals a single step
+      if (sub->reg && sub->isOneRPN) sub->nsteps = DIVUP((size_t)sub->nbytes, resources->maxCollBytes);
    }
    args->state = ncclProxyOpProgress;
  }
@@ -695,28 +871,30 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
      struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources);
      void* sendMhandle = resources->sendMhandles[p];
      void* recvMhandle = resources->recvMhandles[p];
-      char* region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[p]);
      auto reqFifo = resources->reqFifo;
      int group = s/COLLNET_GROUP_NSUBS;
      int groupStart = s - (s%COLLNET_GROUP_NSUBS);

      if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) {
        int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
-        if (sub->reg == 0) {
+        if (sub->reg == 0 || (!sub->isOneRPN && args->coll == ncclFuncReduceScatter)) {
          resources->recvMem->connFifo[buffSlot].offset = calcRegionOffset(args, 0, s, sub->posted, 0);
          __sync_synchronize();
        }
        volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
-        TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] posted offset %d @ %p signal %ld->%ld", long(sub->posted), group, buffSlot, resources->recvMem->connFifo[buffSlot].offset, &resources->recvMem->connFifo[buffSlot].offset, long(*sendHead), long(sub->base + sub->posted + args->sliceSteps - NCCL_STEPS));
+        TRACE(NCCL_NET, "sendProxy [%ld/%d/%d/%d] posted offset %d @ %p signal %ld->%ld", long(sub->posted), group, buffSlot, sub->nsteps, resources->recvMem->connFifo[buffSlot].offset, &resources->recvMem->connFifo[buffSlot].offset, long(*sendHead), long(sub->base + sub->posted + args->sliceSteps - NCCL_STEPS));
        sub->posted += args->sliceSteps;
-        *sendHead = sub->base + sub->posted - NCCL_STEPS;
+        // Only post one credit for registered buffer
+        if (sub->reg == 0 || !sub->isOneRPN || sub->posted == args->sliceSteps) *sendHead = sub->base + sub->posted - NCCL_STEPS;
        if (resources->gdcSync) wc_store_fence(); // Flush out WC write
      }
      if (sub->received < sub->posted && sub->received < sub->done + calcStepsPerGroup(nGroups)) {
        int buffSlot = (sub->base+sub->received)%NCCL_STEPS;
        volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
        volatile uint64_t* recvTail = &resources->recvMem->tail;
-        if ((connFifo[buffSlot].size != -1 || sub->reg) && ((*recvTail > (sub->base+sub->received)))) {
+        //device progresses tail by only 1 for registered buffers
+        uint64_t tail = sub->base + (sub->reg && sub->isOneRPN ? 0 : sub->received);
+        if ((connFifo[buffSlot].size != -1 || sub->reg) && (*recvTail > tail)) {
          if (args->coll != ncclFuncAllReduce && sub->reg == 0) {
            int sendBeg = calcRegionOffset(args, 0, s, sub->received, 0);
            int sendEnd = calcRegionOffset(args, 0, s, sub->received, 1);
@@ -738,110 +916,42 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
          int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
          if (!reqFifo[group][buffSlot].turnIsSendNotRecv) continue;

-          ssize_t sizePerRank = 0;
-          size_t allBeg = calcAlgoOffset(args, 1, groupStart, sub->transmitted);
-          size_t allEnd = calcAlgoOffset(args, 1, s+1, sub->transmitted);
-          int sendBeg = calcRegionOffset(args, 0, groupStart, sub->transmitted, 0);
-          int sendEnd = calcRegionOffset(args, 0, s, sub->transmitted, 1);
-          int recvBeg = calcRegionOffset(args, 1, groupStart, sub->transmitted, 0);
-          int recvEnd = calcRegionOffset(args, 1, s, sub->transmitted, 1);
+          ssize_t allBeg = calcAlgoOffset(args, 1, groupStart, sub->transmitted);
+          ssize_t allEnd = calcAlgoOffset(args, 1, s+1, sub->transmitted);
+          ssize_t sendBeg = calcRegionOffset(args, 0, groupStart, sub->transmitted, 0);
+          ssize_t sendEnd = calcRegionOffset(args, 0, s, sub->transmitted, 1);
+          ssize_t recvBeg = calcRegionOffset(args, 1, groupStart, sub->transmitted, 0);
+          ssize_t recvEnd = calcRegionOffset(args, 1, s, sub->transmitted, 1);
          reqFifo[group][buffSlot].size = recvEnd - recvBeg;
-          size_t eltSize = ncclTypeSize((ncclDataType_t)args->dtype);

-          if (sendBeg==sendEnd && recvBeg==recvEnd && sub->reg == 0) {
+          if (sendBeg==sendEnd && recvBeg==recvEnd) {
            sub->requests[buffSlot] = nullptr; // trivally finished request
          } else {
+            ssize_t nBytes = 0;
            if (args->coll == ncclFuncAllReduce) {
+              nBytes = sendEnd - sendBeg;
              if (sub->reg) {
-                size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE);
-                int count = (int)(nBytes / eltSize);
-                NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, sub->sendbuff, sub->recvbuff, count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sub->sendMhandle, sub->recvMhandle, sub->requests + buffSlot));
-                if (sub->requests[buffSlot]) {
-                  sub->nbytes -= nBytes;
-                  sub->sendbuff += nBytes;
-                  sub->recvbuff += nBytes;
-                }
+                NCCLCHECK(collNetRegIallreduce(proxyState, resources, args, sub, groupStart, &nBytes, &sub->requests[buffSlot]));
              } else {
-                int count = (sendEnd - sendBeg) / eltSize;
-                NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, region + sendBeg, region + recvBeg, count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests + buffSlot));
+                NCCLCHECK(collNetIallreduce(proxyState, resources, args, sub, nBytes, sendBeg, recvBeg, &sub->requests[buffSlot]));
              }
-            } else {
-              sizePerRank = args->specifics.collnetDirect.sizePerRank;
-              if (args->coll == ncclFuncAllGather) {
-                ncclNetSGE_v8_t recvParts;
-                if (sub->reg) {
-                  size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE);
-                  void *sendbuff;
-                  recvParts.mhandle = sub->recvMhandle;
-                  recvParts.address = sub->recvbuff;
-                  recvParts.size = nBytes;
-                  if (sub->offset / sizePerRank == args->specifics.collnetDirect.node) {
-                    sendbuff = sub->sendbuff + sub->offset % sizePerRank;
-                  } else {
-                    sendbuff = sub->sendbuff;
-                  }
-                  NCCLCHECK(proxyState->ncclCollNet->iallgather(
-                    resources->collNetComm, sendbuff, 1, &recvParts,
-                    sizePerRank, sub->offset, nBytes,
-                    sub->sendMhandle, sub->requests + buffSlot));
-                  if (sub->requests[buffSlot]) {
-                    sub->recvbuff += nBytes;
-                    sub->nbytes -= nBytes;
-                    sub->offset += nBytes;
-                  }
-                } else {
-                  recvParts.mhandle = recvMhandle;
-                  recvParts.address = region + recvBeg;
-                  recvParts.size = allEnd - allBeg;
-                  NCCLCHECK(proxyState->ncclCollNet->iallgather(
-                    resources->collNetComm, region + sendBeg, 1, &recvParts,
-                    sizePerRank, allBeg, allEnd - allBeg,
-                    sendMhandle, sub->requests + buffSlot));
-                }
-              } else {
-                ncclNetSGE_v8_t sendParts;
-                if (sub->reg) {
-                  size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE);
-                  void *recvbuff;
-                  sendParts.mhandle = sub->sendMhandle;
-                  sendParts.address = sub->sendbuff;
-                  sendParts.size = nBytes;
-                  if (sub->offset / sizePerRank == args->specifics.collnetDirect.node) {
-                    recvbuff = sub->recvbuff + sub->offset % sizePerRank;
-                  } else {
-                    recvbuff = sub->recvbuff;
-                  }
-                  NCCLCHECK(proxyState->ncclCollNet->ireducescatter(
-                    resources->collNetComm, 1, &sendParts, recvbuff,
-                    sizePerRank, sub->offset, nBytes,
-                    (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp,
-                    sub->recvMhandle, sub->requests + buffSlot));
-                  if (sub->requests[buffSlot]) {
-                    sub->sendbuff += nBytes;
-                    sub->nbytes -= nBytes;
-                    sub->offset += nBytes;
-                  }
-                } else {
-                  sendParts.mhandle = sendMhandle;
-                  sendParts.address = region + sendBeg;
-                  sendParts.size = allEnd - allBeg;
-                  NCCLCHECK(proxyState->ncclCollNet->ireducescatter(
-                    resources->collNetComm, 1, &sendParts, region + recvBeg,
-                    sizePerRank, allBeg, allEnd - allBeg,
-                    (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp,
-                    recvMhandle, sub->requests + buffSlot));
-                }
-              }
-            }
-            if (sub->requests[buffSlot] == nullptr) continue;
-
-            if (args->coll == ncclFuncAllReduce) {
-              TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Iallreduce posted, size %d req %p", (long)sub->transmitted, group, buffSlot, int(sendEnd-sendBeg), sub->requests[buffSlot]);
            } else if (args->coll == ncclFuncAllGather) {
-              TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Iallgather posted sendSize=%ld recvOffset=%ld recvSize=%ld request=%p", (long)sub->transmitted, group, buffSlot, long(sizePerRank), long(allBeg), long(allEnd-allBeg), sub->requests[buffSlot]);
+              nBytes = allEnd - allBeg;
+              if (sub->reg) {
+                NCCLCHECK(collNetRegIallgather(proxyState, resources, args, sub, nBytes, allBeg, recvBeg, recvMhandle, &sub->requests[buffSlot]));
+              } else {
+                NCCLCHECK(collNetIallgather(proxyState, resources, args, sub, nBytes, allBeg, sendBeg, recvBeg, sendMhandle, recvMhandle, &sub->requests[buffSlot]));
+              }
            } else {
-              TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Ireducescatter posted sendOffset=%ld sendSize=%ld recvSize=%ld request=%p", (long)sub->transmitted, group, buffSlot, long(allBeg), long(allEnd-allBeg), long(sizePerRank), sub->requests[buffSlot]);
+              // reducescatter
+              nBytes = allEnd - allBeg;
+              if (sub->reg) {
+                NCCLCHECK(collNetRegIreducescatter(proxyState, resources, args, sub, nBytes, allBeg, sendBeg, sendMhandle, &sub->requests[buffSlot]));
+              } else {
+                NCCLCHECK(collNetIreducescatter(proxyState, resources, args, sub, nBytes, allBeg, sendBeg, recvBeg, sendMhandle, recvMhandle, &sub->requests[buffSlot]));
+              }
            }
+            if (nBytes > 0 && sub->requests[buffSlot] == nullptr) continue;
          }
        }
        sub->transmitted += args->sliceSteps;
@@ -875,6 +985,52 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
  return ncclSuccess;
 }

+static ncclResult_t collNetRecvFlush(struct ncclProxyState* proxyState, struct recvResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, int groupStart, ssize_t nBytesIn, ssize_t recvBeg, void **request) { // issues one collnet iflush; the request handle is written to *request
+  char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); // intermediate SIMPLE-protocol buffer, only used by the else branch below
+  if (sub->reg && (sub->isOneRPN || args->coll != ncclFuncAllGather)) { // registered path: flush sub->recvbuff with sub->recvMhandle
+    ssize_t nBytes, loopSize;
+    ssize_t offset = sub->offset + groupStart * args->chunkSize;
+    if (sub->isOneRPN) {
+      nBytes = std::min((size_t)sub->nbytes, resources->maxCollBytes); // 1RPN: flush up to maxCollBytes and advance by the same amount
+      loopSize = nBytes;
+    } else {
+      nBytes = std::min(sub->nbytes - sub->loopOffset, nBytesIn); // multi-RPN: flush at most nBytesIn but advance by a full loop
+      loopSize = sub->loopSize;
+    }
+    if (nBytes > 0) {
+      if (args->coll == ncclFuncReduceScatter) { // restrict the flush to the part of [groupStartOffset, groupEndOffset) inside this node's sizePerRank slice
+        ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank;
+        ssize_t groupStartOffset = sub->offset + groupStart * args->chunkSize;
+        ssize_t groupEndOffset = groupStartOffset + nBytes;
+        int node = args->specifics.collnetDirect.node;
+        int startNode = groupStartOffset / sizePerRank;
+        int lastNode = groupEndOffset / sizePerRank;
+        if (startNode == node) { // range starts inside this node's slice
+          offset = groupStartOffset % sizePerRank;
+          nBytes = std::min(sizePerRank - offset, nBytes);
+        } else if (startNode < node && node < lastNode) { // this node's slice is fully covered by the range
+          offset = 0;
+          nBytes = sizePerRank;
+        } else if (node == lastNode) { // range ends inside this node's slice
+          offset = 0;
+          nBytes = groupEndOffset % sizePerRank;
+        } else {
+          // range does not touch this node's slice: dummy flush at offset 0 with nBytes left unchanged
+          offset = 0;
+        }
+      }
+      NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, sub->recvbuff + offset + sub->loopOffset, nBytes, sub->recvMhandle, request));
+      if (*request) { // only consume the window once a flush request was actually returned
+        sub->nbytes -= loopSize;
+        sub->offset += loopSize;
+      }
+    }
+  } else { // unregistered buffers, or registered multi-RPN allgather: flush the intermediate buffer
+    NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, region + recvBeg, nBytesIn, resources->mhandles[NCCL_PROTO_SIMPLE], request));
+  }
+  return ncclSuccess;
+}
+
 static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
  if (args->state == ncclProxyOpReady) {
    for (int s=0; s<args->nsubs; s++) {
@@ -884,22 +1040,21 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
      sub->base = ROUNDUP(resources->step, args->chunkSteps);
      sub->posted = sub->received = sub->flushed = sub->transmitted = sub->done = 0;
      resources->step = sub->base + sub->nsteps;
+      // adjust nsteps for registered buffers, as the device signals a single step
+      if (sub->reg && sub->isOneRPN) sub->nsteps = DIVUP((size_t)sub->nbytes, resources->maxCollBytes);
      memset(sub->requests, 0, sizeof(sub->requests));
    }
    args->state = ncclProxyOpProgress;
  }
  args->idle = 1;
  if (args->state == ncclProxyOpProgress) {
-    int p = NCCL_PROTO_SIMPLE;
    int nGroups = DIVUP(args->nsubs, COLLNET_GROUP_NSUBS);
    for (int s=0; s<args->nsubs; s++) {
      int group = s/COLLNET_GROUP_NSUBS;
      int groupStart = s - (s%COLLNET_GROUP_NSUBS);
      struct ncclProxySubArgs* sub = args->subs+s;
      struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
-      void* mhandle = resources->mhandles[p];
      auto reqFifo = resources->reqFifo;
-      char* region = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);

      // Enforce sync between operations of the same group.
      if (LAST_OF_GROUP(args, s) && (sub->posted < sub->done + calcStepsPerGroup(nGroups)) && (sub->posted < sub->nsteps)) {
@@ -913,10 +1068,10 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
      if (LAST_OF_GROUP(args, s) && (sub->received < sub->posted)) {
        int buffSlot = (sub->base+sub->received)%NCCL_STEPS;
        if (!reqFifo[group][buffSlot].turnIsSendNotRecv) { // Buffer is cleared : coll is complete
-          int recvBeg = calcRegionOffset(args, 1, groupStart, sub->received, 0);
-          int recvEnd = calcRegionOffset(args, 1, s, sub->received, 1);
-          int totalSize = recvEnd - recvBeg;
-          TRACE(NCCL_NET, "recvProxy [%ld/%d/%d] received, size %d chunkSize=%d", (long)sub->received, group, buffSlot, totalSize, args->chunkSize);
+          ssize_t recvBeg = calcRegionOffset(args, 1, groupStart, sub->received, 0);
+          ssize_t recvEnd = calcRegionOffset(args, 1, s, sub->received, 1);
+          ssize_t totalSize = recvEnd - recvBeg;
+          TRACE(NCCL_NET, "recvProxy [%ld/%d/%d] received, size %ld chunkSize=%ld", (long)sub->received, group, buffSlot, totalSize, args->chunkSize);
          sub->received += args->sliceSteps;
          if ((reqFifo[group][buffSlot].size > 0 || sub->reg) && resources->useGdr && resources->needFlush) {
            // GDRCOPY support
@@ -929,37 +1084,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
              return ncclInternalError;
 #endif
            } else {
-              if (sub->reg) {
-                size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE);
-                size_t offset = 0;
-                if (args->coll == ncclFuncReduceScatter) {
-                  size_t sizePerRank = args->specifics.collnetDirect.sizePerRank;
-                  int node = args->specifics.collnetDirect.node;
-                  int startNode = sub->offset / sizePerRank;
-                  int lastNode = (sub->offset + nBytes) / sizePerRank;
-                  if (startNode == node) {
-                    offset = sub->offset % sizePerRank;
-                    nBytes = std::min(sizePerRank - offset, nBytes);
-                  } else if (startNode < node && node < lastNode) {
-                    nBytes = sizePerRank;
-                  } else if (node == lastNode) {
-                    nBytes = (sub->offset + nBytes) % sizePerRank;
-                  } else {
-                    // no need to flush
-                    nBytes = 0;
-                  }
-                }
-                NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, sub->recvbuff + offset, nBytes, sub->recvMhandle, sub->requests+buffSlot));
-                if (sub->requests[buffSlot]) {
-                  sub->nbytes -= nBytes;
-                  sub->offset += nBytes;
-                  if (args->coll == ncclFuncAllGather || args->coll == ncclFuncAllReduce) {
-                    sub->recvbuff += nBytes;
-                  }
-                }
-              } else {
-                NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, region+recvBeg, totalSize, mhandle, sub->requests+buffSlot));
-              }
+              NCCLCHECK(collNetRecvFlush(proxyState, resources, args, sub, groupStart, totalSize, recvBeg, &sub->requests[buffSlot]));
            }
          }
          args->idle = 0;
@@ -980,14 +1105,19 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
        }
      }
      if (sub->transmitted < sub->flushed) {
-        if (sub->reg == 0) {
+        if (sub->reg == 0 || (!sub->isOneRPN && args->coll == ncclFuncAllGather)) {
          int buffSlot = (sub->base + sub->transmitted)%NCCL_STEPS;
          volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
          connFifo[buffSlot].offset = calcRegionOffset(args, 1, s, sub->transmitted, 0);
          __sync_synchronize();
        }
        volatile uint64_t* recvTail = resources->gdcSync ? resources->gdcSync : &resources->recvMem->tail;
-        *recvTail = sub->base + sub->flushed;
+        if (sub->reg && sub->isOneRPN) {
+          // We may have bumped net steps, but reg operations only have a single step w.r.t. the GPU.
+          if (sub->flushed == sub->nsteps) *recvTail = sub->base + args->sliceSteps;
+        } else {
+          *recvTail = sub->base + sub->flushed;
+        }
        if (resources->gdcSync) wc_store_fence(); // Flush out WC write
        sub->transmitted += args->sliceSteps;
        args->idle = 0;
@@ -999,7 +1129,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
      bool groupSync = s==0 ? args->subs[args->nsubs-1].done == sub->done
                            : (sub-1)->done > sub->done;
      volatile uint64_t* sendHead = &resources->sendMem->head;
-      if (groupSync && sub->done < sub->transmitted && (sub->base+sub->done) < *sendHead) {
+      int done = sub->reg && sub->isOneRPN ? 0 : sub->done;
+      if (groupSync && sub->done < sub->transmitted && sub->base + done < *sendHead) {
        sub->done += args->sliceSteps;
        args->idle = 0;
        if (sub->done == sub->nsteps && s == args->nsubs-1) {
@@ -1017,24 +1148,22 @@ struct collnetRegInfo {
  size_t size;
 };

-ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle) {
+static ncclResult_t collnetRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, struct ncclReg* regRecord, int* outRegBufFlag, void** outHandle) {
  ncclResult_t ret = ncclSuccess;
-  struct ncclReg *regRecord = NULL;
+  if (regRecord) {
+    if (regRecord->state & COLLNET_REG_COMPLETE) {
+      // reuse previous registration
+      *outRegBufFlag = 2;
+      *outHandle = regRecord->collnetHandle;
+      INFO(NCCL_REG, "rank %d - COLLNET reuse register userbuff %p (handle %p), buffSize %ld, type %s", comm->rank, userbuff, regRecord->collnetHandle, buffSize, type == collNetRecv ? "Recv" : "Send");
+      goto exit;
+    } else {
+      /* start register collnet buffer */
+      struct collnetRegInfo info = { regRecord->addr, regRecord->pages * comm->regCache.pageSize };
+      void* handle = NULL;
+      struct ncclConnInfo* conn = (type == collNetRecv) ? &comm->channels[0].peers[comm->nRanks]->recv[type].conn : &comm->channels[0].peers[comm->nRanks]->send[type].conn;

-  *outRegBufFlag = 0;
-  *outHandle = NULL;
-  if (comm && userbuff && buffSize > 0) {
-    NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, &regRecord), ret, fail);
-    if (regRecord) {
-      if (regRecord->state & COLLNET_REG_COMPLETE) {
-        // reuse previous registration
-        *outRegBufFlag = 2;
-        *outHandle = regRecord->collnetHandle;
-        goto exit;
-      } else {
-        /* start register collnet buffer */
-        struct collnetRegInfo info = {regRecord->addr, regRecord->pages * comm->regCache.pageSize};
-        void* handle = NULL;
+      if (conn->flags & NCCL_DIRECT_NIC) {
        struct ncclProxyConnector* proxyconn = (type == collNetRecv) ? &comm->channels[0].peers[comm->nRanks]->recv[type].proxyConn : &comm->channels[0].peers[comm->nRanks]->send[type].proxyConn;
        NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyconn, ncclProxyMsgRegister, &info, sizeof(struct collnetRegInfo), &handle, sizeof(void*)), ret, fail);
        if (handle) {
@@ -1042,10 +1171,78 @@ ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* u
          regRecord->collnetProxyconn = proxyconn;
          *outHandle = regRecord->collnetHandle = handle;
          *outRegBufFlag = 1;
+          INFO(NCCL_REG, "rank %d - COLLNET register userbuff %p (handle %p), buffSize %ld, type %s", comm->rank, userbuff, handle, buffSize, type == collNetRecv ? "Recv" : "Send");
        }
+      } else {
+        WARN("rank %d - COLLNET failed to register userbuff %p (handle %p), buffSize %ld, type %s, GDR is not enabled", comm->rank, userbuff, handle, buffSize, type == collNetRecv ? "Recv" : "Send");
      }
    }
  }
+exit:
+  return ret;
+fail:
+  *outRegBufFlag = 0;
+  *outHandle = NULL;
+  goto exit;
+}
+
+ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle) {
+  struct ncclReg *cached = NULL;   // record filled in by ncclRegFind(), stays NULL if it reports none
+  bool usable = false;             // verdict written by ncclRegLocalIsValid()
+  ncclResult_t ret = ncclSuccess;
+  // Default outputs: buffer not registered, no handle.
+  *outRegBufFlag = 0;
+  *outHandle = NULL;
+  if (comm == NULL || userbuff == NULL || buffSize == 0) goto exit;  // nothing to look up
+  NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, &cached), ret, fail);
+  NCCLCHECKGOTO(ncclRegLocalIsValid(cached, &usable), ret, fail);
+  if (usable) {
+    NCCLCHECKGOTO(collnetRegisterBuffer(comm, userbuff, buffSize, type, cached, outRegBufFlag, outHandle), ret, fail);
+  }
+exit:
+  return ret;
+fail:
+  *outRegBufFlag = 0;
+  goto exit;
+}
+
+struct ncclCollnetCleanupCallback {  // record that ncclCollnetGraphRegisterBuffer mallocs and puts on the cleanup queue
+  struct ncclCommCallback base;  // first member: the record is enqueued through a cast to struct ncclCommCallback*
+  struct ncclComm *comm;  // communicator passed to ncclCommGraphDeregister() by cleanupCollnet
+  struct ncclReg *reg;  // registration record passed to ncclCommGraphDeregister() by cleanupCollnet
+};
+
+static ncclResult_t cleanupCollnet(struct ncclComm* comm, struct ncclCommCallback* cb) {  // cleanup-queue callback: deregister one graph buffer and release its record
+  struct ncclCollnetCleanupCallback record = *(struct ncclCollnetCleanupCallback*)cb;  // take a copy of comm/reg before the heap record goes away
+  free(cb);  // cb is the malloc'ed record itself (base is its first member); freeing up front means a failed deregistration no longer leaks it
+  NCCLCHECK(ncclCommGraphDeregister(record.comm, record.reg));  // uses the communicator stored at registration time, not the comm argument
+  return ncclSuccess;
+}
+
+ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, int* nCleanupQueueElts) {
+  ncclResult_t ret = ncclSuccess;
+  struct ncclCollnetCleanupCallback* record = NULL;
+  struct ncclReg *regRecord = NULL;
+  void *baseSend = NULL;
+  size_t baseSendSize = 0;
+
+  *outRegBufFlag = 0;
+  if (comm && userbuff && buffSize > 0) {
+    CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)userbuff), ret, fail);
+    NCCLCHECKGOTO(ncclCommGraphRegister(comm, baseSend, baseSendSize, (void**)&regRecord), ret, fail);
+    NCCLCHECKGOTO(collnetRegisterBuffer(comm, userbuff, buffSize, type, regRecord, outRegBufFlag, outHandle), ret, fail);
+
+    if (*outRegBufFlag) {
+      record = (struct ncclCollnetCleanupCallback*)malloc(sizeof(struct ncclCollnetCleanupCallback));
+      record->base.fn = cleanupCollnet;
+      record->comm = comm;
+      record->reg = regRecord;
+      ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)record);
+      *nCleanupQueueElts += 1;
+    } else {
+      NCCLCHECKGOTO(ncclCommGraphDeregister(comm, regRecord), ret, fail);
+    }
+  }

 exit:
  return ret;
@@ -1055,55 +1252,9 @@ fail:
  goto exit;
 }

-struct ncclCollnetCleanupCallback {
-  struct ncclCommCallback base;
-  struct ncclProxyConnector* proxyConn;
-  void* buffer;
-  size_t size;
-  void* mhandle;
-};
-
-static ncclResult_t cleanupCollnet(struct ncclComm* comm, struct ncclCommCallback* cb) {
-  struct ncclCollnetCleanupCallback* obj = (struct ncclCollnetCleanupCallback*)cb;
-  NCCLCHECK(ncclCollnetDeregBuffer(comm, obj->proxyConn, obj->mhandle));
-  INFO(NCCL_REG, "rank %d - deregistered collnet buffer handle %p, size %ld, buff %p", comm->rank, obj->mhandle, obj->size, obj->buffer);
-  free(obj);
-  return ncclSuccess;
-}
-
-ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, int* nCleanupQueueElts) {
-  ncclResult_t ret = ncclSuccess;
-  void* handle = NULL;
-  struct ncclRegCache* cache = &comm->regCache;
-  uintptr_t pageSize = cache->pageSize;
-  uintptr_t addr = (uintptr_t)userbuff & -pageSize;
-  size_t size = DIVUP((uintptr_t)userbuff - addr + buffSize, pageSize) * pageSize;
-  collnetRegInfo info = {addr, size};
-  struct ncclCollnetCleanupCallback* record = NULL;
-  struct ncclProxyConnector* proxyConn = (type == collNetRecv) ? &comm->channels[0].peers[comm->nRanks]->recv[type].proxyConn : &comm->channels[0].peers[comm->nRanks]->send[type].proxyConn;
-
-  *outRegBufFlag = 0;
-  NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &info, sizeof(struct collnetRegInfo), &handle, sizeof(void*)), ret, fail);
-  record = (struct ncclCollnetCleanupCallback*)malloc(sizeof(struct ncclCollnetCleanupCallback));
-  record->base.fn = cleanupCollnet;
-  record->proxyConn = proxyConn;
-  record->buffer = (void*)userbuff;
-  record->size = buffSize;
-  *outHandle = record->mhandle = handle;
-  *outRegBufFlag = 1;
-  ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)record);
-  *nCleanupQueueElts += 1;
-
-exit:
-  return ret;
-fail:
-  *outRegBufFlag = 0;
-  *outHandle = NULL;
-  goto exit;
-}
-
 ncclResult_t ncclCollnetDeregBuffer(struct ncclComm* comm, struct ncclProxyConnector* proxyconn, void* handle) {
  NCCLCHECK(ncclProxyCallBlocking(comm, proxyconn, ncclProxyMsgDeregister, &handle, sizeof(void*), NULL, 0));
+  INFO(NCCL_REG, "rank %d - COLLNET deregistered buffer handle %p", comm->rank, handle);  // logged after the blocking ncclProxyMsgDeregister call above
  return ncclSuccess;
 }

@@ -1111,26 +1262,67 @@ static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, s
  void* handle;
  struct collnetRegInfo* info = (struct collnetRegInfo*)reqBuff;
  struct sendResources* resources = (struct sendResources*)(connection->transportResources);
+  ncclResult_t ret = ncclSuccess;
+  bool needReg = true;  // cleared once the DMA-BUF path has produced a handle

  assert(reqSize == sizeof(struct collnetRegInfo));
  assert(respSize == sizeof(void*));
-  if (proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle) != ncclSuccess) handle = NULL;
+  // Try DMA-BUF registration first when enabled; any failure there falls back to plain regMr at peermem.
+#if CUDART_VERSION >= 11070
+  /* DMA-BUF support */
+  if (resources->useGdr && resources->useDmaBuf) {
+    int dmabuf_fd;
+    CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem);
+    ncclResult_t dmabufRes = proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle);
+    (void)close(dmabuf_fd);  // closed before the result is checked, so a failed registration does not leak the fd
+    NCCLCHECKGOTO(dmabufRes, ret, peermem);
+    needReg = false;
+  }
+#endif
+peermem:
+  if (needReg) {
+    NCCLCHECKGOTO(proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail);
+  }
+exit:
  memcpy(respBuff, (void*)&handle, sizeof(void*));
  *done = 1;
  return ncclSuccess;
+fail:
+  handle = NULL;  // a NULL handle in the response is how the failure is reported; the function itself still returns ncclSuccess
+  goto exit;
 }

 static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  void* handle;
  struct collnetRegInfo* info = (struct collnetRegInfo*)reqBuff;
  struct recvResources* resources = (struct recvResources*)(connection->transportResources);
+  ncclResult_t ret = ncclSuccess;
+  bool needReg = true;  // cleared once the DMA-BUF path has produced a handle

  assert(reqSize == sizeof(struct collnetRegInfo));
  assert(respSize == sizeof(void*));
-  if (proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle) != ncclSuccess) handle = NULL;
+#if CUDART_VERSION >= 11070
+  /* DMA-BUF support: tried first when enabled; any failure falls back to plain regMr at peermem */
+  if (resources->useGdr && resources->useDmaBuf) {
+    int dmabuf_fd;
+    CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem);
+    ncclResult_t dmabufRes = proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle);
+    (void)close(dmabuf_fd);  // closed before the result is checked, so a failed registration does not leak the fd
+    NCCLCHECKGOTO(dmabufRes, ret, peermem);
+    needReg = false;
+  }
+#endif
+peermem:
+  if (needReg) {
+    NCCLCHECKGOTO(proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail);
+  }
+exit:
  memcpy(respBuff, (void*)&handle, sizeof(void*));
  *done = 1;
  return ncclSuccess;
+fail:
+  handle = NULL;  // a NULL handle in the response is how the failure is reported; the function itself still returns ncclSuccess
+  goto exit;
 }

 static ncclResult_t sendProxyDeregBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done) {
@@ -1155,13 +1347,6 @@ static ncclResult_t recvProxyDeregBuffer(struct ncclProxyConnection* connection,
  return ncclSuccess;
 }

-struct ncclTransport collNetTransport = {
-  "COL",
-  canConnect,
-  { sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, sendProxyRegBuffer, sendProxyDeregBuffer },
-  { recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, recvProxyRegBuffer, recvProxyDeregBuffer }
-};
-
 ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm) {
  ncclResult_t ret = ncclSuccess;
  char line[1024];
@@ -1197,7 +1382,6 @@ fail:

 ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm) {
  ncclResult_t ret = ncclSuccess;
-  int highestTransportType0 = TRANSPORT_UNDEFINED, highestTransportType1 = TRANSPORT_UNDEFINED;

  if (comm->collNetSupport == 0) goto exit;

@@ -1206,13 +1390,13 @@ ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm) {
    struct ncclChannel* channelRecv = comm->channels + c;
    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.down, 0), ret, fail);
  }
-  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 0, &highestTransportType0), ret, fail);
+  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 0), ret, fail);

  for (int c = 0; c < comm->nChannels; c++) {
    struct ncclChannel* channelSend = comm->channels + c;
    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.down, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.up, 1), ret, fail);
  }
-  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 1, &highestTransportType1), ret, fail);
+  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 1), ret, fail);

  INFO(NCCL_INIT, "rank %d Connected CollNet", comm->rank);

@@ -1410,3 +1594,10 @@ fail:
  comm->collNetSupport = 0;
  goto exit;
 }
+
+struct ncclTransport collNetTransport = {  // transport descriptor wiring up the COLLNET handlers of this file
+  "COL",
+  canConnect,
+  { sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, sendProxyRegBuffer, sendProxyDeregBuffer },  // send-side handlers
+  { recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, recvProxyRegBuffer, recvProxyDeregBuffer }  // recv-side handlers
+};
@@ -1,17 +1,37 @@
 #include "comm.h"
 #include "transport.h"
+#include "bootstrap.h"

 ncclResult_t ncclTransportRingConnect(struct ncclComm* comm) {
+  struct ringConnInfo {  // per-rank flags exchanged through bootstrapAllGather below
+    bool useNetPXN;
+    bool useGdr;
+  };
+  struct ringConnInfo* ringInfo = NULL;  // one slot per rank; released at exit:
  ncclResult_t ret = ncclSuccess;
  if (comm && comm->nRanks > 1) {
+    comm->useGdr = true;  // starting values, set before the connect/setup calls below
+    comm->useNetPXN = false;
    for (int c = 0; c < comm->nChannels; c++) {
      struct ncclChannel* channel = comm->channels + c;
      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, fail);
    }
    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_RING], 0), ret, fail);
-    INFO(NCCL_INIT, "Connected all rings");
+    if (ncclParamLocalRegister() || ncclParamGraphRegister()) {
+      NCCLCHECKGOTO(ncclCalloc(&ringInfo, comm->nRanks), ret, fail);  // routed through fail/exit like every other call in this function
+      ringInfo[comm->rank].useGdr = comm->useGdr;
+      ringInfo[comm->rank].useNetPXN = comm->useNetPXN;
+      NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, ringInfo, sizeof(struct ringConnInfo)), ret, fail);
+      for (int i = 0; i < comm->nRanks; ++i) {
+        if (!ringInfo[i].useGdr) comm->useGdr = false;  // GDR stays on only if every rank reports it
+        if (ringInfo[i].useNetPXN) comm->useNetPXN = true;  // PXN is flagged as soon as any rank reports it
+        if (comm->useGdr == false && comm->useNetPXN == true) break;  // neither flag can change any further
+      }
+    }
+    INFO(NCCL_INIT, "Connected all rings, use ring PXN %d GDR %d", comm->useNetPXN, comm->useGdr);
  }
 exit:
+  free(ringInfo);  // free(NULL) is a no-op when the exchange was skipped or the allocation failed
  return ret;
 fail:
  goto exit;
@@ -15,6 +15,7 @@
 #include "profiler.h"
 #include "transport.h"
 #include "shm.h"
+#include <assert.h>

 static_assert(sizeof(ncclNetHandle_t) <= CONNECT_SIZE, "NET Connect info is too large");

@@ -107,6 +108,7 @@ struct sendNetResources {
  int netDeviceVersion;
  ncclNetDeviceType netDeviceType;
  ncclNetDeviceHandle_t* netDeviceHandle;
+  size_t maxP2pBytes;
 };

 struct recvNetResources {
@@ -139,6 +141,12 @@ struct recvNetResources {
  int netDeviceVersion;
  ncclNetDeviceType netDeviceType;
  ncclNetDeviceHandle_t* netDeviceHandle;
+  size_t maxP2pBytes;
+};
+
+struct netRegInfo {  // NOTE(review): presumably the NET counterpart of collnetRegInfo (buffer registration request) - confirm against its users
+  uintptr_t buffer;  // buffer address carried as an integer
+  size_t size;  // presumably the buffer length in bytes - confirm
 };

 /* Determine if two peers can communicate with NET */
@@ -166,6 +174,9 @@ struct setupReq {
  int connIndex;
 };

+NCCL_PARAM(NetOptionalRecvCompletion, "NET_OPTIONAL_RECV_COMPLETION", 1);
+
+static_assert(sizeof(ncclNetHandle_t) + sizeof(int) <= CONNECT_SIZE, "Not large enough ncclConnect to hold ncclNetHandle_t and useGdr flag");
 // Forward declaration
 static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args);

@@ -181,8 +192,10 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
  int proxyRank;
  int64_t netId;
  NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &netId, &req.netDev, &proxyRank));
-  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr));
+  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 1, &req.useGdr));
  send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
+  if (!req.useGdr && connIndex == 0) comm->useGdr = 0;
+  if (proxyRank != myInfo->rank && connIndex == 0) comm->useNetPXN = true;

  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, proxyRank, &send->proxyConn));
  req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
@@ -198,6 +211,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
        proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
  }
  *((int*)connectInfo) = comm->topParentRanks[proxyRank];
+  memcpy((uint8_t*)connectInfo + sizeof(ncclNetHandle_t), &req.useGdr, sizeof(int));
  return ncclSuccess;
 }

@@ -218,10 +232,12 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
  int proxyRank;
  int64_t netId;
  NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &netId, &req.netDev, &proxyRank));
-  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 0, &req.useGdr));
+  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 0, &req.useGdr));
+  recv->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
+  if (!req.useGdr && connIndex == 0) comm->useGdr = 0;

  // Determine whether we need to flush the GDR buffer on recv or not
-  if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
+  if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm, req.netDev, myInfo->rank, &req.needFlush));

  // We don't support PXN on receive yet
  NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, myInfo->rank, &recv->proxyConn));
@@ -230,6 +246,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
  req.tpRank = comm->topParentRanks[myInfo->rank];
  req.tpRemoteRank = comm->topParentRanks[peerInfo->rank];
  NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
+  memcpy((uint8_t*)connectInfo + sizeof(ncclNetHandle_t), &req.useGdr, sizeof(int));
  INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->nvmlDev, myInfo->rank, myInfo->nvmlDev, comm->ncclNet->name, req.netDev,
      req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
  return ncclSuccess;
@@ -283,8 +300,11 @@ struct netRecvConnectArgs {

 static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
  struct connectMap* map = (connectMap*) send->transportResources;
-
  void* opId;
+  int recvUseGdr;
+
+  memcpy(&recvUseGdr, (uint8_t*)connectInfo + sizeof(ncclNetHandle_t), sizeof(int));
+  if (!recvUseGdr) send->conn.flags &= ~NCCL_DIRECT_NIC;

  // map isn't allocated thus this op hasn't been submitted yet
  if (!map) {
@@ -391,6 +411,11 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
 static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
  struct connectMap* map = (connectMap*) recv->transportResources;
  void* opId;
+  int sendUseGdr;
+
+  memcpy(&sendUseGdr, (uint8_t*)connectInfo + sizeof(ncclNetHandle_t), sizeof(int));
+  if (!sendUseGdr) recv->conn.flags &= ~NCCL_DIRECT_NIC;
+
  if (!map) {
    NCCLCHECK(ncclCalloc(&map, 1));
    recv->transportResources = map;
@@ -522,7 +547,7 @@ static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int
  return ncclSuccess;
 }

-static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int channel, int slot, int* offset, int* size) {
+static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int channel, int slot, int* offset, size_t* size) {
  // Use different pools for different channels and also separate send/recv.
  int globalSlot = (channel*NCCL_SHARED_STEPS)+slot;
  *offset = proxyState->p2pChunkSize * globalSlot;
@@ -590,6 +615,13 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc

  resources->netDeviceVersion = props.netDeviceVersion;
  resources->netDeviceType = props.netDeviceType;
+  /* point-to-point size limits*/
+  resources->maxP2pBytes = props.maxP2pBytes;
+  if((resources->maxP2pBytes <= 0) || (resources->maxP2pBytes > NCCL_MAX_NET_SIZE_BYTES)) {
+    WARN("sendProxySetup: net plugin returned invalid value for maxP2pBytes %ld \
+      [allowed range: %ld - %ld] \n", resources->maxP2pBytes, 0L, NCCL_MAX_NET_SIZE_BYTES);
+    return ncclInternalError;
+  }

  // We don't return any data
  if (respSize != 0) return ncclInternalError;
@@ -621,6 +653,13 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
  resources->maxRecvs = props.maxRecvs;
  resources->netDeviceVersion = props.netDeviceVersion;
  resources->netDeviceType = props.netDeviceType;
+  /* point-to-point size limits*/
+  resources->maxP2pBytes = props.maxP2pBytes;
+  if((resources->maxP2pBytes <= 0) || (resources->maxP2pBytes > NCCL_MAX_NET_SIZE_BYTES)) {
+    WARN("recvProxySetup: net plugin returned invalid value for maxP2pBytes %ld \
+      [allowed range: %ld - %ld] \n", resources->maxP2pBytes, 0L, NCCL_MAX_NET_SIZE_BYTES);
+    return ncclInternalError;
+  }

  if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
  NCCLCHECK(proxyState->ncclNet->listen(req->netDev, respBuff, &resources->netListenComm));
@@ -916,6 +955,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str

  resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem);
  resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem);
+  for (int i = 0; i < NCCL_STEPS; i++) resources->recvMem->connFifo[i].size = -1;
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
    resources->buffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]);
    if (resources->buffers[p]) {
@@ -1032,7 +1072,6 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
 }

 static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps");
-#define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two.

 static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
  if (args->state == ncclProxyOpReady) {
@@ -1045,11 +1084,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
      resources->step = sub->base + sub->nsteps;
      sub->posted = sub->transmitted = sub->done = 0;
      ncclProfilerStartSendProxyOpEvent(s, args);
-      if (sub->reg && sub->nbytes > 0) {
-        NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, sub->recvbuff, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
-      } else {
-        sub->mhandle = resources->mhandles[args->protocol];
-      }
+      if (!sub->reg)
+        sub->sendMhandle = resources->mhandles[args->protocol];
    }
    args->state = ncclProxyOpProgress;
  }
@@ -1059,6 +1095,9 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
    int maxDepth = std::min(NCCL_STEPS, NCCL_SHARED_STEPS/args->nsubs);
    for (int s=0; s<args->nsubs; s++) {
      struct ncclProxySubArgs* sub = args->subs+s;
+      int postedStepId = sub->posted;
+      int transmittedStepId = sub->transmitted;
+      int doneStepId = sub->done;
      if (sub->done == sub->nsteps) continue;
      struct sendNetResources* resources = (struct sendNetResources*) (sub->connection->transportResources);
      volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
@@ -1066,7 +1105,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
      char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
      // Post buffers to the GPU
      if (sub->posted < sub->nsteps && sub->posted < sub->done + maxDepth) {
-        ncclProfilerStartSendProxyStepEvents(s, args, sub->posted, sub->posted+args->sliceSteps);
+        ncclProfilerStartSendProxyStepEvent(s, args, postedStepId);
        int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
        if (resources->shared) {
          if (!sub->reg) {
@@ -1078,12 +1117,13 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
          }
          volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
          sub->posted += args->sliceSteps;
-          // Only post one credit for registered buffer
-          if (sub->reg == 0 || sub->posted == args->sliceSteps) *sendHead = sub->base + sub->posted - NCCL_STEPS;
+          *sendHead = sub->base + sub->posted - NCCL_STEPS;
          if (resources->gdcSync) wc_store_fence(); // Flush out WC write
-        } else sub->posted += args->sliceSteps;
+        } else {
+          sub->posted += args->sliceSteps;
+        }
        ncclProfilerRecordProxyOpEventState(s, args, sub->posted, sub->transSize, ncclProfilerProxyOpSendPosted);
-        ncclProfilerRecordProxyStepEventStates(s, args, sub->posted-args->sliceSteps, sub->posted, ncclProfilerProxyStepSendGPUWait);
+        ncclProfilerRecordProxyStepEventState(s, args, postedStepId, ncclProfilerProxyStepSendGPUWait);
        args->idle = 0;
        continue;
      }
@@ -1091,10 +1131,10 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
      if (sub->transmitted < sub->posted && sub->transmitted < sub->done + NCCL_STEPS) {
        int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
        volatile uint64_t* recvTail = &resources->recvMem->tail;
-        uint64_t tail = sub->base + (sub->reg ? 0 : sub->transmitted);
-        if ((sub->reg || connFifo[buffSlot].size != -1) && ((*recvTail > tail) || p == NCCL_PROTO_LL)) {
+        uint64_t tail = sub->base + sub->transmitted;
+        if (connFifo[buffSlot].size != -1 && (*recvTail > tail || p == NCCL_PROTO_LL)) {
          // We have something to receive, let's check if it's completely ready.
-          int size = sub->reg ? std::min(MAX_NET_SIZE, sub->nbytes) : connFifo[buffSlot].size;
+          int size = connFifo[buffSlot].size;
          bool shared = (p == NCCL_PROTO_SIMPLE) && resources->shared;
          char* buff = shared ? localBuff+connFifo[buffSlot].offset : localBuff+buffSlot*stepSize;
          int ready = 1;
@@ -1120,22 +1160,28 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
              volatile uint32_t *f2 = &lines[i].flag2;
              if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
            }
-          } else if (p == NCCL_PROTO_SIMPLE && resources->shared) {
-            buff = sub->reg ? (char*)sub->recvbuff : localBuff+resources->recvMem->connFifo[buffSlot].offset;
+          } else if (p == NCCL_PROTO_SIMPLE) {
+            if (resources->shared) {
+              buff = sub->reg ? (char*)sub->sendbuff + sub->transmitted * NCCL_MAX_NET_SIZE : localBuff + resources->recvMem->connFifo[buffSlot].offset;
+            } else if (sub->reg) {
+              size_t sendSize;
+              sub->ringAlgo->getNextSendAddr(sub->transmitted, (uint8_t**)&buff, &sendSize, &sub->sendMhandle);
+              assert(sendSize == size);
+            }
          }
          if (ready) {
-            ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted + args->sliceSteps, sub->transSize, ncclProfilerProxyOpSendRemFifoWait);
+            ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted+args->sliceSteps, sub->transSize, ncclProfilerProxyOpSendRemFifoWait);
            // Data is ready, try to send.
            // Coverity complains about the size here as pointing to an out-of-scope temporary.  Which is nonsense,
            // since size is a plain integer.
            // coverity[use_invalid:FALSE]
-            NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->mhandle, sub->requests+buffSlot));
+            NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle, sub->requests+buffSlot));
            if (sub->requests[buffSlot] != NULL) {
-              TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p, size %d, proto %d, myRank %d, channelId %d", sub->transmitted, buffSlot, sub->requests[buffSlot], size, p, proxyState->tpRank, sub->channelId);
+              TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Isend posted, req %p, buff %p, size %d, proto %d, myRank %d, channelId %d, mhandle %p", sub->transmitted, buffSlot, sub->nsteps, sub->requests[buffSlot], buff, size, p, proxyState->tpRank, sub->channelId, sub->sendMhandle);
+              sub->transSize += size;
              sub->transmitted += args->sliceSteps;
              ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpSendTransmitted);
-              ncclProfilerRecordProxyStepEventStates(s, args, sub->transmitted-args->sliceSteps, sub->transmitted, ncclProfilerProxyStepSendWait);
-              sub->transSize += size;
+              ncclProfilerRecordProxyStepEventState(s, args, transmittedStepId, ncclProfilerProxyStepSendWait);
              args->idle = 0;
              continue;
            }
@@ -1149,41 +1195,24 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
        int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
        NCCLCHECK(proxyState->ncclNet->test(sub->requests[buffSlot], &done, &size));
        if (done) {
-          if (sub->reg) {
-            if (size < sub->nbytes) {
-              sub->recvbuff += size;
-              sub->nbytes -= size;
-              // Do one more step (at least)
-              sub->nsteps++;
-            } else {
-              // Signal the GPU the send is complete and it can return.
-              connFifo[sub->base%NCCL_STEPS].size = -1;
-            }
-          }
          // Make sure size is reset to -1 before we update the head.
-          if (sub->reg == 0) connFifo[buffSlot].size = -1;
+          connFifo[buffSlot].size = -1;
          __sync_synchronize();
-          TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
+          TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] request %p done", sub->done, buffSlot, sub->nsteps, sub->requests[buffSlot]);
          sub->done += args->sliceSteps;
-          ncclProfilerStopProxyStepEvents(s, args, sub->done-args->sliceSteps, sub->done);
+          ncclProfilerStopProxyStepEvent(s, args, doneStepId);
          ncclProfilerRecordProxyOpEventState(s, args, sub->done, sub->transSize, ncclProfilerProxyOpSendDone);

          if (resources->shared == 0) {
            volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
-            if (sub->reg) {
-              // We may have added more net steps, but reg operations only have a single step w.r.t. the GPU.
-              if (sub->done == sub->nsteps) *sendHead = sub->base + args->sliceSteps;
-            } else {
-              *sendHead = sub->base + sub->done;
-            }
+            *sendHead = sub->base + sub->done;
            if (resources->gdcSync) wc_store_fence(); // Flush out WC write
          }
          args->idle = 0;
          if (sub->done == sub->nsteps) {
-            if (sub->reg && sub->nbytes > 0) {
-              NCCLCHECK(proxyState->ncclNet->deregMr(resources->netSendComm, sub->mhandle));
-            }
            args->done++;
+            if (sub->ringAlgo && sub->ringAlgo->decRefCount() == 0) delete sub->ringAlgo;
+            sub->ringAlgo = NULL;
          }
        }
      }
@@ -1232,14 +1261,11 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
      // Set step base for next op
      resources->step = sub->base + sub->nsteps;
      sub->posted = sub->received = sub->transmitted = sub->done = 0;
+      sub->regBufferReady = 0;
      for (int i=0; i<groupSize; i++) sub[-i].groupSize = groupSize;
      ncclProfilerStartRecvProxyOpEvent(s, args);
-      if (sub->reg && sub->nbytes > 0) {
-        // Register buffer
-        NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, sub->recvbuff, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
-      } else {
-        sub->mhandle = resources->mhandles[args->protocol];
-      }
+      if (!sub->reg)
+        sub->recvMhandle = resources->mhandles[args->protocol];
    }
    args->state = ncclProxyOpProgress;
  }
@@ -1251,32 +1277,44 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
      struct ncclProxySubArgs* subGroup = args->subs+s;
      int subCount = 0;
      void* ptrs[NCCL_PROXY_MAX_SUBS];
-      int sizes[NCCL_PROXY_MAX_SUBS];
+      size_t sizes[NCCL_PROXY_MAX_SUBS];
      int tags[NCCL_PROXY_MAX_SUBS];
      void* mhandles[NCCL_PROXY_MAX_SUBS];
      for (int i=0; i<subGroup->groupSize; i++) {
        struct ncclProxySubArgs* sub = subGroup + i;
+        int postedStepId = sub->posted;
        if (sub->posted < sub->nsteps) {
          if (sub->posted >= sub->done + maxDepth) { subCount = 0; break; }
-          ncclProfilerStartRecvProxyStepEvents(s+i, args, sub->posted, sub->posted+args->sliceSteps);
+          ncclProfilerStartRecvProxyStepEvent(s+i, args, postedStepId);
          struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
-          if (sub->reg) maxDepth = 1;
          int stepSize = resources->buffSizes[p] / NCCL_STEPS;
          char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
          int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
          volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
-          if (p == NCCL_PROTO_SIMPLE && resources->shared) {
-            if (sub->reg) {
-              // Wait until CUDA kernel has started before we access the user buffer directly.
-              if (connFifo[sub->base%NCCL_STEPS].size == -1) continue;
-              ptrs[subCount] = sub->recvbuff;
-              sizes[subCount] = std::min(MAX_NET_SIZE, sub->nbytes);
+          if (p == NCCL_PROTO_SIMPLE) {
+            if (resources->shared) {
+              if (sub->reg) {
+                // Wait until CUDA kernel has started before we access the user buffer directly.
+                if (!sub->regBufferReady && connFifo[sub->base % NCCL_STEPS].size == -1) continue;
+                sub->regBufferReady = 1;
+                ptrs[subCount] = sub->recvbuff + sub->posted * NCCL_MAX_NET_SIZE;
+                sizes[subCount] = std::min(NCCL_MAX_NET_SIZE, (ssize_t)(sub->nbytes - sub->posted * NCCL_MAX_NET_SIZE));
+              } else {
+                int sharedBuffSlot = sub->posted % maxDepth;
+                int offset;
+                NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot * args->nsubs + s + i, &offset, sizes + subCount));
+                connFifo[buffSlot].offset = offset;
+                ptrs[subCount] = localBuff + offset;
+              }
            } else {
-              int sharedBuffSlot = sub->posted%maxDepth;
-              int offset;
-              NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset, sizes+subCount));
-              connFifo[buffSlot].offset = offset;
-              ptrs[subCount] = localBuff+offset;
+              if (sub->reg) {
+                if (!sub->regBufferReady && connFifo[sub->base % NCCL_STEPS].size == -1) continue;
+                sub->regBufferReady = 1;
+                sub->ringAlgo->getNextRecvAddr(sub->posted, (uint8_t**)&ptrs[subCount], &sizes[subCount], &sub->recvMhandle);
+              } else {
+                ptrs[subCount] = localBuff + buffSlot * stepSize;
+                sizes[subCount] = stepSize * args->sliceSteps;
+              }
            }
          } else {
            ptrs[subCount] = localBuff+buffSlot*stepSize;
@@ -1284,7 +1322,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
          }
          if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes;
          tags[subCount] = resources->tpRemoteRank;
-          mhandles[subCount] = sub->mhandle;
+          mhandles[subCount] = sub->recvMhandle;
          subCount++;
        }
      }
@@ -1292,15 +1330,19 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
        uint64_t step = subGroup->posted;
        struct recvNetResources* resources = (struct recvNetResources*) (subGroup->connection->transportResources);
        void** requestPtr = subGroup->requests+(step%NCCL_STEPS);
+        bool ignoreCompletion = ncclParamNetOptionalRecvCompletion() && ((args->protocol == NCCL_PROTO_LL128) || (args->protocol == NCCL_PROTO_LL)) && (subCount == 1);
+        if (ignoreCompletion) *requestPtr = (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION;
        NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr));
        if (*requestPtr) {
          subGroup->recvRequestsCache[step%NCCL_STEPS] = *requestPtr;
          subGroup->recvRequestsSubCount = subCount;
          for (int i=0; i<subGroup->groupSize; i++) {
            struct ncclProxySubArgs* sub = subGroup+i;
+            int postedStepId = sub->posted;
+            TRACE(NCCL_NET, "recvProxy [%ld/%ld/%d] Irecv posted, buff %p, size %ld, myRank %d, channelId %d, mhandle %p", sub->posted, (sub->base + sub->posted) % NCCL_STEPS, sub->nsteps, ptrs[i], sizes[i], proxyState->tpRank, sub->channelId, mhandles[i]);
            sub->posted += args->sliceSteps;
            ncclProfilerRecordProxyOpEventState(s+i, args, sub->posted, sub->transSize, ncclProfilerProxyOpRecvPosted);
-            ncclProfilerRecordProxyStepEventStates(s+i, args, sub->posted-args->sliceSteps, sub->posted, ncclProfilerProxyStepRecvWait);
+            ncclProfilerRecordProxyStepEventState(s+i, args, postedStepId, ncclProfilerProxyStepRecvWait);
          }
          args->idle = 0;
        }
@@ -1321,31 +1363,18 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
        if (done) {
          int needFlush = 0;
          int totalSize = 0;
-          int subIndex = 0;
          for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) totalSize += sizes[i];
          for (int i=0; i<subGroup->groupSize; i++) {
            struct ncclProxySubArgs* sub = subGroup + i;
-            if (sub->received < sub->nsteps) {
-              int size = sizes[subIndex++];
-              if (sub->reg) {
-                if (size < sub->nbytes) {
-                  sub->recvbuff += size;
-                  sub->nbytes -= size;
-                  // Do one more step (at least)
-                  sub->nsteps++;
-                } else {
-                  // Reset connFifo size indicating the GPU was ready to receive.
-                  // There is a __sync_synchronize() later to ensure it is reset before it is set again by the GPU.
-                  struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
-                  volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
-                  connFifo[sub->base%NCCL_STEPS].size = -1;
-                }
-              }
-            }
-            sub->received += args->sliceSteps;
+            int receivedStepId = sub->received;
+            int buffSlot = (sub->base + sub->received) % NCCL_STEPS;
+            struct recvNetResources* resources = (struct recvNetResources*)(sub->connection->transportResources);
+            volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
+            connFifo[buffSlot].size = -1;
            sub->transSize += sizes[i];
+            sub->received += args->sliceSteps;
            ncclProfilerRecordProxyOpEventState(s+i, args, sub->received, sub->transSize, ncclProfilerProxyOpRecvReceived);
-            ncclProfilerRecordProxyStepEventStates(s+i, args, sub->received-args->sliceSteps, sub->received, ncclProfilerProxyStepRecvFlushWait);
+            ncclProfilerRecordProxyStepEventState(s+i, args, receivedStepId, ncclProfilerProxyStepRecvFlushWait);
            if (step < sub->nsteps) {
              struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
              if (resources->useGdr) needFlush |= resources->needFlush;
@@ -1372,10 +1401,16 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
                  int stepSize = resources->buffSizes[p] / NCCL_STEPS;
                  char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
                  int buffSlot = (sub->base+sub->received-args->sliceSteps)%NCCL_STEPS;
-                  ptrs[subCount] = resources->shared ?
-                    (sub->reg ? (char*)sub->recvbuff : localBuff+resources->recvMem->connFifo[buffSlot].offset) :
-                    localBuff+buffSlot*stepSize;
-                  mhandles[subCount] = sub->mhandle;
+                  if (resources->shared) {
+                    ptrs[subCount] = sub->reg ? (char*)sub->recvbuff + step * NCCL_MAX_NET_SIZE : localBuff + resources->recvMem->connFifo[buffSlot].offset;
+                  } else {
+                    if (sub->reg) {
+                      sub->ringAlgo->getNextRecvAddr(step, (uint8_t**)&ptrs[subCount], NULL, &sub->recvMhandle);
+                    } else {
+                      ptrs[subCount] = localBuff + buffSlot * stepSize;
+                    }
+                  }
+                  mhandles[subCount] = sub->recvMhandle;
                  subCount++;
                }
              }
@@ -1399,19 +1434,16 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
        if (done) {
          for (int i=0; i<subGroup->groupSize; i++) {
            struct ncclProxySubArgs* sub = subGroup + i;
+            int transmittedStepId = sub->transmitted;

            sub->transmitted += args->sliceSteps;
            ncclProfilerRecordProxyOpEventState(s+i, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpRecvTransmitted);
-            ncclProfilerRecordProxyStepEventStates(s+i, args, sub->transmitted-args->sliceSteps, sub->transmitted, ncclProfilerProxyStepRecvGPUWait);
+            ncclProfilerRecordProxyStepEventState(s+i, args, transmittedStepId, ncclProfilerProxyStepRecvGPUWait);
            if (step < sub->nsteps) {
              __sync_synchronize();
              struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
              volatile uint64_t* recvTail = resources->gdcSync ? resources->gdcSync : &resources->recvMem->tail;
-              if (sub->reg) {
-                // We may have added more net steps, but reg operations only have a single step w.r.t. the GPU.
-                if (sub->transmitted == sub->nsteps) *recvTail = sub->base + args->sliceSteps;
-              } else
-                *recvTail = sub->base + sub->transmitted;
+              *recvTail = sub->base + sub->transmitted;
              if (resources->gdcSync) wc_store_fence(); // Flush out WC write
            }
          }
@@ -1425,11 +1457,12 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
      struct ncclProxySubArgs* subGroup = args->subs+s;
      for (int i=0; i<subGroup->groupSize; i++) {
        struct ncclProxySubArgs* sub = subGroup + i;
+        int doneStepId = sub->done;
        if (sub->done == sub->nsteps) continue;
        if (sub->transmitted > sub->done) {
          struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
          volatile uint64_t* sendHead = &resources->sendMem->head;
-          uint64_t done = sub->reg ? sub->base + sub->nsteps : *sendHead;
+          uint64_t done = *sendHead;
          while (done > sub->base + sub->done &&
              // LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted.
              sub->transmitted > sub->done) {
@@ -1440,15 +1473,13 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
              subGroup->recvRequestsCache[sub->done%NCCL_STEPS] = NULL;
            }
            sub->done += args->sliceSteps;
-            ncclProfilerStopProxyStepEvents(s+i, args, sub->done-args->sliceSteps, sub->done);
+            ncclProfilerStopProxyStepEvent(s+i, args, doneStepId);
            ncclProfilerRecordProxyOpEventState(s+i, args, sub->done, sub->transSize, ncclProfilerProxyOpRecvDone);
            args->idle = 0;
            if (sub->done == sub->nsteps) {
-              struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
-              if (sub->reg && sub->nbytes > 0) {
-                NCCLCHECK(proxyState->ncclNet->deregMr(resources->netRecvComm, sub->mhandle));
-              }
              args->done++;
+              if (sub->ringAlgo && sub->ringAlgo->decRefCount() == 0) delete sub->ringAlgo;
+              sub->ringAlgo = NULL;
              break;
            }
          }
@@ -1465,9 +1496,228 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
  return ncclSuccess;
 }

+ncclResult_t ncclNetDeregBuffer(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* handle) { // Deregisters a NET buffer handle by sending a blocking ncclProxyMsgDeregister request over proxyConn.
+  NCCLCHECK(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgDeregister, &handle, sizeof(void*), NULL, 0)); // Request payload is the handle value itself; no response buffer is supplied.
+  INFO(NCCL_REG, "rank %d - deregistered net buffer handle %p", comm->rank, handle);
+  return ncclSuccess;
+}
+
+static ncclResult_t netRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, struct ncclConnector** peerConns, int nPeers, struct ncclReg* regRecord, int* outRegBufFlag, void** outHandle) { // For every non-NULL peer, reuses or creates a NET handle for regRecord and stores it in outHandle[p]; sets *outRegBufFlag to 1 on success, 0 on the fail path.
+  ncclResult_t ret = ncclSuccess;
+  int gdrFlag = 1; // Only reported in the failure WARN; cleared when a peer connection lacks NCCL_DIRECT_NIC.
+
+  if (regRecord) { // With no registration record nothing is done and *outRegBufFlag is left as the caller set it.
+    for (int p = 0; p < nPeers; ++p) {
+      struct ncclConnector* peerConn = peerConns[p];
+      struct ncclProxyConnector* peerProxyConn = NULL;
+      struct ncclRegNetHandles* netHandle = NULL;
+      bool found = false;
+      if (peerConn == NULL) continue; // Skipped peers leave outHandle[p] untouched.
+      peerProxyConn = &peerConn->proxyConn;
+      netHandle = regRecord->netHandleHead;
+      while (netHandle) { // Search the record's handle list for one already created on this proxy connection.
+        if (netHandle->proxyConn == peerProxyConn) {
+          found = true;
+          break;
+        }
+        netHandle = netHandle->next;
+      }
+      if (found) {
+        *outRegBufFlag = 1;
+        outHandle[p] = netHandle->handle;
+        INFO(NCCL_REG, "rank %d - NET reuse buffer %p size %ld (baseAddr %p size %ld) handle %p", comm->rank, userbuff, buffSize, (void*)regRecord->addr, regRecord->pages * comm->regCache.pageSize, netHandle->handle);
+      } else {
+        struct netRegInfo info = { regRecord->addr, regRecord->pages * comm->regCache.pageSize }; // Register the whole recorded range, not just [userbuff, userbuff+buffSize).
+        void* handle = NULL;
+
+        if (peerConn->conn.flags & NCCL_DIRECT_NIC) {
+          NCCLCHECKGOTO(ncclProxyCallBlocking(comm, peerProxyConn, ncclProxyMsgRegister, &info, sizeof(struct netRegInfo), &handle, sizeof(void*)), ret, fail);
+          if (handle) {
+            struct ncclRegNetHandles* newNetHandle; // Distinct name: no longer shadows the search cursor declared above.
+            NCCLCHECKGOTO(ncclCalloc(&newNetHandle, 1), ret, fail); // Go through fail so *outRegBufFlag is reset and the WARN is emitted instead of returning directly.
+            regRecord->state |= NET_REG_COMPLETE; // Mark the record only once the handle can actually be cached on it.
+            newNetHandle->handle = handle;
+            newNetHandle->proxyConn = peerProxyConn;
+            newNetHandle->next = regRecord->netHandleHead; // Push at the head of the record's handle list.
+            regRecord->netHandleHead = newNetHandle;
+            outHandle[p] = handle;
+            *outRegBufFlag = 1;
+            INFO(NCCL_REG, "rank %d - NET register userbuff %p (handle %p), buffSize %ld", comm->rank, userbuff, handle, buffSize);
+          } else {
+            goto fail; // Proxy answered with a NULL handle: ret is still ncclSuccess, only the flag is cleared.
+          }
+        } else {
+          gdrFlag = 0;
+          goto fail; // Same soft failure: ret stays ncclSuccess.
+        }
+      }
+    }
+  }
+
+exit:
+  return ret;
+fail:
+  *outRegBufFlag = 0; // A failure on any peer disables registration for the whole call, even if earlier peers succeeded.
+  WARN("rank %d failed to NET register userbuff %p buffSize %ld GDR flag %d", comm->rank, userbuff, buffSize, gdrFlag);
+  goto exit;
+}
+
+ncclResult_t ncclNetLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, struct ncclConnector** peerConns, int nPeers, int* outRegBufFlag, void** outHandle) { // NET-registers userbuff for the given peers when ncclRegFind yields a record that ncclRegLocalIsValid accepts.
+  ncclResult_t ret = ncclSuccess;
+  struct ncclReg *regRecord = NULL;
+  bool isValid = false;
+
+  *outRegBufFlag = 0; // Default result: not registered.
+  if (comm && userbuff && buffSize > 0 && nPeers > 0) { // Any missing/empty argument is a silent no-op returning ncclSuccess.
+    NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, &regRecord), ret, fail);
+    NCCLCHECKGOTO(ncclRegLocalIsValid(regRecord, &isValid), ret, fail);
+    if (isValid)
+      NCCLCHECKGOTO(netRegisterBuffer(comm, userbuff, buffSize, peerConns, nPeers, regRecord, outRegBufFlag, outHandle), ret, fail);
+  }
+
+exit:
+  return ret;
+fail:
+  *outRegBufFlag = 0; // Never report a registration when an error code is returned.
+  goto exit;
+}
+
+struct ncclNetCleanupCallback { // Cleanup record enqueued by ncclNetGraphRegisterBuffer and consumed by cleanupNet.
+  struct ncclCommCallback base; // Kept as first member: the record is cast to and from struct ncclCommCallback*.
+  struct ncclComm *comm; // Communicator handed to ncclCommGraphDeregister.
+  struct ncclReg *reg; // Registration record handed to ncclCommGraphDeregister.
+};
+
+static ncclResult_t cleanupNet(struct ncclComm* comm, struct ncclCommCallback* cb) { // Cleanup callback: graph-deregisters the stored record, then frees the callback object. The comm parameter is unused; obj->comm is used instead.
+  struct ncclNetCleanupCallback* obj = (struct ncclNetCleanupCallback*)cb;
+  NCCLCHECK(ncclCommGraphDeregister(obj->comm, obj->reg)); // NOTE(review): if NCCLCHECK returns early on error, obj is not freed — confirm this is acceptable.
+  free(obj);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclNetGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, struct ncclConnector** peerConns, int nPeers, int* outRegBufFlag, void** outHandle, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, int* nCleanupQueueElts) { // Graph-registers the allocation containing userbuff, NET-registers it for the peers, and queues a cleanup callback when the registration is used.
+  ncclResult_t ret = ncclSuccess;
+  struct ncclNetCleanupCallback *record = NULL;
+  struct ncclReg *regRecord = NULL;
+  void *baseSend; // Filled in by cuMemGetAddressRange below.
+  size_t baseSendSize; // Filled in by cuMemGetAddressRange below.
+
+  *outRegBufFlag = 0; // Default result: not registered.
+  if (comm && userbuff && buffSize > 0 && nPeers > 0) { // Any missing/empty argument is a silent no-op returning ncclSuccess.
+    CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)userbuff), ret, fail); // Base address and size of the allocation that contains userbuff.
+    NCCLCHECKGOTO(ncclCommGraphRegister(comm, baseSend, baseSendSize, (void**)&regRecord), ret, fail);
+    NCCLCHECKGOTO(netRegisterBuffer(comm, userbuff, buffSize, peerConns, nPeers, regRecord, outRegBufFlag, outHandle), ret, fail);
+    if (*outRegBufFlag) { // Registration is in use: enqueue a record so cleanupNet can graph-deregister it later.
+      NCCLCHECKGOTO(ncclCalloc(&record, 1), ret, fail);
+      record->base.fn = cleanupNet;
+      record->comm = comm;
+      record->reg = regRecord;
+      ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)record);
+      if (nCleanupQueueElts) *nCleanupQueueElts += 1; // The counter is optional; cleanupQueue itself is not NULL-checked.
+    } else { // NET registration not used: drop the graph registration immediately.
+      NCCLCHECKGOTO(ncclCommGraphDeregister(comm, regRecord), ret, fail);
+    }
+  }
+exit:
+  return ret;
+fail:
+  *outRegBufFlag = 0; // Never report a registration when an error code is returned.
+  goto exit;
+}
+
+static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { // Proxy handler: registers the range described by the netRegInfo request on netSendComm and writes the resulting handle (NULL on failure) into respBuff.
+  void* handle = NULL; // Defensive default so respBuff never receives an indeterminate pointer.
+  struct netRegInfo* info = (struct netRegInfo*)reqBuff;
+  struct sendNetResources* resources = (struct sendNetResources*)(connection->transportResources);
+  ncclResult_t ret = ncclSuccess;
+  bool needReg = true; // Stays true until the DMA-BUF path has produced a handle.
+
+  assert(reqSize == sizeof(struct netRegInfo));
+  assert(respSize == sizeof(void*));
+
+#if CUDART_VERSION >= 11070
+  /* DMA-BUF support */
+  if (resources->useDmaBuf) {
+    int dmabuf_fd;
+    CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem);
+    ret = proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle);
+    (void)close(dmabuf_fd); // Close on success AND failure; jumping to peermem on error used to skip this and leak the fd.
+    if (ret == ncclSuccess) needReg = false; // Otherwise fall through and retry with plain regMr() below.
+  }
+peermem:
+#endif
+  if (needReg) {
+    NCCLCHECKGOTO(proxyState->ncclNet->regMr(resources->netSendComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail);
+  }
+
+exit:
+  memcpy(respBuff, (void*)&handle, sizeof(void*));
+  *done = 1;
+  return ncclSuccess; // Always succeeds at the proxy level; the requester detects failure through the NULL handle.
+fail:
+  handle = NULL;
+  goto exit;
+}
+
+static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { // Proxy handler: registers the range described by the netRegInfo request on netRecvComm and writes the resulting handle (NULL on failure) into respBuff.
+  void* handle = NULL; // Defensive default so respBuff never receives an indeterminate pointer.
+  struct netRegInfo* info = (struct netRegInfo*)reqBuff;
+  struct recvNetResources* resources = (struct recvNetResources*)(connection->transportResources);
+  ncclResult_t ret = ncclSuccess;
+  bool needReg = true; // Stays true until the DMA-BUF path has produced a handle.
+
+  assert(reqSize == sizeof(struct netRegInfo));
+  assert(respSize == sizeof(void*));
+
+#if CUDART_VERSION >= 11070
+  /* DMA-BUF support */
+  if (resources->useDmaBuf) {
+    int dmabuf_fd;
+    CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem);
+    ret = proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle);
+    (void)close(dmabuf_fd); // Close on success AND failure; jumping to peermem on error used to skip this and leak the fd.
+    if (ret == ncclSuccess) needReg = false; // Otherwise fall through and retry with plain regMr() below.
+  }
+peermem:
+#endif
+  if (needReg) {
+    NCCLCHECKGOTO(proxyState->ncclNet->regMr(resources->netRecvComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail);
+  }
+
+exit:
+  memcpy(respBuff, (void*)&handle, sizeof(void*));
+  *done = 1;
+  return ncclSuccess; // Always succeeds at the proxy level; the requester detects failure through the NULL handle.
+fail:
+  handle = NULL;
+  goto exit;
+}
+
+static ncclResult_t sendProxyDeregBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done) { // Proxy handler: deregisters the handle carried in the request from netSendComm.
+  void* handle;
+  struct sendNetResources* resources = (struct sendNetResources*)(connection->transportResources);
+
+  assert(reqSize == sizeof(void*));
+  memcpy(&handle, reqBuff, sizeof(void*)); // The request payload is the raw handle pointer value.
+  NCCLCHECK(proxyState->ncclNet->deregMr(resources->netSendComm, handle)); // NOTE(review): if NCCLCHECK returns early on error, *done is never set — confirm the caller copes.
+  *done = 1;
+  return ncclSuccess;
+}
+
+static ncclResult_t recvProxyDeregBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done) { // Proxy handler: deregisters the handle carried in the request from netRecvComm.
+  void* handle;
+  struct recvNetResources* resources = (struct recvNetResources*)(connection->transportResources);
+
+  assert(reqSize == sizeof(void*));
+  memcpy(&handle, reqBuff, sizeof(void*)); // The request payload is the raw handle pointer value.
+  NCCLCHECK(proxyState->ncclNet->deregMr(resources->netRecvComm, handle)); // NOTE(review): if NCCLCHECK returns early on error, *done is never set — confirm the caller copes.
+  *done = 1;
+  return ncclSuccess;
+}
+
 struct ncclTransport netTransport = {
  "NET",
  canConnect,
-  { sendSetup, sendConnect, sendFree, proxySharedInit, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, NULL },
-  { recvSetup, recvConnect, recvFree, proxySharedInit, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, NULL }
+  { sendSetup, sendConnect, sendFree, proxySharedInit, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, sendProxyRegBuffer, sendProxyDeregBuffer }, // Send side: the former trailing NULL is replaced by the buffer register/deregister proxy handlers.
+  { recvSetup, recvConnect, recvFree, proxySharedInit, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, recvProxyRegBuffer, recvProxyDeregBuffer } // Recv side: same two handlers, operating on netRecvComm.
 };
@@ -44,6 +44,7 @@ ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction) {
      ncclNetIfs = ncclFindInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS);
      if (ncclNetIfs <= 0) {
        WARN("NET/Socket : no interface found");
+        pthread_mutex_unlock(&ncclNetSocketLock);
        return ncclInternalError;
      } else {
        #define MAX_LINE_LEN (2047)
@@ -76,7 +77,7 @@ static ncclResult_t ncclNetSocketGetSpeed(char* devName, int* speed) {
  ncclResult_t ret = ncclSuccess;
  *speed = 0;
  char speedPath[PATH_MAX];
-  sprintf(speedPath, "/sys/class/net/%s/speed", devName);
+  snprintf(speedPath, sizeof(speedPath), "/sys/class/net/%s/speed", devName);
  int fd = -1;
  SYSCHECKSYNC(open(speedPath, O_RDONLY), "open", fd);
  if (fd != -1) {
@@ -102,6 +103,7 @@ ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) {
  props->guid = dev;
  props->ptrSupport = NCCL_PTR_HOST;
  props->regIsGlobal = 0;
+  props->forceFlush = 0;
  NCCLCHECK(ncclNetSocketGetSpeed(props->name, &props->speed));
  props->latency = 0; // Not set
  props->port = 0;
@@ -109,6 +111,7 @@ ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) {
  props->maxRecvs = 1;
  props->netDeviceType    = NCCL_NET_DEVICE_HOST;
  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES;
  return ncclSuccess;
 }

@@ -297,6 +300,7 @@ fail:

 ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm) {
  if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev
+    WARN("NET/Socket : ncclNetSocketListen dev=%d ncclNetIfs=%d", dev, ncclNetIfs);
    return ncclInternalError;
  }
  ncclResult_t ret = ncclSuccess;
@@ -558,16 +562,16 @@ ncclResult_t ncclNetSocketRegMr(void* comm, void* data, size_t size, int type, v
 }
 ncclResult_t ncclNetSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; }

-ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
+ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { // Creates a NCCL_SOCKET_SEND request for data; tag and mhandle are not used here.
  struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)sendComm;
-  NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_SEND, data, size, (struct ncclNetSocketRequest**)request));
+  NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_SEND, data, (int) size, (struct ncclNetSocketRequest**)request)); // NOTE(review): size_t is narrowed to int; presumably callers stay within props->maxP2pBytes — confirm.
  return ncclSuccess;
 }

-ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
+ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { // Creates a NCCL_SOCKET_RECV request for data[0]; only n == 1 is accepted, tags and mhandles are not used here.
  struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)recvComm;
  if (n != 1) return ncclInternalError;
-  NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], sizes[0], (struct ncclNetSocketRequest**)request));
+  NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], (int)sizes[0], (struct ncclNetSocketRequest**)request)); // NOTE(review): size_t is narrowed to int; presumably callers stay within props->maxP2pBytes — confirm.
  return ncclSuccess;
 }

@@ -632,5 +636,6 @@ ncclNet_t ncclNetSocket = {
  ncclNetSocketClose,
  ncclNetSocketCloseListen,
  NULL /* getDeviceMr */,
-  NULL /* irecvConsumed */
+  NULL /* irecvConsumed */,
+  NULL /* mergeDevices */
 };
@@ -108,11 +108,12 @@ ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, size_t size, CUmemGenericAll
  return ncclSuccess;
 }

-ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) {
+ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) { // Unbinds dev from the multicast handle, unmaps and frees ptr, then releases the handle; comm is only read for logging.
  CUCHECK(cuMulticastUnbind(*mcHandler, dev, 0/*mcOffset*/, size));
  CUCHECK(cuMemUnmap(ptr, size));
  CUCHECK(cuMemAddressFree(ptr, size));
  CUCHECK(cuMemRelease(*mcHandler));
+  INFO(NCCL_NVLS, "rank %d - NVLS deregistered buffer %p on device %d, size %ld", comm->rank, (void*)ptr, dev, size); // Dereferences comm unconditionally — NOTE(review): confirm every caller passes a non-NULL comm.
  return ncclSuccess;
 }

@@ -450,11 +451,11 @@ setup:

    if (comm->localRank == 0) {
      shmPath[0] = '\0';
-      NCCLCHECKGOTO(ncclShmOpen(shmPath, (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, comm->localRanks - 1, &comm->nvlsResources->nvlsShmemHandle), res, fail);
+      NCCLCHECKGOTO(ncclShmOpen(shmPath, sizeof(shmPath), (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, comm->localRanks - 1, &comm->nvlsResources->nvlsShmemHandle), res, fail);
      NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, fail);
    } else {
      NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, fail);
-      NCCLCHECKGOTO(ncclShmOpen(shmPath, (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, -1, &comm->nvlsResources->nvlsShmemHandle), res, fail);
+      NCCLCHECKGOTO(ncclShmOpen(shmPath, sizeof(shmPath), (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, -1, &comm->nvlsResources->nvlsShmemHandle), res, fail);
    }
    /* need 2 pools and a shared counter for shmem-based collectives */
    comm->nvlsResources->nvlsShmem.cnt[0] = (size_t*)nvlsShmem;
@@ -495,7 +496,7 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
  return ncclSuccess;
 }

-ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t buffSize, CUdeviceptr *regAddr, bool *regUsed) {
+ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t buffSize, CUdeviceptr *regAddr, int *regUsed) {
  ncclResult_t ret = ncclSuccess;
  struct ncclReg *regRecord = NULL;
  CUdeviceptr regPtr = 0;
@@ -601,43 +602,33 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
  }

  *regAddr = (uintptr_t)regPtr + regData[comm->localRank].offset;
-  *regUsed = true;
+  *regUsed = 1;
 exit:
  free(regData);
  return ret;
 fail:
-  *regUsed = false;
+  *regUsed = 0;
  goto exit;
 }

-ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) {
+static ncclResult_t nvlsRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, struct ncclReg *sendRegRecord, struct ncclReg *recvRegRecord, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) {
  ncclResult_t ret = ncclSuccess;
-  bool localRegBufUsed = false;
+  int regBufUsed = 0;
  struct localRegData *regData = NULL;
  bool sendNeedReg = false, recvNeedReg = false;
  CUdeviceptr regSendPtr = 0;
  CUdeviceptr regRecvPtr = 0;
-  struct ncclReg *sendRegRecord = NULL;
-  struct ncclReg *recvRegRecord = NULL;
-
-  *outRegBufUsed = false;

  NCCLCHECKGOTO(ncclCalloc(&regData, comm->localRanks * 2), ret, fail);

-  if (sendbuff) {
-    NCCLCHECKGOTO(ncclRegFind(comm, sendbuff, sendbuffSize, &sendRegRecord), ret, fail);
-    if (sendRegRecord) {
-      memcpy(&regData[comm->localRank * 2].reg, sendRegRecord, sizeof(struct ncclReg));
-      regData[comm->localRank * 2].offset = (uintptr_t)sendbuff - sendRegRecord->addr;
-    }
+  if (sendRegRecord) {
+    memcpy(&regData[comm->localRank * 2].reg, sendRegRecord, sizeof(struct ncclReg));
+    regData[comm->localRank * 2].offset = (uintptr_t)sendbuff - sendRegRecord->addr;
  }

-  if (recvbuff) {
-    NCCLCHECKGOTO(ncclRegFind(comm, recvbuff, recvbuffSize, &recvRegRecord), ret, fail);
-    if (recvRegRecord) {
-      memcpy(&regData[comm->localRank * 2 + 1].reg, recvRegRecord, sizeof(struct ncclReg));
-      regData[comm->localRank * 2 + 1].offset = (uintptr_t)recvbuff - recvRegRecord->addr;
-    }
+  if (recvRegRecord) {
+    memcpy(&regData[comm->localRank * 2 + 1].reg, recvRegRecord, sizeof(struct ncclReg));
+    regData[comm->localRank * 2 + 1].offset = (uintptr_t)recvbuff - recvRegRecord->addr;
  }

  NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regData + comm->localRank * 2, regData, sizeof(struct localRegData) * 2), ret, fail);
@@ -682,229 +673,127 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send
  }

  if ((!sendNeedReg || sendbuff == NULL) && (!recvNeedReg || recvbuff == NULL)) {
-    localRegBufUsed = true;
-    INFO(NCCL_NVLS, "rank %d reuse local-registered NVLS sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr);
+    regBufUsed = 1;
+    INFO(NCCL_REG, "rank %d reuse registered NVLS sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr);
    goto exit;
  }

  /* Start Registration. Not found registered buffers, then check whether both send and recv buffer locate
   * in register request cache. */
-  if (sendNeedReg && sendbuff) {
-    tryRegisterBuffer(comm, (uintptr_t)sendbuff, sendbuffSize, &regSendPtr, &localRegBufUsed);
-    if (localRegBufUsed == false) goto fail;
+  if (sendNeedReg && sendbuff && sendbuffSize > 0) {
+    tryRegisterBuffer(comm, (uintptr_t)sendbuff, sendbuffSize, &regSendPtr, &regBufUsed);
+    if (regBufUsed == 0) goto fail;
  }

-  if (recvNeedReg && recvbuff) {
-    tryRegisterBuffer(comm, (uintptr_t)recvbuff, recvbuffSize, &regRecvPtr, &localRegBufUsed);
-    if (localRegBufUsed == false) goto fail;
+  if (recvNeedReg && recvbuff && recvbuffSize > 0) {
+    tryRegisterBuffer(comm, (uintptr_t)recvbuff, recvbuffSize, &regRecvPtr, &regBufUsed);
+    if (regBufUsed == 0) goto fail;
  }

-  INFO(NCCL_NVLS, "rank %d successfully local-registered NVLS sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr);
+  INFO(NCCL_REG, "rank %d successfully registered NVLS sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr);

 exit:
  *outRegBufSend = (void*)regSendPtr;
  *outRegBufRecv = (void*)regRecvPtr;
-  *outRegBufUsed = localRegBufUsed;
+  *outRegBufUsed = regBufUsed;
  free(regData);
  return ncclSuccess;
 fail:
-  localRegBufUsed = false;
+  regBufUsed = 0;
+  WARN("rank %d failed to NVLS register sendbuff %p sendbuffSize %ld recvbuff %p recvbuffSize %ld", comm->rank, sendbuff, sendbuffSize, recvbuff, recvbuffSize);
  goto exit;
 }

+ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) {
+  struct ncclReg *sendRegRecord = NULL;
+  struct ncclReg *recvRegRecord = NULL;
+  bool sendIsValid = false;
+  bool recvIsValid = false;
+  // Look up the ncclReg record of each non-NULL buffer, then NVLS-register through nvlsRegisterBuffer() only if both pass the validity check.
+  *outRegBufUsed = 0;  // stays 0 unless nvlsRegisterBuffer() sets it
+  if (sendbuff) {
+    NCCLCHECK(ncclRegFind(comm, sendbuff, sendbuffSize, &sendRegRecord));
+    NCCLCHECK(ncclRegLocalIsValid(sendRegRecord, &sendIsValid));
+  } else {
+    sendIsValid = true;  // a NULL send buffer never blocks registration
+  }
+  if (recvbuff) {
+    NCCLCHECK(ncclRegFind(comm, recvbuff, recvbuffSize, &recvRegRecord));
+    NCCLCHECK(ncclRegLocalIsValid(recvRegRecord, &recvIsValid));
+  } else {
+    recvIsValid = true;  // a NULL recv buffer never blocks registration
+  }
+
+  if (sendIsValid && recvIsValid)  // otherwise outRegBufSend/outRegBufRecv are left unwritten
+    NCCLCHECK(nvlsRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, sendRegRecord, recvRegRecord, outRegBufUsed, outRegBufSend, outRegBufRecv));
+
+  return ncclSuccess;
+}
+
 struct ncclNvlsCleanupCallback {
  struct ncclCommCallback base;
-  CUmemGenericAllocationHandle mcHandle;
-  CUdeviceptr ptr;
-  int dev;
-  size_t size;
+  struct ncclReg *reg;  // registration record that cleanupNvls() passes to ncclCommGraphDeregister()
+  struct ncclComm *comm;  // communicator that cleanupNvls() passes to ncclCommGraphDeregister()
 };

 static ncclResult_t cleanupNvls(struct ncclComm* comm, struct ncclCommCallback* cb) {
  struct ncclNvlsCleanupCallback* obj = (struct ncclNvlsCleanupCallback*)cb;
-  NCCLCHECK(ncclNvlsDeregBuffer(&obj->mcHandle, obj->ptr, obj->dev, obj->size));
-  INFO(NCCL_NVLS, "rank %d - deregistered buffer %p on device %d, size %ld", comm->rank, (void*)obj->ptr, obj->dev, obj->size);
+  ncclResult_t ret = ncclCommGraphDeregister(obj->comm, obj->reg);  // cleanup callback: drop the graph registration; keep the status instead of returning early so obj is always freed
  free(obj);
-  return ncclSuccess;
+  return ret;  // propagate the deregistration status to the caller
 }

 ncclResult_t ncclNvlsGraphRegisterBuffer(
    struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize,
-    bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv,
+    int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv,
    struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, int* nCleanupQueueEltsAdded
  ) {
-  ncclResult_t ret = ncclSuccess;
-  bool localRegBufUsed = false;
  struct ncclNvlsCleanupCallback* sendRecord = NULL;
  struct ncclNvlsCleanupCallback* recvRecord = NULL;
-  CUdeviceptr regSendPtr = 0;
-  CUdeviceptr regRecvPtr = 0;
-  CUmulticastObjectProp mcprop;
-  CUmemAllocationProp ucprop;
-  char shareableHandle[NVLS_HANDLE_SIZE];
-  CUmemGenericAllocationHandle sendMcHandle, recvMcHandle;
-  size_t sendGran = 0, recvGran = 0;
-  bool *regBufFlags = NULL;
-  struct graphRegData *rdata = NULL;
-  const void *baseSend = NULL;
-  const void *baseRecv = NULL;
-  size_t baseSendSize = 1;
-  size_t baseRecvSize = 1;
-  size_t ucgran;
+  void *baseSend = NULL;  // start of the address range containing sendbuff (from cuMemGetAddressRange)
+  void *baseRecv = NULL;  // start of the address range containing recvbuff (from cuMemGetAddressRange)
+  size_t baseSendSize = 0;  // size of the range starting at baseSend
+  size_t baseRecvSize = 0;  // size of the range starting at baseRecv
+  struct ncclReg *sendRegRecord = NULL;  // graph registration record for the send range
+  struct ncclReg *recvRegRecord = NULL;  // graph registration record for the recv range

-  *outRegBufUsed = false;
-  NCCLCHECKGOTO(ncclCalloc(&regBufFlags, comm->localRanks), ret, fail);
-  NCCLCHECKGOTO(ncclCalloc(&rdata, comm->localRanks), ret, fail);
-
-  if (sendbuffSize > 0 || recvbuffSize > 0) {
-    /* retrieve base pointer and size */
-    if (CUPFN(cuMemGetAddressRange) == nullptr) goto fail;
-    if (sendbuff != NULL)
-      CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)sendbuff), ret, fail);
-    if (recvbuff != NULL)
-      CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &baseRecvSize, (CUdeviceptr)recvbuff), ret, fail);
-
-    memset(&ucprop, 0, sizeof(CUmemAllocationProp));
-    ucprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
-    ucprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-    ucprop.location.id = comm->cudaDev;
-    ucprop.requestedHandleTypes = ncclCuMemHandleType;
-    CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail);
-
-    localRegBufUsed = ((uint64_t)baseSend % ucgran != 0 || (uint64_t)baseRecv % ucgran != 0) ? false : true;
-    regBufFlags[comm->localRank] = localRegBufUsed;
-    NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, regBufFlags, sizeof(bool)), ret, fail);
-    for (int i = 0; i < comm->localRanks; ++i)
-      if (regBufFlags[i] == false) goto fail;
-
-    memset(&mcprop, 0, sizeof(CUmulticastObjectProp));
-    mcprop.numDevices = comm->localRanks;
-    mcprop.handleTypes = ncclCuMemHandleType;
-    mcprop.flags = 0;
-
-    if (sendbuff != NULL) {
-      mcprop.size = baseSendSize;
-      CUCHECKGOTO(cuMulticastGetGranularity(&sendGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
-
-      /* check send buffer offset and size */
-      rdata[comm->localRank].offset = (uintptr_t)sendbuff - (uintptr_t)baseSend;
-      rdata[comm->localRank].size = baseSendSize;
-      NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, rdata, sizeof(struct graphRegData)), ret, fail);
-      baseSendSize = rdata[0].size;
-      for (int i = 1; i < comm->localRanks; ++i) {
-        if (rdata[0].offset != rdata[i].offset) goto fail;
-        if (baseSendSize > rdata[i].size) baseSendSize = rdata[i].size;
-      }
-      if (baseSendSize % sendGran != 0) goto fail;
-
-      mcprop.size = baseSendSize;
-
-      /* register sendbuff */
-      if (comm->localRank == 0) {
-        NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &sendMcHandle, shareableHandle), ret, fail);
-        NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
-      } else {
-        NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
-        NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &sendMcHandle), ret, fail);
-      }
-
-      CUCHECKGOTO(cuMulticastAddDevice(sendMcHandle, comm->nvlsResources->dev), ret, fail);
-      CUCHECKGOTO(cuMulticastBindAddr(sendMcHandle, 0, (CUdeviceptr)baseSend, baseSendSize, 0), ret, fail);
-
-      // Create a VA for the NVLS
-      CUCHECKGOTO(cuMemAddressReserve(&regSendPtr, baseSendSize, sendGran, 0U, 0), ret, fail);
-      // Map the VA locally
-      CUCHECKGOTO(cuMemMap(regSendPtr, baseSendSize, 0, sendMcHandle, 0), ret, fail);
-      CUCHECKGOTO(cuMemSetAccess(regSendPtr, baseSendSize, &comm->nvlsResources->accessDesc, 1), ret, fail);
-
-      sendRecord = (struct ncclNvlsCleanupCallback*)malloc(sizeof(struct ncclNvlsCleanupCallback));
-      sendRecord->base.fn = cleanupNvls;
-      sendRecord->mcHandle = sendMcHandle;
-      sendRecord->ptr = regSendPtr;
-      sendRecord->dev = comm->nvlsResources->dev;
-      sendRecord->size = baseSendSize;
-    }
-
-    if (recvbuff != NULL) {
-      mcprop.size = baseRecvSize;
-      CUCHECKGOTO(cuMulticastGetGranularity(&recvGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
-
-      rdata[comm->localRank].offset = (uintptr_t)recvbuff - (uintptr_t)baseRecv;
-      rdata[comm->localRank].size = baseRecvSize;
-      NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, rdata, sizeof(struct graphRegData)), ret, fail);
-      baseRecvSize = rdata[0].size;
-      for (int i = 1; i < comm->localRanks; ++i) {
-        if (rdata[0].offset != rdata[i].offset) goto fail;
-        if (baseRecvSize > rdata[i].size) baseRecvSize = rdata[i].size;
-      }
-      if (baseRecvSize % recvGran != 0) goto fail;
-
-      mcprop.size = baseRecvSize;
-      if (comm->localRank == 0) {
-        NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &recvMcHandle, shareableHandle), ret, fail);
-        NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
-      } else {
-        NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
-        NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &recvMcHandle), ret, fail);
-      }
-
-      CUCHECKGOTO(cuMulticastAddDevice(recvMcHandle, comm->nvlsResources->dev), ret, fail);
-      CUCHECKGOTO(cuMulticastBindAddr(recvMcHandle, 0, (CUdeviceptr)baseRecv, baseRecvSize, 0), ret, fail);
-
-      // Create a VA for the NVLS
-      CUCHECKGOTO(cuMemAddressReserve(&regRecvPtr, baseRecvSize, recvGran, 0U, 0), ret, fail);
-      // Map the VA locally
-      CUCHECKGOTO(cuMemMap(regRecvPtr, baseRecvSize, 0, recvMcHandle, 0), ret, fail);
-      CUCHECKGOTO(cuMemSetAccess(regRecvPtr, baseRecvSize, &comm->nvlsResources->accessDesc, 1), ret, fail);
-
-      recvRecord = (struct ncclNvlsCleanupCallback*)malloc(sizeof(struct ncclNvlsCleanupCallback));
-      recvRecord->base.fn = cleanupNvls;
-      recvRecord->mcHandle = recvMcHandle;
-      recvRecord->ptr = regRecvPtr;
-      recvRecord->dev = comm->nvlsResources->dev;
-      recvRecord->size = baseRecvSize;
-    }
-
-    localRegBufUsed = true;
+  *outRegBufUsed = 0;
+  if (sendbuff) {
+    CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)sendbuff));
+    NCCLCHECK(ncclCommGraphRegister(comm, baseSend, baseSendSize, (void**)&sendRegRecord));  // graph-register the whole containing range, not just the user span
  }

-exit:
-  if (localRegBufUsed == false) {
-    if (sendRecord) {
-      ncclNvlsDeregBuffer(&sendRecord->mcHandle, sendRecord->ptr, sendRecord->dev, sendRecord->size);
-      free(sendRecord);
-    }
+  if (recvbuff) {
+    CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &baseRecvSize, (CUdeviceptr)recvbuff));
+    NCCLCHECK(ncclCommGraphRegister(comm, baseRecv, baseRecvSize, (void**)&recvRegRecord));  // graph-register the whole containing range, not just the user span
+  }

-    if (recvRecord) {
-      // Yes, it's a dead code.  That's fine...
-      // coverity[dead_error_begin]
-      ncclNvlsDeregBuffer(&recvRecord->mcHandle, recvRecord->ptr, recvRecord->dev, recvRecord->size);
-      free(recvRecord);
-    }
-  } else {
-    if (sendRecord) {
-      *outRegBufSend = (void*)((uintptr_t)regSendPtr + (uintptr_t)sendbuff - (uintptr_t)baseSend);
+  NCCLCHECK(nvlsRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, sendRegRecord, recvRegRecord, outRegBufUsed, outRegBufSend, outRegBufRecv));  // NOTE(review): pass the user pointers, not the range bases, so the returned addresses keep the sendbuff/recvbuff offset the removed code applied explicitly
+
+  if (*outRegBufUsed) {  // success: queue one cleanup callback per record so cleanupNvls() deregisters it later
+    if (sendRegRecord) {
+      NCCLCHECK(ncclCalloc(&sendRecord, 1));  // checked allocation (was an unchecked malloc followed by a dereference)
+      sendRecord->base.fn = cleanupNvls;
+      sendRecord->reg = sendRegRecord;
+      sendRecord->comm = comm;
      ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)sendRecord);
      *nCleanupQueueEltsAdded += 1;
    }

-    if (recvRecord) {
-      *outRegBufRecv = (void*)((uintptr_t)regRecvPtr + (uintptr_t)recvbuff - (uintptr_t)baseRecv);
+    if (recvRegRecord) {
+      NCCLCHECK(ncclCalloc(&recvRecord, 1));  // checked allocation (was an unchecked malloc followed by a dereference)
+      recvRecord->base.fn = cleanupNvls;
+      recvRecord->reg = recvRegRecord;
+      recvRecord->comm = comm;
      ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)recvRecord);
      *nCleanupQueueEltsAdded += 1;
    }
-
-    INFO(NCCL_NVLS, "rank %d successfully graph-registered sendbuff %p, recvbuff %p, sendbuff size %ld (register size %ld, sendGran %ld), recvbuff size %ld (register size %ld, recvGran %ld), reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, baseSendSize, sendGran, recvbuffSize, baseRecvSize, recvGran, (void*)regSendPtr, (void*)regRecvPtr);
+  } else {  // NVLS registration not used: undo the graph registrations taken above
+    if (sendbuff) NCCLCHECK(ncclCommGraphDeregister(comm, sendRegRecord));
+    if (recvbuff) NCCLCHECK(ncclCommGraphDeregister(comm, recvRegRecord));
  }

-  *outRegBufUsed = localRegBufUsed;
-  free(regBufFlags);
-  free(rdata);
-  /* always return success. */
  return ncclSuccess;
-fail:
-  localRegBufUsed = false;
-  goto exit;
 }

 #else
@@ -936,19 +825,19 @@ ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm) {

 ncclResult_t ncclNvlsGraphRegisterBuffer(
    struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize,
-    bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv,
+    int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv,  // this variant only clears *outRegBufUsed; the other outputs and the cleanup queue are left untouched
    struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, int* nCleanupQueueEltsAdded
  ) {
  *outRegBufUsed = false;
  return ncclSuccess;
 }

-ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) {
+ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) {  // this variant only clears *outRegBufUsed and reports success
  *outRegBufUsed = false;
  return ncclSuccess;
 }

-ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) {
+ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) {  // this variant ignores every argument and reports success
  return ncclSuccess;
 }

@@ -91,6 +91,8 @@ struct p2pCuMemProxyInfo {

 #include <sys/types.h>

+NCCL_PARAM(LegacyCudaRegister, "LEGACY_CUDA_REGISTER", 0);
+
 /* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */
 static int busIdToCudaDev(int64_t busId) {
  int ndev;
@@ -120,21 +122,9 @@ extern int64_t ncclParamMNNVLEnable();
 ncclResult_t p2pCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
  initCeOperation();

-  // MNNVL support
-  if (comm->MNNVL && info1->hostHash != info2->hostHash) {
-    NCCLCHECK(ncclTopoCheckMNNVL(comm->topo, info1, info2, ret));
-    if (*ret) return ncclSuccess;
-  }
-
-  // Rule out different nodes / isolated containers
-  if (info1->hostHash != info2->hostHash || info1->shmDev != info2->shmDev) {
-    *ret = 0;
-    return ncclSuccess;
-  }
-
  // Check topology / p2p level.
  int intermediateRank;
-  NCCLCHECK(ncclTopoCheckP2p(comm->topo, info1->rank, info2->rank, ret, NULL, &intermediateRank));
+  NCCLCHECK(ncclTopoCheckP2p(comm, comm->topo, info1->rank, info2->rank, ret, NULL, &intermediateRank));
  if (*ret == 0) return ncclSuccess;
  if (intermediateRank != -1) {
    if (useMemcpy) *ret = 0;
@@ -149,6 +139,12 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph
    return ncclSuccess;
  }

+  if (info1->hostHash != comm->peerInfo[comm->rank].hostHash ||
+      info1->hostHash != info2->hostHash) {
+    // If either peer is non-local then we are done.
+    return ncclSuccess;
+  }
+
  // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
  int cudaDev1 = busIdToCudaDev(info1->busId);
  int cudaDev2 = busIdToCudaDev(info2->busId);
@@ -313,11 +309,11 @@ NCCL_PARAM(P2pDirectDisable, "P2P_DIRECT_DISABLE", 0);

 #define P2P_SAME_PID(MYINFO, PEERINFO) ((MYINFO->hostHash == PEERINFO->hostHash) && (MYINFO->pidHash == PEERINFO->pidHash))

-static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* read, int* intermediateRank) {
+static ncclResult_t p2pGetInfo(struct ncclComm* comm, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* read, int* intermediateRank) {
  int p2p;
  // Queries the topology to see if the GPUs are Ampere and
  // connected via NVLink, if so we enable P2P Read by default
-  NCCLCHECK(ncclTopoCheckP2p(topo, info1->rank, info2->rank, &p2p, read, intermediateRank));
+  NCCLCHECK(ncclTopoCheckP2p(comm, comm->topo, info1->rank, info2->rank, &p2p, read, intermediateRank));

  int readEnable = ncclParamP2pReadEnable();
  if (readEnable != -2) *read = readEnable;
@@ -367,7 +363,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
  NCCLCHECK(ncclCalloc(&resources, 1));
  send->transportResources = resources;
  int useRead, intermediateRank;
-  NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
+  NCCLCHECK(p2pGetInfo(comm, myInfo, peerInfo, &useRead, &intermediateRank));
  if (useMemcpy) useRead = 0;

  static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
@@ -386,7 +382,6 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
    info->rank = myInfo->rank;
    if (P2P_SAME_PID(myInfo, peerInfo) && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
      resources->type = P2P_DIRECT;
-      send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
      INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/direct pointer%s",
          channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, useReadStr);
    } else {
@@ -402,8 +397,8 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
        INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/IPC%s%s",
             channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, useReadStr, useMemcpy ? "/CE" : "");
      }
-      send->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
    }
+    send->conn.flags |= info->read ? NCCL_P2P_READ : NCCL_P2P_WRITE;
  } else {
    resources->type = P2P_INTERMEDIATE;
    info->rank = intermediateRank;
@@ -437,7 +432,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
  NCCLCHECK(ncclCalloc(&resources, 1));
  recv->transportResources = resources;
  int useRead, intermediateRank;
-  NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
+  NCCLCHECK(p2pGetInfo(comm, myInfo, peerInfo, &useRead, &intermediateRank));

  static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
  struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
@@ -454,7 +449,6 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
    info->rank = myInfo->rank;
    if (P2P_SAME_PID(myInfo, peerInfo) && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
      resources->type = P2P_DIRECT;
-      recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
    } else {
      if (ncclCuMemEnable()) {
        // cuMem API support
@@ -465,8 +459,8 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
        // Legacy CUDA IPC
        resources->type = P2P_IPC;
      }
-      recv->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
    }
+    recv->conn.flags |= info->read ? NCCL_P2P_READ : NCCL_P2P_WRITE;
  } else {
    resources->type = P2P_INTERMEDIATE;
    info->rank = intermediateRank;
@@ -807,9 +801,8 @@ static ncclResult_t p2pSendProxyProgress(struct ncclProxyState* proxyState, stru
  return ncclSuccess;
 }

-ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut) {
-  ncclResult_t ret = ncclSuccess;
-  struct ncclReg *regRecord = NULL;
+static ncclResult_t ipcRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, struct ncclReg* regRecord, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut, bool* isLegacyIpc) {
+ncclResult_t ret = ncclSuccess;
  struct ncclIpcRegInfo* newInfo = NULL;
  uintptr_t* peerRmtAddrs = NULL;
  bool legacyIpcCap = false;
@@ -820,121 +813,149 @@ ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, si
  *regBufFlag = 0;
  *offsetOut = 0;
  *peerRmtAddrsOut = NULL;
-  if (comm && userbuff && buffSize > 0 && nPeers > 0) {
-    NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, &regRecord), ret, fail);
-    if (regRecord) {
-      // buffer was registered by by users, we need to start to register or reuse it
-      int peerLocalRank;
-      for (int p = 0; p < nPeers; p++) {
-        int peerRank = peerRanks[p];
-        peerLocalRank = comm->rankToLocalRank[peerRank];
-        if (regRecord->ipcInfos[peerLocalRank]) {
-          // We already have IPC info for peerLocalRank, no need to register it, we can reuse it
-          *regBufFlag = 1;
-          INFO(NCCL_REG, "rank %d - IPC local reuse buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p", comm->rank, userbuff, buffSize, (void*)regRecord->addr, regRecord->pages * comm->regCache.pageSize, peerRank, regRecord->ipcInfos[peerLocalRank]->impInfo.rmtRegAddr);
-        } else {
-          // Register buffer with peerLocalRank
-          struct ncclProxyConnector* proxyConn = NULL;
-          struct p2pIpcExpInfo ipcInfo;
+  if (isLegacyIpc) *isLegacyIpc = false;
+  if (regRecord) {
+    // buffer was registered by the user; for each peer, either register it or reuse the existing registration
+    int peerLocalRank;
+    for (int p = 0; p < nPeers; p++) {
+      int peerRank = peerRanks[p];
+      peerLocalRank = comm->rankToLocalRank[peerRank];
+      if (regRecord->ipcInfos[peerLocalRank]) {
+        // We already have IPC info for peerLocalRank, no need to register it, we can reuse it
+        *regBufFlag = 1;
+        if (isLegacyIpc) *isLegacyIpc = regRecord->ipcInfos[peerLocalRank]->impInfo.legacyIpcCap;
+        INFO(NCCL_REG, "rank %d - IPC reuse buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p", comm->rank, userbuff, buffSize, (void*)regRecord->addr, regRecord->pages * comm->regCache.pageSize, peerRank, regRecord->ipcInfos[peerLocalRank]->impInfo.rmtRegAddr);
+      } else {
+        // Register buffer with peerLocalRank
+        struct ncclProxyConnector* proxyConn = NULL;
+        struct p2pIpcExpInfo ipcInfo;

-          if (baseAddr == NULL) {
-            CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail);
-            CUCHECKGOTO(cuPointerGetAttribute((void*)&legacyIpcCap, CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, (CUdeviceptr)baseAddr), ret, fail);
-          }
-          if (comm->gproxyConn[peerRank].initialized == false)
-            NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, peerRank, &comm->gproxyConn[peerRank]), ret, fail);
-          proxyConn = &comm->gproxyConn[peerRank];
+        if (baseAddr == NULL) {
+          CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail);
+          CUCHECKGOTO(cuPointerGetAttribute((void*)&legacyIpcCap, CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, (CUdeviceptr)baseAddr), ret, fail);
+        }
+        if (comm->gproxyConn[peerRank].initialized == false)
+          NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, peerRank, &comm->gproxyConn[peerRank]), ret, fail);
+        proxyConn = &comm->gproxyConn[peerRank];

-          ipcInfo.legacyIpcCap = legacyIpcCap;
-          // Get the mem handle for that buffer. It may have been allocated through cudaMalloc in which case we'll
-          // get the CUDA legacy mem handle, or through cuMem*.
-          if (ipcInfo.legacyIpcCap) {
-            // legacy export
-            if (comm->directMode) goto fail;
+        // Get the mem handle for that buffer. It may have been allocated through cudaMalloc in which case we'll
+        // get the CUDA legacy mem handle, or through cuMem*.
+        if (ncclCuMemEnable()) {
+          CUmemGenericAllocationHandle handle;
+          if (CUPFN(cuMemRetainAllocationHandle(&handle, baseAddr)) != CUDA_SUCCESS) {
+            // if cuMem* export fails, retry legacy export
+            if (comm->directMode || !ncclParamLegacyCudaRegister()) goto fail;
            CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
-          } else if (ncclCuMemEnable()) {
-            CUmemGenericAllocationHandle handle;
-            if (CUPFN(cuMemRetainAllocationHandle(&handle, baseAddr)) != CUDA_SUCCESS) {
-              // if cuMem* export fails, retry legacy export
-              if (comm->directMode) goto fail;
-              CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
-              ipcInfo.legacyIpcCap = true;
+            ipcInfo.legacyIpcCap = true;
+            if (isLegacyIpc) *isLegacyIpc = true;
+          } else {
+            ipcInfo.legacyIpcCap = false;
+            if (isLegacyIpc) *isLegacyIpc = false;
+            // cuMem* export to file descriptor or fabric handle
+            if (proxyConn->sameProcess) {
+              memcpy(&ipcInfo.ipcDesc.memHandle, &handle, sizeof(CUmemGenericAllocationHandle));
            } else {
-              // cuMem* export to file descriptor or fabric handle
-              if (proxyConn->sameProcess) {
-                memcpy(&ipcInfo.ipcDesc.memHandle, &handle, sizeof(CUmemGenericAllocationHandle));
+              if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
+                int expFd = -1;
+                CUCHECKGOTO(cuMemExportToShareableHandle(&expFd, handle, ncclCuMemHandleType, 0), ret, fail);
+                NCCLCHECKGOTO(ncclProxyClientQueryFdBlocking(comm, proxyConn, expFd, &ipcInfo.impFd), ret, fail);
+                SYSCHECKGOTO(close(expFd), "close", ret, fail);
              } else {
-                if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
-                  int expFd = -1;
-                  CUCHECKGOTO(cuMemExportToShareableHandle(&expFd, handle, ncclCuMemHandleType, 0), ret, fail);
-                  NCCLCHECKGOTO(ncclProxyClientQueryFdBlocking(comm, proxyConn, expFd, &ipcInfo.impFd), ret, fail);
-                  SYSCHECKGOTO(close(expFd), "close", ret, fail);
-                } else {
-                  // Allow this to silently fail for cases where the user buff cannot be registered
-                  if (CUPFN(cuMemExportToShareableHandle(&ipcInfo.ipcDesc.cuDesc.handle, handle, ncclCuMemHandleType, 0)) != CUDA_SUCCESS) {
-                    CUCHECKGOTO(cuMemRelease(handle), ret, fail);
-                    goto fail;
-                  }
+                // Allow this to silently fail for cases where the user buff cannot be registered
+                if (CUPFN(cuMemExportToShareableHandle(&ipcInfo.ipcDesc.cuDesc.handle, handle, ncclCuMemHandleType, 0)) != CUDA_SUCCESS) {
+                  CUCHECKGOTO(cuMemRelease(handle), ret, fail);
+                  goto fail;
                }
              }
-              CUCHECKGOTO(cuMemRelease(handle), ret, fail);
            }
-          } else {
-            // nothing works, just return
-            goto fail;
+            CUCHECKGOTO(cuMemRelease(handle), ret, fail);
          }
-
-          void* rmtRegAddr = NULL;
-          ipcInfo.size = baseSize;
-          ipcInfo.offset = regRecord->addr - (uintptr_t)baseAddr;
-          // Now ipcInfo contains all necessary registration info. Start to register buffer on proxy side
-          // and get the remote register address back.
-          if (proxyConn)
-            NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail);
-          if (rmtRegAddr) {
-            NCCLCHECKGOTO(ncclCalloc(&newInfo, 1), ret, fail);
-            assert(regRecord->ipcInfos[peerLocalRank] == NULL);
-            regRecord->state |= IPC_REG_COMPLETE;
-            newInfo->peerRank = peerRank;
-            newInfo->baseAddr = baseAddr;
-            newInfo->impInfo.rmtRegAddr = rmtRegAddr;
-            newInfo->impInfo.offset = ipcInfo.offset;
-            newInfo->impInfo.legacyIpcCap = ipcInfo.legacyIpcCap;
-            newInfo->ipcProxyconn = proxyConn;
-            regRecord->ipcInfos[peerLocalRank] = newInfo;
-            if (regRecord->regIpcAddrs.hostPeerRmtAddrs == NULL) {
-              NCCLCHECKGOTO(ncclCalloc(&regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks), ret, fail);
-            }
-            regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank] = (uintptr_t)rmtRegAddr;
-            needUpdate = true;
-            *regBufFlag = 1;
-            INFO(NCCL_REG, "rank %d - IPC local register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr);
-          }
-        }
-      }
-
-      if (*regBufFlag) {
-        if (type == NCCL_IPC_COLLECTIVE) {
-          // for collective, store registered remote buffers into dev memory for future reference
-          if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL || needUpdate) {
-            NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
-            if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL)
-              NCCLCHECKGOTO(ncclCudaCallocAsync(&regRecord->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
-            if (needUpdate)
-              NCCLCHECKGOTO(ncclCudaMemcpyAsync(regRecord->regIpcAddrs.devPeerRmtAddrs, regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
-            NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail);
-            NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail);
-          }
-          peerRmtAddrs = regRecord->regIpcAddrs.devPeerRmtAddrs;
+        } else if (legacyIpcCap) {
+          // legacy export
+          if (comm->directMode || !ncclParamLegacyCudaRegister()) goto fail;
+          CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
+          ipcInfo.legacyIpcCap = true;
+          if (isLegacyIpc) *isLegacyIpc = true;
        } else {
-          assert(nPeers == 1);
-          // p2p always returns remote addr here since remote buffer addr is passed in ncclDevWorkP2p struct
-          peerRmtAddrs = (uintptr_t*)regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank];
+          // nothing works, just return
+          goto fail;
+        }
+
+        void* rmtRegAddr = NULL;
+        ipcInfo.size = baseSize;
+        ipcInfo.offset = regRecord->addr - (uintptr_t)baseAddr;
+        // Now ipcInfo contains all necessary registration info. Start to register buffer on proxy side
+        // and get the remote register address back.
+        if (proxyConn)
+          NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail);
+        if (rmtRegAddr) {
+          NCCLCHECKGOTO(ncclCalloc(&newInfo, 1), ret, fail);
+          assert(regRecord->ipcInfos[peerLocalRank] == NULL);
+          regRecord->state |= IPC_REG_COMPLETE;
+          newInfo->peerRank = peerRank;
+          newInfo->baseAddr = baseAddr;
+          newInfo->impInfo.rmtRegAddr = rmtRegAddr;
+          newInfo->impInfo.offset = ipcInfo.offset;
+          newInfo->impInfo.legacyIpcCap = ipcInfo.legacyIpcCap;
+          newInfo->ipcProxyconn = proxyConn;
+          regRecord->ipcInfos[peerLocalRank] = newInfo;
+          if (regRecord->regIpcAddrs.hostPeerRmtAddrs == NULL) {
+            NCCLCHECKGOTO(ncclCalloc(&regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks), ret, fail);
+          }
+          regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank] = (uintptr_t)rmtRegAddr;
+          needUpdate = true;
+          *regBufFlag = 1;
+          INFO(NCCL_REG, "rank %d - IPC register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr);
        }
-        *offsetOut = (uintptr_t)userbuff - regRecord->addr;
-        *peerRmtAddrsOut = peerRmtAddrs;
      }
    }
+
+    if (*regBufFlag) {
+      if (type == NCCL_IPC_COLLECTIVE) {
+        // for collective, store registered remote buffers into dev memory for future reference
+        if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL || needUpdate) {
+          NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
+          if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL)
+            NCCLCHECKGOTO(ncclCudaCallocAsync(&regRecord->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
+          if (needUpdate)
+            NCCLCHECKGOTO(ncclCudaMemcpyAsync(regRecord->regIpcAddrs.devPeerRmtAddrs, regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
+          NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail);
+          NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail);
+        }
+        peerRmtAddrs = regRecord->regIpcAddrs.devPeerRmtAddrs;
+      } else {
+        assert(nPeers == 1);
+        // p2p always returns remote addr here since remote buffer addr is passed in ncclDevWorkP2p struct
+        peerRmtAddrs = (uintptr_t*)regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank];
+      }
+      *offsetOut = (uintptr_t)userbuff - regRecord->addr;
+      *peerRmtAddrsOut = peerRmtAddrs;
+    }
+  }
+exit:
+  return ret;
+fail:
+  *regBufFlag = 0;
+  *offsetOut = 0;
+  *peerRmtAddrsOut = NULL;
+  if (newInfo) free(newInfo);
+  WARN("rank %d failed to IPC register userbuff %p buffSize %ld nPeers %d isLegacyIpc %p", comm->rank, userbuff, buffSize, nPeers, isLegacyIpc);
+  goto exit;
+}
+
+ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut) {
+  ncclResult_t ret = ncclSuccess;
+  struct ncclReg *regRecord = NULL;
+  bool isValid = false;
+
+  *regBufFlag = 0;
+  *offsetOut = 0;
+  *peerRmtAddrsOut = NULL;
+  if (comm && userbuff && buffSize > 0 && nPeers > 0) {
+    NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, &regRecord), ret, fail);
+    NCCLCHECKGOTO(ncclRegLocalIsValid(regRecord, &isValid), ret, fail);
+    if (isValid)
+      NCCLCHECKGOTO(ipcRegisterBuffer(comm, userbuff, buffSize, peerRanks, nPeers, type, regRecord, regBufFlag, offsetOut, peerRmtAddrsOut, NULL), ret, fail);
  }

 exit:
@@ -943,147 +964,56 @@ fail:
  *regBufFlag = 0;
  *offsetOut = 0;
  *peerRmtAddrsOut = NULL;
-  if (newInfo) free(newInfo);
  goto exit;
 }

 struct ncclIpcCleanupCallback {
  struct ncclCommCallback base;
-  bool isAddrs;
-  union {
-    struct ncclIpcRegInfo regInfo;
-    struct ncclPeerRegIpcAddr regIpcAddrs;
-  };
+  struct ncclComm *comm; // communicator that cleanupIpc passes to ncclCommGraphDeregister()
+  struct ncclReg *reg; // registration record that cleanupIpc deregisters
 };

 static ncclResult_t cleanupIpc(struct ncclComm* comm, struct ncclCommCallback* cb) {
  struct ncclIpcCleanupCallback* obj = (struct ncclIpcCleanupCallback*)cb;
-  if (obj->isAddrs) {
-    if (obj->regIpcAddrs.hostPeerRmtAddrs)
-      free(obj->regIpcAddrs.hostPeerRmtAddrs);
-    if (obj->regIpcAddrs.devPeerRmtAddrs)
-      NCCLCHECK(ncclCudaFree(obj->regIpcAddrs.devPeerRmtAddrs));
-  } else {
-    NCCLCHECK(ncclIpcDeregBuffer(comm, &obj->regInfo));
-  }
+  NCCLCHECK(ncclCommGraphDeregister(obj->comm, obj->reg)); // deregisters with the comm/reg saved in the record (not the comm argument); NOTE(review): if NCCLCHECK returns on error, obj is never freed - confirm
  free(obj);
  return ncclSuccess;
 }

 ncclResult_t ncclIpcGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut, void* cleanupQueuePtr, int* nCleanupQueueElts) {
  ncclResult_t ret = ncclSuccess;
-  struct ncclProxyConnector* proxyConn = NULL;
-  struct p2pIpcExpInfo ipcInfo;
  void* baseAddr;
  size_t baseSize;
  struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue = reinterpret_cast<struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>*>(cleanupQueuePtr);
-  uintptr_t* peerRmtAddrs = NULL;
-  struct ncclIpcCleanupCallback* addrsRecord = NULL;
+  bool isLegacyIpc = false;
+  struct ncclReg *regRecord = NULL;

  *regBufFlag = 0;
-  CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail);
-  CUCHECKGOTO(cuPointerGetAttribute((void*)&ipcInfo.legacyIpcCap, CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, (CUdeviceptr)baseAddr), ret, fail);
-
-  if (type == NCCL_IPC_COLLECTIVE) {
-    // collective needs host memory array to hold all remote buffer addrs.
-    // We need to put this into graph release queue
-    NCCLCHECKGOTO(ncclCalloc(&addrsRecord, 1), ret, fail);
-    addrsRecord->base.fn = cleanupIpc;
-    addrsRecord->isAddrs = true;
-    NCCLCHECKGOTO(ncclCalloc(&addrsRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks), ret, fail);
-  } else {
-    assert(nPeers == 1);
-    // p2p does not need anything, just returning the remote buffer is enough, but for now, we register
-    // peer one by one so nPeers must be 1
-  }
-
-  for (int p = 0; p < nPeers; ++p) {
-    int peerRank = peerRanks[p];
-    if (comm->gproxyConn[peerRank].initialized == false)
-      NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, peerRank, &comm->gproxyConn[peerRank]), ret, fail);
-    proxyConn = &comm->gproxyConn[peerRank];
-    // Same as local registration. Get the mem handle for that buffer. It may have been allocated through
-    // cudaMalloc in which case we'll get the CUDA legacy mem handle, or through cuMem*.
-    if (ipcInfo.legacyIpcCap) {
-      if (comm->directMode) goto fail;
-      CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
-    } else if (ncclCuMemEnable()) {
-      // cuMem* export
-      CUmemGenericAllocationHandle handle;
-      if (pfn_cuMemRetainAllocationHandle(&handle, baseAddr) != CUDA_SUCCESS) {
-        if (comm->directMode) goto fail;
-        CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
-        ipcInfo.legacyIpcCap = true;
-      } else {
-        if (proxyConn->sameProcess) {
-          memcpy(&ipcInfo.ipcDesc.memHandle, &handle, sizeof(CUmemGenericAllocationHandle));
-        } else {
-          if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
-            int expFd = -1;
-            CUCHECKGOTO(cuMemExportToShareableHandle(&expFd, handle, ncclCuMemHandleType, 0), ret, fail);
-            if (proxyConn->sameProcess) {
-              ipcInfo.impFd = expFd;
-            } else {
-              NCCLCHECKGOTO(ncclProxyClientQueryFdBlocking(comm, proxyConn, expFd, &ipcInfo.impFd), ret, fail);
-              SYSCHECKGOTO(close(expFd), "close", ret, fail);
-            }
-          } else {
-            CUCHECKGOTO(cuMemExportToShareableHandle(&ipcInfo.ipcDesc.cuDesc.handle, handle, ncclCuMemHandleType, 0), ret, fail);
-          }
-        }
-        CUCHECKGOTO(cuMemRelease(handle), ret, fail);
-      }
-    } else {
-      goto fail;
-    }
-
-    void* rmtRegAddr = NULL;
-    ipcInfo.size = baseSize;
-    ipcInfo.offset = 0;
-    NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(struct p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail);
-    if (rmtRegAddr) {
+  *offsetOut = 0;
+  *peerRmtAddrsOut = NULL;
+  if (comm && userbuff && buffSize > 0 && nPeers > 0) {
+    CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail);
+    NCCLCHECKGOTO(ncclCommGraphRegister(comm, baseAddr, baseSize, (void**)&regRecord), ret, fail);
+    NCCLCHECKGOTO(ipcRegisterBuffer(comm, userbuff, buffSize, peerRanks, nPeers, type, regRecord, regBufFlag, offsetOut, peerRmtAddrsOut, &isLegacyIpc), ret, fail);
+    if (*regBufFlag) {
      struct ncclIpcCleanupCallback* record;
      NCCLCHECKGOTO(ncclCalloc(&record, 1), ret, fail);
      record->base.fn = cleanupIpc;
-      record->isAddrs = false;
-      record->regInfo.peerRank = peerRank;
-      record->regInfo.baseAddr = baseAddr;
-      record->regInfo.impInfo.rmtRegAddr = rmtRegAddr;
-      record->regInfo.impInfo.offset = 0;
-      record->regInfo.impInfo.legacyIpcCap = ipcInfo.legacyIpcCap;
-      record->regInfo.ipcProxyconn = proxyConn;
-      // store the remote address into host addr array
-      if (type == NCCL_IPC_COLLECTIVE)
-        addrsRecord->regIpcAddrs.hostPeerRmtAddrs[comm->rankToLocalRank[peerRank]] = (uintptr_t)rmtRegAddr;
-      else
-        peerRmtAddrs = (uintptr_t*)rmtRegAddr;
-      *regBufFlag = 1;
-      if (ipcInfo.legacyIpcCap)
-        ncclIntruQueueEnqueue(&comm->legacyRegCleanupQueue, &record->base);
-      else
-        ncclIntruQueueEnqueue(cleanupQueue, &record->base);
-      if (nCleanupQueueElts) *nCleanupQueueElts += 1;
-      INFO(NCCL_REG, "rank %d - IPC graph register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, baseAddr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - (uintptr_t)baseAddr);
+      record->comm = comm;
+      record->reg = regRecord;
+      if (isLegacyIpc) {
+        ncclIntruQueueEnqueue(&comm->legacyRegCleanupQueue, (struct ncclCommCallback*)record);
+      } else {
+        ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)record);
+        if (nCleanupQueueElts) *nCleanupQueueElts += 1;
+      }
+    } else {
+      NCCLCHECKGOTO(ncclCommGraphDeregister(comm, regRecord), ret, fail);
    }
  }

-  if (type == NCCL_IPC_COLLECTIVE) {
-    // allocate the dev addr array and copy all previously stored addrs into it.
-    NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
-    NCCLCHECKGOTO(ncclCudaCallocAsync(&addrsRecord->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
-    NCCLCHECKGOTO(ncclCudaMemcpyAsync(addrsRecord->regIpcAddrs.devPeerRmtAddrs, addrsRecord->regIpcAddrs.hostPeerRmtAddrs, comm->nRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
-    NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail);
-    NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail);
-    peerRmtAddrs = addrsRecord->regIpcAddrs.devPeerRmtAddrs;
-    if (ipcInfo.legacyIpcCap)
-      ncclIntruQueueEnqueue(&comm->legacyRegCleanupQueue, &addrsRecord->base);
-    else
-      ncclIntruQueueEnqueue(cleanupQueue, &addrsRecord->base);
-  }
-  *offsetOut = (uintptr_t)userbuff - (uintptr_t)baseAddr;
-  *peerRmtAddrsOut = peerRmtAddrs;
-
 exit:
+  // coverity[leaked_storage:FALSE] => normally, record is added to the cleanupQueue
  return ret;
 fail:
  *regBufFlag = 0;
@@ -454,6 +454,7 @@ static ncclResult_t shmRecvProxyProgress(struct ncclProxyState* proxyState, stru
 }

 static ncclResult_t shmSendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+  ncclResult_t result = ncclSuccess;
  struct shmRequest* req = (struct shmRequest*)reqBuff;
  /* check message size */
  if (reqSize != sizeof(struct shmRequest)) return ncclInternalError;
@@ -463,13 +464,18 @@ static ncclResult_t shmSendProxySetup(struct ncclProxyConnection* connection, st
  struct shmProxyInfo* proxyInfo;

  NCCLCHECK(ncclCalloc(&proxyInfo, 1));
-  NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr));
+  NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail);
  memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t));
  connection->transportResources = proxyInfo;
-  return ncclSuccess;
+exit:
+  return result;
+fail:
+  free(proxyInfo);
+  goto exit;
 }

 static ncclResult_t shmRecvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+  ncclResult_t result = ncclSuccess;
  struct shmRequest* req = (struct shmRequest*)reqBuff;
  /* check message size */
  if (reqSize != sizeof(struct shmRequest)) return ncclInternalError;
@@ -479,10 +485,14 @@ static ncclResult_t shmRecvProxySetup(struct ncclProxyConnection* connection, st
  struct shmProxyInfo* proxyInfo;

  NCCLCHECK(ncclCalloc(&proxyInfo, 1));
-  NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr));
+  NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail);
  memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t));
  connection->transportResources = proxyInfo;
-  return ncclSuccess;
+exit:
+  return result;
+fail:
+  free(proxyInfo);
+  goto exit;
 }

 static void initCeOperation() {
@@ -534,7 +544,7 @@ ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool l
  } else {
    char shmPath[SHM_PATH_MAX] = { '\0' };
    desc->shmli.shmSize = size;
-    NCCLCHECK(ncclShmOpen(shmPath, size, hptr, dptr, 1, &desc->shmli.handle));
+    NCCLCHECK(ncclShmOpen(shmPath, sizeof(shmPath), size, hptr, dptr, 1, &desc->shmli.handle));
    memcpy(desc->shmli.shmSuffix, shmPath + sizeof("/dev/shm/nccl-") - 1, sizeof(desc->shmli.shmSuffix));
    desc->legacy = true;
    INFO(NCCL_SHM, "MMAP allocated shareable host buffer %s size %zi ptr %p", shmPath, desc->shmli.shmSize, *hptr);
@@ -542,7 +552,7 @@ ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool l
 #else /* CUDART_VERSION >= 12020 */
  char shmPath[SHM_PATH_MAX] = { '\0' };
  desc->shmli.shmSize = size;
-  NCCLCHECK(ncclShmOpen(shmPath, size, hptr, dptr, 1, &desc->shmli.handle));
+  NCCLCHECK(ncclShmOpen(shmPath, sizeof(shmPath), size, hptr, dptr, 1, &desc->shmli.handle));
  memcpy(desc->shmli.shmSuffix, shmPath + sizeof("/dev/shm/nccl-") - 1, sizeof(desc->shmli.shmSuffix));
  desc->legacy = true;
  INFO(NCCL_SHM, "MMAP allocated shareable host buffer %s size %zi ptr %p", shmPath, size, *hptr);
@@ -618,15 +628,15 @@ ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_
    INFO(NCCL_SHM, "CUMEM imported shareable host buffer from tpProxyRank %d size %zi ptr %p, granularity %ld", desc->shmci.tpProxyRank, desc->shmci.size, descOut->shmci.ptr, granularity);
  } else {
    char shmPath[SHM_PATH_MAX];
-    sprintf(shmPath, "/dev/shm/nccl-%s", desc->shmli.shmSuffix);
-    NCCLCHECK(ncclShmOpen(shmPath, desc->shmli.shmSize, hptr, dptr, -1, &descOut->shmli.handle));
+    snprintf(shmPath, sizeof(shmPath), "/dev/shm/nccl-%s", desc->shmli.shmSuffix);
+    NCCLCHECK(ncclShmOpen(shmPath, sizeof(shmPath), desc->shmli.shmSize, hptr, dptr, -1, &descOut->shmli.handle));
    descOut->legacy = true;
    INFO(NCCL_SHM, "MMAP imported shareable host buffer %s size %zi ptr %p", shmPath, desc->shmli.shmSize, *hptr);
  }
 #else /* CUDART_VERSION >= 12020 */
  char shmPath[SHM_PATH_MAX];
-  sprintf(shmPath, "/dev/shm/nccl-%s", desc->shmli.shmSuffix);
-  NCCLCHECK(ncclShmOpen(shmPath, desc->shmli.shmSize, hptr, dptr, -1, &descOut->shmli.handle));
+  snprintf(shmPath, sizeof(shmPath), "/dev/shm/nccl-%s", desc->shmli.shmSuffix);
+  NCCLCHECK(ncclShmOpen(shmPath, sizeof(shmPath), desc->shmli.shmSize, hptr, dptr, -1, &descOut->shmli.handle));
  descOut->legacy = true;
  INFO(NCCL_SHM, "MMAP imported shareable host buffer %s size %zi ptr %p", shmPath, desc->shmli.shmSize, *hptr);
 #endif