Merge remote-tracking branch 'nccl/master' into develop

[ROCm/rccl commit: e1a835910e]
2024-04-23 13:33:19 -07:00
@@ -278,11 +278,12 @@ set(SRC_FILES
  src/enqueue.cc
  src/group.cc
  src/init.cc
+  src/init_nvtx.cc
  src/net.cc
  src/msccl.cc
  src/proxy.cc
+  src/register.cc
  src/transport.cc
-  src/init_nvtx.cc
 # src/clique/AllReduceCliqueKernel.h
 # src/clique/CliqueCommon.h
 # src/clique/CliqueManager.cc
@@ -370,6 +371,7 @@ set(SRC_FILES
  src/include/profiler.h
  src/include/proxy.h
  src/include/rccl_vars.h
+  src/include/register.h
  src/include/rccl_float8.h
  src/include/rocm_smi_wrap.h
  src/include/rocmwrap.h
@@ -17,13 +17,14 @@
 #define NCCL_PTR_DMABUF 0x4

 // Maximum number of requests per comm object
-#define NCCL_NET_MAX_REQUESTS 8
+#define NCCL_NET_MAX_REQUESTS 32

 typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
 typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys;

 typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

+#include "net_v8.h"
 #include "net_v7.h"
 #include "net_v6.h"
 #include "net_v5.h"
@@ -26,6 +26,7 @@ typedef struct {
  int needsProxyProgress;
 } ncclNetDeviceHandle_v7_t;

+typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
 typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t;

 #endif
@@ -5,6 +5,8 @@
 #ifndef NCCL_NET_V6_H_
 #define NCCL_NET_V6_H_

+#define NCCL_NET_MAX_REQUESTS_V6 8
+
 typedef struct {
  char* name;     // Used mostly for logging.
  char* pciPath;  // Path to the PCI device in /sys.
@@ -22,8 +22,6 @@ typedef struct {
  int netDeviceVersion;            // Version number for network offload
 } ncclNetProperties_v7_t;

-typedef ncclNetProperties_v7_t ncclNetProperties_t;
-
 typedef struct {
  // Name of the network (mainly for logs)
  const char* name;
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NCCL_NET_V8_H_
+#define NCCL_NET_V8_H_
+
+#include "net_device.h"
+
+typedef struct {
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int regIsGlobal;                 // regMr is not tied to a particular comm
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+} ncclNetProperties_v8_t;
+
+typedef ncclNetProperties_v8_t ncclNetProperties_t;
+
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has been completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+} ncclNet_v8_t;
+
+#endif // end include guard
@@ -15,15 +15,37 @@ __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess;

 __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; }
 __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; }
-__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v7_t* props) {
-  //pluginPciPath(dev, &props.pciPath);
-  //pluginPtrSupport(dev, &props.ptrSupport);
+__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v8_t* props) {
+  // Below are default values, if unsure don't change.
+
+  props->name = "Example";
+  // Fill for proper topology detection, e.g. /sys/devices/pci0000:00/0000:00:10.0/0000:0b:00.0
+  props->pciPath = NULL;
+  // Only used to detect NICs with multiple PCI attachments.
+  props->guid = 0;
+  // Add NCCL_PTR_CUDA if GPU Direct RDMA is supported and regMr can take CUDA pointers.
+  props->ptrSupport = NCCL_PTR_HOST;
+  // If your regMr has a fast registration cache, set to 1. If set to 0, user buffer registration may be disabled.
+  props->regIsGlobal = 0;
+  // Speed in *Mbps*. 100000 means 100G
+  props->speed = 100000;
+  // Port number, used in conjunction with guid
+  props->port = 0;
+  // Custom latency (used to help tuning if latency is high). If set to 0, use default NCCL values.
+  props->latency = 0;
+  // Maximum number of comm objects we can create.
+  props->maxComms = 1024*1024;
+  // Maximum number of receive operations taken by irecv().
+  props->maxRecvs = 1;
+  // Coupling with NCCL network device-side code.
+  props->netDeviceType = 0;
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
  return ncclInternalError;
 }
 __hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; }
-__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm) { return ncclInternalError; }
-__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm) { return ncclInternalError; }
-__hidden ncclResult_t pluginRegMr(void* collComm, void* data, int size, int type, void** mhandle) { return ncclInternalError; }
+__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) { return ncclInternalError; }
 __hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; }
 __hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;}
 __hidden ncclResult_t pluginIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { return ncclInternalError; }
@@ -38,7 +60,7 @@ __hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_m

 #define PLUGIN_NAME "Plugin"

-const ncclNet_v7_t ncclNetPlugin_v7 = {
+const ncclNet_v8_t ncclNetPlugin_v8 = {
  .name = PLUGIN_NAME,
  .init = pluginInit,
  .devices = pluginDevices,
@@ -60,10 +82,62 @@ const ncclNet_v7_t ncclNetPlugin_v7 = {
  .irecvConsumed = pluginIrecvConsumed,
 };

-__hidden ncclResult_t pluginGetProperties_v6(int dev, ncclNetProperties_v6_t* props) {
-  //pluginPciPath(dev, &props.pciPath);
-  //pluginPtrSupport(dev, &props.ptrSupport);
-  return ncclInternalError;
+__hidden ncclResult_t pluginGetProperties_v7(int dev, ncclNetProperties_v7_t* props_v7) {
+  ncclNetProperties_t props;
+  ncclResult_t ret = pluginGetProperties(dev, &props);
+  if (ret != ncclSuccess) return ret;
+  props_v7->name = props.name;
+  props_v7->pciPath = props.pciPath;
+  props_v7->guid = props.guid;
+  props_v7->ptrSupport = props.ptrSupport;
+  props_v7->speed = props.speed;
+  props_v7->port = props.port;
+  props_v7->maxComms = props.maxComms;
+  props_v7->maxRecvs = props.maxRecvs;
+  props_v7->netDeviceType = props.netDeviceType;
+  props_v7->netDeviceVersion = props.netDeviceVersion;
+  return ncclSuccess;
+}
+
+__hidden ncclResult_t pluginRegMr_v7(void* collComm, void* data, int size, int type, void** mhandle) {
+  return pluginRegMr(collComm, data, size, type, mhandle);
+}
+
+const ncclNet_v7_t ncclNetPlugin_v7 = {
+  .name = PLUGIN_NAME,
+  .init = pluginInit,
+  .devices = pluginDevices,
+  .getProperties = pluginGetProperties_v7,
+  .listen = pluginListen,
+  .connect = pluginConnect,
+  .accept = pluginAccept,
+  .regMr = pluginRegMr_v7,
+  .regMrDmaBuf = pluginRegMrDmaBuf,
+  .deregMr = pluginDeregMr,
+  .isend = pluginIsend,
+  .irecv = pluginIrecv,
+  .iflush = pluginIflush,
+  .test = pluginTest,
+  .closeSend = pluginCloseSend,
+  .closeRecv = pluginCloseRecv,
+  .closeListen = pluginCloseListen,
+  .getDeviceMr = pluginGetDeviceMr,
+  .irecvConsumed = pluginIrecvConsumed,
+};
+
+__hidden ncclResult_t pluginGetProperties_v6(int dev, ncclNetProperties_v6_t* props_v6) {
+  ncclNetProperties_t props;
+  ncclResult_t ret = pluginGetProperties(dev, &props);
+  if (ret != ncclSuccess) return ret;
+  props_v6->name = props.name;
+  props_v6->pciPath = props.pciPath;
+  props_v6->guid = props.guid;
+  props_v6->ptrSupport = props.ptrSupport;
+  props_v6->speed = props.speed;
+  props_v6->port = props.port;
+  props_v6->maxComms = props.maxComms;
+  props_v6->maxRecvs = props.maxRecvs;
+  return ncclSuccess;
 }

 __hidden ncclResult_t pluginConnect_v6(int dev, void* handle, void** sendComm) { return ncclInternalError; }
@@ -77,7 +151,7 @@ const ncclNet_v6_t ncclNetPlugin_v6 = {
  .listen = pluginListen,
  .connect = pluginConnect_v6,
  .accept = pluginAccept_v6,
-  .regMr = pluginRegMr,
+  .regMr = pluginRegMr_v7,
  .regMrDmaBuf = pluginRegMrDmaBuf,
  .deregMr = pluginDeregMr,
  .isend = pluginIsend,
@@ -98,7 +172,7 @@ const ncclNet_v5_t ncclNetPlugin_v5 = {
  .listen = pluginListen,
  .connect = pluginConnect_v6,
  .accept = pluginAccept_v6,
-  .regMr = pluginRegMr,
+  .regMr = pluginRegMr_v7,
  .deregMr = pluginDeregMr,
  .isend = pluginIsend,
  .irecv = pluginIrecv,
@@ -110,17 +184,17 @@ const ncclNet_v5_t ncclNetPlugin_v5 = {
 };

 /* v4 Compat */
-static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* props) {
-  ncclNetProperties_v6_t props_v6;
-  ncclResult_t ret = pluginGetProperties_v6(dev, &props_v6);
+static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* props_v4) {
+  ncclNetProperties_t props;
+  ncclResult_t ret = pluginGetProperties(dev, &props);
  if (ret != ncclSuccess) return ret;
-  props->name = props_v6.name;
-  props->pciPath = props_v6.pciPath;
-  props->guid = props_v6.guid;
-  props->ptrSupport = props_v6.ptrSupport;
-  props->speed = props_v6.speed;
-  props->port = props_v6.port;
-  props->maxComms = props_v6.maxComms;
+  props_v4->name = props.name;
+  props_v4->pciPath = props.pciPath;
+  props_v4->guid = props.guid;
+  props_v4->ptrSupport = props.ptrSupport;
+  props_v4->speed = props.speed;
+  props_v4->port = props.port;
+  props_v4->maxComms = props.maxComms;
  return ncclSuccess;
 }
 static ncclResult_t pluginIsend_v4(void *sendComm, void* data, int size, void *mhandle, void** request) {
@@ -157,7 +231,7 @@ const ncclNet_v4_t ncclNetPlugin_v4 = {
  .listen = pluginListen,
  .connect = pluginConnect_v4,
  .accept = pluginAccept_v4,
-  .regMr = pluginRegMr,
+  .regMr = pluginRegMr_v7,
  .deregMr = pluginDeregMr,
  .isend = pluginIsend_v4,
  .irecv = pluginIrecv_v4,
@@ -202,7 +276,7 @@ const ncclNet_v3_t ncclNetPlugin_v3 = {
  .listen = pluginListen_v3,
  .connect = pluginConnect_v3,
  .accept = pluginAccept_v4,
-  .regMr = pluginRegMr,
+  .regMr = pluginRegMr_v7,
  .deregMr = pluginDeregMr,
  .isend = pluginIsend_v4,
  .irecv = pluginIrecv_v4,
@@ -223,7 +297,7 @@ const ncclNet_v2_t ncclNetPlugin_v2 = {
  .listen = pluginListen,
  .connect = pluginConnect_v4,
  .accept = pluginAccept_v4,
-  .regMr = pluginRegMr,
+  .regMr = pluginRegMr_v7,
  .deregMr = pluginDeregMr,
  .isend = pluginIsend_v4,
  .irecv = pluginIrecv_v4,
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
-NCCL_MINOR   := 19
-NCCL_PATCH   := 4
+NCCL_MINOR   := 20
+NCCL_PATCH   := 5
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
@@ -10,7 +10,7 @@ include ../makefiles/version.mk
 INCEXPORTS  := nccl.h nccl_net.h
 LIBSRCFILES := \
 	bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
-	init.cc init_nvtx.cc net.cc proxy.cc transport.cc \
+	init.cc init_nvtx.cc net.cc proxy.cc transport.cc register.cc \
 	$(wildcard graph/*.cc) \
 	$(wildcard misc/*.cc) \
 	$(wildcard transport/*.cc)
@@ -222,6 +222,7 @@ struct bootstrapState {
  struct ncclSocket ringSendSocket;
  union ncclSocketAddress* peerCommAddresses;
  union ncclSocketAddress* peerProxyAddresses;
+  uint64_t* peerProxyAddressesUDS;
  struct unexConn* unexpectedConnections;
  int cudaDev;
  int rank;
@@ -300,6 +301,7 @@ ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm*

  // Create the service proxy
  NCCLCHECK(ncclCalloc(&state->peerProxyAddresses, nranks));
+  NCCLCHECK(ncclCalloc(&state->peerProxyAddressesUDS, nranks));

  // proxy is aborted through a message; don't set abortFlag
  NCCLCHECK(ncclCalloc(&proxySocket, 1));
@@ -307,7 +309,13 @@ ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm*
  NCCLCHECK(ncclSocketListen(proxySocket));
  NCCLCHECK(ncclSocketGetAddr(proxySocket, state->peerProxyAddresses+rank));
  NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)));
-  NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses));
+  // cuMem UDS support
+  // Make sure we create a unique UDS socket name
+  uint64_t randId;
+  NCCLCHECK(getRandomData(&randId, sizeof(randId)));
+  state->peerProxyAddressesUDS[rank] = getPidHash()+randId;
+  NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddressesUDS, sizeof(*state->peerProxyAddressesUDS)));
+  NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS));

  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);

@@ -360,8 +368,6 @@ ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm*
    for (int i = 0; i < nranks; ++i) {
      comm->topParentRanks[i] = parent->topParentRanks[parentRanks[i]];
    }
-    comm->proxyState = parent->sharedRes->proxyState;
-    ncclAtomicRefCountIncrement(&parent->sharedRes->proxyState->refCount);
  } else {
    // Create the service proxy
    NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddresses, nranks), ret, fail);
@@ -371,10 +377,17 @@ ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm*
    NCCLCHECKGOTO(ncclSocketGetAddr(proxySocket, &tmpAddr), ret, fail);
    memcpy(state->peerProxyAddresses + rank, &tmpAddr, sizeof(union ncclSocketAddress));
    NCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)), ret, fail);
-    NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses), ret, fail);
+    // cuMem UDS support
+    NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddressesUDS, nranks), ret, fail);
+    // Make sure we create a unique UDS socket name
+    uint64_t randId;
+    NCCLCHECKGOTO(getRandomData(&randId, sizeof(randId)), ret, fail);
+    state->peerProxyAddressesUDS[rank] = getPidHash()+randId;
+    NCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddressesUDS, sizeof(*state->peerProxyAddressesUDS)), ret, fail);
+    NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS), ret, fail);
  }

-  INFO(NCCL_INIT, "bootstrapSplit: rank %d nranks %d color %d key %d prev %d next %d - DONE", rank, nranks, color, key, prev, next);
+  INFO(NCCL_INIT, "bootstrapSplit: comm %p parent %p rank %d nranks %d color %d key %d prev %d next %d - DONE", comm, parent, rank, nranks, color, key, prev, next);

 exit:
  return ret;
@@ -573,7 +586,7 @@ ncclResult_t bootstrapClose(void* commState) {
  struct bootstrapState* state = (struct bootstrapState*)commState;
  if (state->unexpectedConnections != NULL) {
    unexpectedFree(state);
-    if (*state->abortFlag == 0) {
+    if (__atomic_load_n(state->abortFlag, __ATOMIC_RELAXED) == 0) {
      WARN("Unexpected connections are not empty");
      return ncclInternalError;
    }
@@ -597,6 +610,7 @@ ncclResult_t bootstrapAbort(void* commState) {
  NCCLCHECK(ncclSocketClose(&state->ringRecvSocket));
  free(state->peerCommAddresses);
  free(state->peerProxyAddresses);
+  free(state->peerProxyAddressesUDS);
  free(state);
  return ncclSuccess;
 }
@@ -191,6 +191,9 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
    va_start(vargs, fmt);
    len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
    va_end(vargs);
+    // vsnprintf may return len > sizeof(buffer) in the case of a truncated output.
+    // Rewind len so that we can replace the final \0 by \n
+    if (len > sizeof(buffer)) len = sizeof(buffer)-1;
    buffer[len++] = '\n';
    fwrite(buffer, 1, len, ncclDebugFile);
  }
@@ -17,20 +17,21 @@ namespace {
  __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
 #endif
    const int tid = threadIdx.x;
-    const int nthreads = args->nWarps*WARP_SIZE;
-    const int bid = args->bid;
-    const int nChannels = args->nChannels;
+    const int nthreads = (int)args->nWarps * WARP_SIZE;
    ncclRing *ring = &ncclShmem.channel.ring;
    const int *ringRanks = ring->userRanks;
-    const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLGATHER_CHUNKSTEPS : 1));
-    // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
-    const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
    const int nranks = ncclShmem.comm.nRanks;
-    const ssize_t loopSize = nChannels*int(chunkSize);
-    const ssize_t size = args->count;
+    const size_t chunkCount = args->chunkCount;
+    const size_t channelCount = args->workCount;
+    const size_t gridOffset = args->workOffset;
+    const size_t count = args->count;
+    size_t offset;
+    size_t dataOffset;
+    int nelem;
+    int rankDest;

 #if defined(ENABLE_NPKIT)
-    int npKitCtxIdx = bid;
+    int npKitCtxIdx = gridOffset / channelCount;
 #endif

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
@@ -50,7 +51,7 @@ namespace {

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_ENTRY)
    if (tid == 0) {
-      NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_ENTRY, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
+      NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_ENTRY, count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
    }
 #endif
@@ -66,28 +67,14 @@ namespace {
    }
 #endif

-    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-      ssize_t realChunkSize;
-      if (Proto::Id == NCCL_PROTO_SIMPLE) {
-        realChunkSize = min(chunkSize, divUp(size-gridOffset,nChannels));
-        realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-      }
-      else if (Proto::Id == NCCL_PROTO_LL)
-        realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
-      else if (Proto::Id == NCCL_PROTO_LL128)
-        realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128);
-      realChunkSize = int(realChunkSize);
-
-      ssize_t chunkOffset = gridOffset + int(bid*realChunkSize);
-
+    for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
      /////////////// begin AllGather steps ///////////////
-      ssize_t offset;
-      int nelem = min(realChunkSize, size-chunkOffset);
-      int rankDest;
+      nelem = min(chunkCount, channelCount - elemOffset);
+      dataOffset = gridOffset + elemOffset;

      // step 0: push data to next GPU
      rankDest = ringRanks[0];
-      offset = chunkOffset + rankDest * size;
+      offset = dataOffset + rankDest * count;

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_SEND_ENTRY)
      if (tid == 0) {
@@ -97,10 +84,10 @@ namespace {
      }
 #endif

-      if (inputBuf + chunkOffset == outputBuf + offset) { // In place
-        prims.directSend(chunkOffset, offset, nelem);
+      if (inputBuf + dataOffset == outputBuf + offset) { // In place
+        prims.directSend(dataOffset, offset, nelem);
      } else {
-        prims.directCopySend(chunkOffset, offset, nelem);
+        prims.directCopySend(dataOffset, offset, nelem);
      }

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_SEND_EXIT)
@@ -121,7 +108,7 @@ namespace {
      // k-2 steps: copy to next GPU
      for (int j=1; j<nranks-1; ++j) {
        rankDest = ringRanks[nranks-j];
-        offset = chunkOffset + rankDest * size;
+        offset = dataOffset + rankDest * count;

        prims.directRecvCopySend(offset, nelem);
      }
@@ -135,7 +122,7 @@ namespace {

      // Make final copy from buffer to dest.
      rankDest = ringRanks[1];
-      offset = chunkOffset + rankDest * size;
+      offset = dataOffset + rankDest * count;

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_ENTRY)
      if (tid == 0) {
@@ -159,7 +146,7 @@ namespace {
    }
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_EXIT)
    if (tid == 0) {
-      NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_EXIT, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
+      NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_EXIT, count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
    }
 #endif
@@ -192,13 +179,14 @@ template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
  __device__ __forceinline__ void run(ncclWorkElem *args) {
    const int tid = threadIdx.x;
-    const int bid = args->bid;
-    const int nChannels = args->nChannels;
    struct ncclNvls* nvls = &ncclShmem.channel.nvls;
-    const ssize_t chunkSize = int(args->lastChunkSize);
-    const ssize_t size = args->count;
-    const ssize_t loopSize = nChannels*chunkSize;
+    const ssize_t count = args->count;
    const ssize_t rank = ncclShmem.comm.rank;
+    const size_t chunkCount = args->chunkCount;
+    size_t gridOffset = args->workOffset;
+    size_t channelCount = args->workCount;
+    size_t offset;
+    int nelem;

    const int nThreadsBcast = args->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : 4 * WARP_SIZE;
    const int nThreadsGather = args->regUsed ? WARP_SIZE : NCCL_MAX_NTHREADS - nThreadsBcast;
@@ -212,10 +200,10 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
        Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
          prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
            args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + bid * chunkSize;
-          int nelem = min(chunkSize, size - offset);
-          prims.gather(offset, nvls->nHeads * size, nelem, size, -1, 0);
+        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
+          offset = gridOffset + elemOffset;
+          nelem = min(chunkCount, channelCount - elemOffset);
+          prims.gather(offset, nvls->nHeads * count, nelem, count, -1, 0);
        }
      } else if (tid < tidEndBcast) {
        // Bcast through NVLS
@@ -223,9 +211,9 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
        Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
          prims(tid - tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL,
            args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + bid * chunkSize;
-          int nelem = min(chunkSize, size - offset);
+        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
+          offset = gridOffset + elemOffset;
+          nelem = min(chunkCount, channelCount - elemOffset);
          prims.send(offset, nelem);
        }
      }
@@ -240,7 +228,7 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
        /* used as sync */
        prims.scatter(0, 0, 0, 0, -1, 0);

-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
          prims.gather(0, 0, 0, 0, -1, 0);
        }
      } else if (tid < tidEndBcast) {
@@ -251,13 +239,158 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
        /* used as sync */
        prims.recv(0, 0);

-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t inpOffset = gridOffset + bid * chunkSize;
-          ssize_t outOffset = inpOffset + rank * size;
-          int nelem = min(chunkSize, size - inpOffset);
+        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
+          ssize_t inpOffset = gridOffset + elemOffset;
+          ssize_t outOffset = inpOffset + rank * count;
+          nelem = min(chunkCount, channelCount - elemOffset);
          prims.directSend(inpOffset, outOffset, nelem);
        }
      }
    }
  }
 };
+
+template<typename T, typename RedOp>
+struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
+  template<bool BcastSendNotRecv>
+  struct Scatterer {
+    struct ncclWorkElem* args;
+    ssize_t chunkSize;
+    ssize_t railGridOffset;
+
+    template<int SlicePerChunk, int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts>
+    __device__ __forceinline__ void operator()(
+        int tid, int tn, int slice, int maxSliceSize,
+        int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes
+      ) {
+      static_assert(SlicePerChunk==1, "require: SlicePerChunk==1");
+      static_assert(MaxDsts<=1 || MaxSrcs<=1, "require: MaxDsts<=1 || MaxSrcs<=1");
+
+      struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
+      int nNodes = ncclShmem.comm.nNodes;
+      int nRails = direct->nHeads;
+      int bid = args->bid;
+      char* inbuf = (char*)args->sendbuff;
+      char* outbuf = (char*)args->recvbuff;
+      ssize_t sizePerRank = args->count*sizeof(T);
+      bool inPlace = (inbuf == outbuf + ncclShmem.comm.rank*sizePerRank);
+
+      ssize_t railAllBeg = min(railGridOffset + bid*chunkSize, nNodes*sizePerRank);
+      ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank);
+      int railAllSize = railAllEnd - railAllBeg;
+      if (tid < nDsts) dstSizes[tid] = railAllSize;
+
+      int src = 0;
+      int rail;
+      if (BcastSendNotRecv) {
+        rail = direct->headRank;
+      } else {
+        rail = direct->headRank+1;
+        if (rail == nRails) rail = 0;
+      }
+      do {
+        int node = railAllBeg/sizePerRank;
+        int railAllOffset = 0;
+        while (railAllOffset < railAllSize) {
+          ssize_t railOneBeg = node*sizePerRank;
+          ssize_t railOneEnd = railOneBeg + sizePerRank;
+          ssize_t railOneOffset = (railAllBeg+railAllOffset) - railOneBeg;
+          int delta = min(railAllEnd, railOneEnd) - (railAllBeg+railAllOffset);
+          int rank = ncclShmem.comm.collNetDenseToUserRank[node*nRails + rail];
+          ssize_t userOneBeg = rank*sizePerRank + railOneOffset;
+          int outIsDst = (inPlace && rank == ncclShmem.comm.rank) ? 0 : 1;
+          reduceCopy<ncclCollUnroll(), RedOp, T,
+                     /*MultimemSrcs,MinSrcs,MaxSrcs=*/0,1,1,
+                     /*MultimemDsts=*/0, 0+MinDsts, 1+MaxDsts,
+                     /*PreOpSrcs=*/0>
+            (tid, tn, 0, nullptr, false,
+             /*nSrcs=*/1, [=]__device__(int s/*==0*/) -> void* {
+               return (char*)srcPtrs[src] + railAllOffset;
+             },
+             /*nDsts=*/outIsDst+nDsts, [=]__device__(int d) -> void* {
+               return d < outIsDst ? outbuf + userOneBeg
+                                   : (char*)dstPtrs[d-outIsDst] + railAllOffset;
+             },
+             delta);
+          railAllOffset += delta;
+          node += 1;
+        }
+        src += 1;
+        rail += 1;
+        if (rail == nRails) rail = 0;
+      } while (!BcastSendNotRecv && src < nRails-1);
+    }
+  };
+
+  __device__ __forceinline__ void run(ncclWorkElem *args) {
+    int tid = threadIdx.x;
+    const int nChannels = args->nChannels;
+    struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
+    int const &nNodes = ncclShmem.comm.nNodes;
+    ssize_t chunkSize = int(args->chunkCount);
+    ssize_t const &sizePerRank = args->count;
+
+    bool isMultiRail = (direct->nHeads > 1);
+    int nWarps1 = 1;
+    int nWarps2 = (isMultiRail ? 2 : 1);
+    int nWarps3 = (isMultiRail ? 2 : 0);
+    float denom = float(args->nWarps)/float(nWarps1+nWarps2+nWarps3);
+    nWarps3 = int(denom*nWarps3);
+    nWarps2 = int(denom*nWarps2);
+    nWarps1 = args->nWarps - (nWarps2+nWarps3);
+
+    using Proto = ProtoSimple<1, 1>;
+
+    int tn = nWarps1*WARP_SIZE;
+    if (tid < tn) {
+      // Phase 1: send to network
+      Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
+        prims(tid, tn, nullptr, &direct->out, args->sendbuff, nullptr,
+              /*redOpArg=*/0, 0*Proto::MaxGroupWidth, 1, 1);
+      for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
+        ssize_t railAllBeg = railGridOffset + args->bid*chunkSize;
+        ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank);
+        ssize_t railOneBeg = ncclShmem.comm.node*sizePerRank;
+        ssize_t railOneEnd = railOneBeg + sizePerRank;
+        ssize_t beg = max(railAllBeg, railOneBeg);
+        ssize_t end = min(railAllEnd, railOneEnd);
+        prims.send(beg-railOneBeg, max(ssize_t(0), end-beg));
+      }
+      return;
+    }
+    tid -= tn;
+
+    tn = nWarps2*WARP_SIZE;
+    if (tid < tn) {
+      // Phase 2: Recv network -> deposit output + send to bcast
+      Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
+        prims(tid, tn, &direct->out, direct->heads+1, nullptr, nullptr,
+              /*redOpArg=*/0, 1*Proto::MaxGroupWidth, 0, 0);
+      for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
+        Scatterer</*BcastSendNotRecv=*/true> scat;
+        scat.args = args;
+        scat.chunkSize = chunkSize;
+        scat.railGridOffset = railGridOffset;
+        prims.template process</*Recv=*/1, /*Send=*/1>(scat);
+      }
+      return;
+    }
+    tid -= tn;
+
+    tn = nWarps3*WARP_SIZE;
+    if (tid < tn) {
+      // Phase 3: Recv bcast -> deposit output
+      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/0, Proto, 0>
+        prims(tid, tn, direct->heads+1, nullptr, nullptr, nullptr,
+              /*redOpArg=*/0, 2*Proto::MaxGroupWidth, 0, 0);
+      for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
+        Scatterer</*BcastSendNotRecv=*/false> scat;
+        scat.args = args;
+        scat.chunkSize = chunkSize;
+        scat.railGridOffset = railGridOffset;
+        prims.template process</*Recv=*/1, /*Send=*/0>(scat);
+      }
+      return;
+    }
+  }
+};
@@ -21,18 +21,21 @@ namespace {
  __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
 #endif
    const int tid = threadIdx.x;
-    const int nthreads = args->nWarps*WARP_SIZE;
-    const int bid = args->bid;
-    const int nChannels = args->nChannels;
+    const int nthreads = (int)args->nWarps * WARP_SIZE;
    ncclRing *ring = &ncclShmem.channel.ring;
    int ringIx = ring->index;
-    const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLREDUCE_CHUNKSTEPS : 1));
+    ssize_t chunkCount = args->chunkCount;
    const int nranks = ncclShmem.comm.nRanks;
-    const ssize_t loopSize = nChannels*nranks*chunkSize;
+    const ssize_t loopCount = nranks * chunkCount;
+    ssize_t offset;
+    ssize_t gridOffset = args->workOffset;
+    ssize_t channelCount = args->workCount;
    const ssize_t size = args->count;
+    int nelem;
+    int chunk;

 #if defined(ENABLE_NPKIT)
-    int npKitCtxIdx = bid;
+    int npKitCtxIdx = gridOffset / channelCount;
 #endif

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
@@ -74,34 +77,21 @@ namespace {
    }
 #endif

-    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-      ssize_t realChunkSize;
-      if (Proto::Id == NCCL_PROTO_SIMPLE) {
-        realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*nranks));
-        realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-      }
-      else
-        realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*nranks*minChunkSize)*minChunkSize);
-      realChunkSize = int(realChunkSize);
+    for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
+      ssize_t remCount = channelCount - elemOffset;
+      ssize_t chunkOffset;
+
+      if (remCount < loopCount) chunkCount = args->lastChunkCount;

-      auto calcOffset = [&]__device__(int chunk)->ssize_t {
-        if (Proto::Id == NCCL_PROTO_SIMPLE)
-          return gridOffset + bid*nranks*realChunkSize + chunk*realChunkSize;
-        else
-          return gridOffset + (chunk*nChannels + bid)*realChunkSize;
-      };
      auto modRanks = [&]__device__(int r)->int {
        return r - (r >= nranks ? nranks : 0);
      };

-      ssize_t offset;
-      int nelem;
-      int chunk;
-
      // step 0: push data to next GPU
-      chunk = modRanks(ringIx + nranks-1);
-      offset = calcOffset(chunk);
-      nelem = min(realChunkSize, size-offset);
+      chunk = modRanks(ringIx + nranks - 1);
+      chunkOffset = chunk * chunkCount;
+      offset = gridOffset + elemOffset + chunkOffset;
+      nelem = (int)min(chunkCount, remCount - chunkOffset);

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY)
      if (tid == 0) {
@@ -130,10 +120,11 @@ namespace {
      }
 #endif

-      for (int j=2; j<nranks; ++j) {
-        chunk = modRanks(ringIx + nranks-j);
-        offset = calcOffset(chunk);
-        nelem = min(realChunkSize, size-offset);
+      for (int j = 2; j < nranks; ++j) {
+        chunk = modRanks(ringIx + nranks - j);
+        chunkOffset = chunk * chunkCount;
+        offset = gridOffset + elemOffset + chunkOffset;
+        nelem = (int)min(chunkCount, remCount - chunkOffset);
        prims.recvReduceSend(offset, nelem);
      }

@@ -147,8 +138,9 @@ namespace {
      // step k-1: reduce this buffer and data, which will produce the final
      // result that we store in this data and push to the next GPU
      chunk = ringIx + 0;
-      offset = calcOffset(chunk);
-      nelem = min(realChunkSize, size-offset);
+      chunkOffset = chunk * chunkCount;
+      offset = gridOffset + elemOffset + chunkOffset;
+      nelem = (int)min(chunkCount, remCount - chunkOffset);

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY)
      if (tid == 0) {
@@ -176,10 +168,11 @@ namespace {
 #endif

      // k-2 steps: copy to next GPU
-      for (int j=1; j<nranks-1; ++j) {
-        chunk = modRanks(ringIx + nranks-j);
-        offset = calcOffset(chunk);
-        nelem = min(realChunkSize, size-offset);
+      for (int j = 1; j < nranks - 1; ++j) {
+        chunk = modRanks(ringIx + nranks - j);
+        chunkOffset = chunk * chunkCount;
+        offset = gridOffset + elemOffset + chunkOffset;
+        nelem = (int)min(chunkCount, remCount - chunkOffset);
        prims.directRecvCopySend(offset, nelem);
      }

@@ -200,8 +193,9 @@ namespace {

      // Make final copy from buffer to dest.
      chunk = modRanks(ringIx + 1);
-      offset = calcOffset(chunk);
-      nelem = min(realChunkSize, size-offset);
+      chunkOffset = chunk * chunkCount;
+      offset = gridOffset + elemOffset + chunkOffset;
+      nelem = (int)min(chunkCount, remCount - chunkOffset);
      prims.directRecv(offset, nelem);

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT)
@@ -229,21 +223,17 @@ namespace {
  __device__ __attribute__((noinline)) void runTreeUpDown(ncclWorkElem *args) {
 #endif
    const int tid = threadIdx.x;
-    const int nthreads = args->nWarps*WARP_SIZE;
-    const int bid = args->bid;
-    const int nChannels = args->nChannels;
+    const int nthreads = (int)args->nWarps * WARP_SIZE;
    ncclTree *tree = &ncclShmem.channel.tree;
-    ssize_t chunkSize = int(
-      Proto::Id == NCCL_PROTO_SIMPLE ? args->lastChunkSize
-                   /* LL & LL128 */  : Proto::calcBytePerStep()/sizeof(T));
-    const ssize_t minChunkSize = int(
-      Proto::Id == NCCL_PROTO_SIMPLE ? nthreads*8*(sizeof(uint64_t)/sizeof(T))
-                   /* LL & LL128 */  : nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
-    const ssize_t loopSize = int(nChannels*chunkSize);
+    const size_t channelCount = args->workCount;
+    const size_t gridOffset = args->workOffset;
+    const size_t chunkCount = args->chunkCount;
    const ssize_t size = args->count;
+    size_t offset;
+    int nelem;

 #if defined(ENABLE_NPKIT)
-    int npKitCtxIdx = bid;
+    int npKitCtxIdx = gridOffset / channelCount;
 #endif

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
@@ -268,9 +258,6 @@ namespace {
    }
 #endif

-    if (loopSize > size)
-      chunkSize = divUp((int)size, int(nChannels*minChunkSize))*int(minChunkSize);
-
    { // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DEV_ARITY, 1>, /*Direct=*/0, Proto, 0> prims
        (tid, nthreads, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg);
@@ -290,23 +277,23 @@ namespace {
 #endif

      if (tree->up == -1) {
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + bid*int(chunkSize);
-          int nelem = min(chunkSize, size-offset);
+        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
+          offset = gridOffset + elemOffset;
+          nelem = min(chunkCount, channelCount - elemOffset);
          prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true);
        }
      }
      else if (tree->down[0] == -1) {
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + bid*int(chunkSize);
-          int nelem = min(chunkSize, size-offset);
+        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
+          offset = gridOffset + elemOffset;
+          nelem = min(chunkCount, channelCount - elemOffset);
          prims.send(offset, nelem);
        }
      }
      else {
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + bid*int(chunkSize);
-          int nelem = min(chunkSize, size-offset);
+        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
+          offset = gridOffset + elemOffset;
+          nelem = min(chunkCount, channelCount - elemOffset);
          prims.recvReduceSend(offset, nelem);
        }
      }
@@ -339,23 +326,23 @@ namespace {
 #endif

      if (tree->up == -1) {
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + bid*int(chunkSize);
-          int nelem = min(chunkSize, size-offset);
+        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
+          offset = gridOffset + elemOffset;
+          nelem = min(chunkCount, channelCount - elemOffset);
          prims.directSendFromOutput(offset, nelem);
        }
      }
      else if (tree->down[0] == -1) {
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + bid*int(chunkSize);
-          int nelem = min(chunkSize, size-offset);
+        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
+          offset = gridOffset + elemOffset;
+          nelem = min(chunkCount, channelCount - elemOffset);
          prims.directRecv(offset, nelem);
        }
      }
      else {
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + bid*int(chunkSize);
-          int nelem = min(chunkSize, size-offset);
+        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
+          offset = gridOffset + elemOffset;
+          nelem = min(chunkCount, channelCount - elemOffset);
          prims.directRecvCopySend(offset, nelem);
        }
      }
@@ -385,19 +372,16 @@ namespace {
  __device__ __attribute__((noinline)) void runTreeSplit(ncclWorkElem *args) {
 #endif
    const int tid = threadIdx.x;
-    const int nthreads = args->nWarps*WARP_SIZE;
-    const int bid = args->bid;
-    const int nChannels = args->nChannels;
+    const int nthreads = (int)args->nWarps * WARP_SIZE;
    ncclTree *tree = &ncclShmem.channel.tree;
-    ssize_t chunkSize = int(
-      Proto::Id != NCCL_PROTO_LL ? args->lastChunkSize
-                                 : Proto::calcBytePerStep()/sizeof(T));
-    const ssize_t minChunkSize = int(
-      Proto::Id == NCCL_PROTO_SIMPLE ? nthreads*8*(sizeof(uint64_t)/sizeof(T)) :
-      Proto::Id == NCCL_PROTO_LL     ? nthreads*(Proto::calcBytePerGrain()/sizeof(T))
-                   /* LL128 */       : nthreads*(Proto::calcBytePerGrain()/sizeof(T))/8);
-    const ssize_t loopSize = int(nChannels*chunkSize);
+    const size_t chunkCount = args->chunkCount;
+    const size_t gridOffset = args->workOffset;
+    const size_t channelCount = args->workCount;
    const ssize_t size = args->count;
+    const int bid = gridOffset / channelCount;
+    size_t offset;
+    int nelem;
+
    int nthreadsSplit;
    if (Proto::Id == NCCL_PROTO_SIMPLE) {
      nthreadsSplit = nthreads/2;
@@ -442,9 +426,6 @@ namespace {
    }
 #endif

-    if (loopSize > size)
-      chunkSize = divUp((int)size, nChannels*int(minChunkSize))*int(minChunkSize);
-
    if (tree->up == -1) {
      // Reduce and broadcast. Max number of recv is 2, max number of send is 2
      Primitives<T, RedOp, FanSymmetric<NCCL_MAX_DEV_ARITY>, /*Direct=*/0, Proto, 0>
@@ -464,9 +445,9 @@ namespace {
      }
 #endif

-      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + bid*int(chunkSize);
-        int nelem = min(chunkSize, size-offset);
+      for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
+        offset = gridOffset + elemOffset;
+        nelem = min(chunkCount, channelCount - elemOffset);
        prims.directRecvReduceCopySend(offset, offset, nelem, /*doPost=*/true);
      }

@@ -505,16 +486,16 @@ namespace {
 #endif

      if (tree->down[0] == -1) {
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + bid*int(chunkSize);
-          int nelem = min(chunkSize, size-offset);
+        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
+          offset = gridOffset + elemOffset;
+          nelem = min(chunkCount, channelCount - elemOffset);
          prims.send(offset, nelem);
        }
      }
      else {
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + bid*int(chunkSize);
-          int nelem = min(chunkSize, size-offset);
+        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
+          offset = gridOffset + elemOffset;
+          nelem = min(chunkCount, channelCount - elemOffset);
          prims.recvReduceSend(offset, nelem);
        }
      }
@@ -548,16 +529,16 @@ namespace {
 #endif

      if (tree->down[0] == -1) {
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + bid*int(chunkSize);
-          int nelem = min(chunkSize, size-offset);
+        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
+          offset = gridOffset + elemOffset;
+          nelem = min(chunkCount, channelCount - elemOffset);
          prims.directRecv(offset, nelem);
        }
      }
      else {
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + bid*int(chunkSize);
-          int nelem = min(chunkSize, size-offset);
+        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
+          offset = gridOffset + elemOffset;
+          nelem = min(chunkCount, channelCount - elemOffset);
          prims.directRecvCopySend(offset, nelem);
        }
      }
@@ -604,7 +585,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
    const int bid = args->bid;
    const int nChannels = args->nChannels;
    struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
-    const ssize_t chunkSize = int(args->lastChunkSize);
+    const ssize_t chunkSize = args->chunkCount;
    const ssize_t size = args->count;
    const ssize_t loopSize = nChannels*direct->nHeads*chunkSize;

@@ -700,14 +681,10 @@ template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
  __device__ __forceinline__ void run(ncclWorkElem *args) {
    const int tid = threadIdx.x;
-    const int bid = args->bid;
-    const int nChannels = args->nChannels;
    struct ncclNvls* nvls = &ncclShmem.channel.nvls;
-    const ssize_t chunkSize = int(args->lastChunkSize);
-    const ssize_t size = args->count;
-    const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize;
-    const int nranks = ncclShmem.comm.nRanks;
+    ssize_t chunkSize = args->chunkCount;
    const bool hasOut = nvls->out != -1;
+    const int nranks = ncclShmem.comm.nRanks;
    const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE;
    const int bcastWarps = hasOut ? (args->regUsed ? ((totalWarps - 2) >> 1) - 1 : 2) : 0;
    const int reduceWarps = args->regUsed ? (totalWarps - bcastWarps - 2) : (hasOut ? 3 : nranks <= 6 ? 7 : 5);
@@ -723,62 +700,114 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
    const int tidEndReduce = tidEndGather + nThreadsReduce;
    const int tidEndBcast = tidEndReduce + nThreadsBcast;

-    if (tid < tidEndScatter) {
-      // Scatter
-      using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
-      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
-        prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
-          args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
-      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
-        int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
-        prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
-      }
-    } else if (tid < tidEndGather) {
-      // Gather
-      using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
-      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
-        prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
-          args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
-      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
-        int nelem = args->regUsed ? 0 :min(nvls->nHeads * chunkSize, size - offset);
-        prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
-      }
-    } else if (tid < tidEndReduce && nvls->headRank != -1) {
-      if (!hasOut) {
+    if (args->oneNode) {
+      const ssize_t loopCount = nvls->nHeads * chunkSize;
+      const ssize_t channelCount = args->workCount;
+      const ssize_t gridOffset = args->workOffset;
+      ssize_t offset;
+      int nelem;
+
+      if (tid < tidEndScatter) {
+        // Scatter
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
+        Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
+          prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
+            args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
+        for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
+          if (channelCount - elemOffset < loopCount) chunkSize = args->lastChunkCount;
+          offset = gridOffset + elemOffset;
+          nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
+          prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
+        }
+      } else if (tid < tidEndGather) {
+        // Gather
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
+        Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
+          prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
+            args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
+        for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
+          if (channelCount - elemOffset < loopCount) chunkSize = args->lastChunkCount;
+          offset = gridOffset + elemOffset;
+          nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
+          prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
+        }
+      } else if (tid < tidEndReduce) {
        // Reduce, broadcast through NVLS
        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
          prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
            args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
-          int nelem = min(chunkSize, size - offset);
-          prims.directRecvDirectSend(offset, offset, nelem);
-        }
-      } else {
-        // Reduce, send to network
-        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
-        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
-          prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL,
-            args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, args);
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
-          int nelem = min(chunkSize, size - offset);
+        for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
+          ssize_t chunkOffset;
+          if (channelCount - elemOffset < loopCount) chunkSize = args->lastChunkCount;
+          chunkOffset = elemOffset + nvls->headRank * chunkSize;
+          offset = gridOffset + chunkOffset;
+          nelem = min(chunkSize, channelCount - chunkOffset);
          prims.directRecvDirectSend(offset, offset, nelem);
        }
      }
-    } else if (tid < tidEndBcast && nvls->headRank != -1) {
-      // Recv from network, broadcast
-      using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
-      Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
-        prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL,
-          args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
-      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
-        int nelem = min(chunkSize, size - offset);
-        prims.directRecvDirectSend(offset, offset, nelem);
+    } else {
+      const int bid = args->bid;
+      const ssize_t loopSize = args->nChannels * nvls->nHeads * chunkSize;
+      const ssize_t size = args->count;
+
+      if (tid < tidEndScatter) {
+        // Scatter
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
+        Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
+          prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
+            args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
+          int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
+          prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
+        }
+      } else if (tid < tidEndGather) {
+        // Gather
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
+        Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
+          prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
+            args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
+          int nelem = args->regUsed ? 0 :min(nvls->nHeads * chunkSize, size - offset);
+          prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
+        }
+      } else if (tid < tidEndReduce && nvls->headRank != -1) {
+        if (!hasOut) {
+          // Reduce, broadcast through NVLS
+          using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
+          Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
+            prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
+              args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
+          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+            ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
+            int nelem = min(chunkSize, size - offset);
+            prims.directRecvDirectSend(offset, offset, nelem);
+          }
+        } else {
+          // Reduce, send to network
+          using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
+          Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
+            prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL,
+              args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, args);
+          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+            ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
+            int nelem = min(chunkSize, size - offset);
+            prims.directRecvDirectSend(offset, offset, nelem);
+          }
+        }
+      } else if (tid < tidEndBcast && nvls->headRank != -1) {
+        // Recv from network, broadcast
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
+        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
+          prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL,
+            args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
+          int nelem = min(chunkSize, size - offset);
+          prims.directRecvDirectSend(offset, offset, nelem);
+        }
      }
    }
  }
@@ -788,14 +817,13 @@ template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PROTO_SIMPLE> {
  __device__ __forceinline__ void run(ncclWorkElem *args) {
    const int tid = threadIdx.x;
-    const int bid = args->bid;
-    const int nChannels = args->nChannels;
    struct ncclNvls* nvls = &ncclShmem.channel.nvls;
    const int treeUp = nvls->treeUp;
    const int* treeDown = nvls->treeDown;
-    const ssize_t chunkSize = int(args->lastChunkSize);
-    const ssize_t size = args->count;
-    const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize;
+    ssize_t chunkCount = args->chunkCount;
+    const ssize_t loopCount = nvls->nHeads * chunkCount;
+    const ssize_t channelCount = args->workCount;
+    const ssize_t gridOffset = args->workOffset;
    const int nranks = ncclShmem.comm.nRanks;
    const bool hasUp = treeUp != -1;
    const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE;
@@ -803,6 +831,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
    const int reduceWarps = args->regUsed ? (totalWarps - bcastWarps - 2) : (hasUp ? 5 : nranks <= 6 ? 7 : 5);
    const int scatterWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1;
    const int gatherWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1;
+    ssize_t offset;
+    int nelem;

    const int nThreadsScatter = scatterWarps*WARP_SIZE;
    const int nThreadsGather  = gatherWarps*WARP_SIZE;
@@ -819,10 +849,11 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
        prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
          args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
-      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
-        int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
-        prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
+      for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
+        if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
+        offset = gridOffset + elemOffset;
+        nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
+        prims.scatter(offset, nelem, chunkCount, chunkCount, -1, 0);
      }
    } else if (tid < tidEndGather) {
      // Gather
@@ -830,10 +861,11 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
        prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
          args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
-      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
-        int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
-        prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
+      for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
+        if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
+        offset = gridOffset + elemOffset;
+        nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
+        prims.gather(offset, nelem, chunkCount, chunkCount, -1, 0);
      }
    } else if (tid < tidEndReduce && nvls->headRank != -1) {
      if (!hasUp) {
@@ -842,9 +874,12 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
        Primitives<T, RedOp, FanSymmetric<3>, /*Direct=*/1, Proto, 0>
          prims(tid - tidEndGather, nThreadsReduce, treeDown, treeDown, NULL, NULL,
            args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
-          int nelem = min(chunkSize, size - offset);
+        for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
+          ssize_t chunkOffset;
+          if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
+          chunkOffset = elemOffset + nvls->headRank * chunkCount;
+          offset = gridOffset + chunkOffset;
+          nelem = min(chunkCount, channelCount - chunkOffset);
          prims.directRecvDirectSend(offset, offset, nelem);
        }
      } else {
@@ -853,9 +888,12 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
        Primitives<T, RedOp, FanAsymmetric<3, 1>, /*Direct=*/1, Proto, 0>
          prims(tid - tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL,
            args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
-          int nelem = min(chunkSize, size - offset);
+        for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
+          ssize_t chunkOffset;
+          if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
+          chunkOffset = elemOffset + nvls->headRank * chunkCount;
+          offset = gridOffset + chunkOffset;
+          nelem = min(chunkCount, channelCount - chunkOffset);
          prims.directRecvDirectSend(offset, offset, nelem);
        }
      }
@@ -865,9 +903,12 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
      Primitives<T, RedOp, FanAsymmetric<1, 3>, /*Direct=*/1, Proto, 0>
        prims(tid - tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL,
          args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
-      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
-        int nelem = min(chunkSize, size - offset);
+      for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
+        ssize_t chunkOffset;
+        if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
+        chunkOffset = elemOffset + nvls->headRank * chunkCount;
+        offset = gridOffset + chunkOffset;
+        nelem = min(chunkCount, channelCount - chunkOffset);
        prims.directRecvDirectSend(offset, offset, nelem);
      }
    }
@@ -882,7 +923,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
    const int bid = args->bid;
    const int nChannels = args->nChannels;
    ncclTree *tree = &ncclShmem.channel.collnetChain;
-    ssize_t chunkSize = int(args->lastChunkSize);
+    ssize_t chunkSize = args->chunkCount;
    const ssize_t loopSize = int(nChannels*chunkSize);
    const int nranks = ncclShmem.comm.nRanks;
    const ssize_t size = args->count;
@@ -992,4 +1033,4 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL
    runTreeSplit<T, RedOp, ProtoLL128>(args);
    //LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, RedOp, T, args);
  }
-};
+};
@@ -30,7 +30,7 @@ namespace {
    const ssize_t chunk_offset = elem_size * (num_elems / num_chunks * chunk_id + (chunk_id < num_padding_chunks ? chunk_id : num_padding_chunks));
    const ssize_t chunk_size = elem_size * (num_elems / num_chunks + (chunk_id < num_padding_chunks ? 1 : 0));
    const int pivot_direction = (bid % num_uni_rings) / num_bi_rings;
-    const ssize_t prims_size = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLTOALL_PIVOT_CHUNKSTEPS : 1));
+    const ssize_t prims_size = args->chunkCount;

    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
      (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, /*redOpArg(ignored)=*/0);
@@ -39,10 +39,10 @@ namespace {
      const int src_rank = ring->userRanks[(nranks - num_hops) % nranks];
      const int dst_rank = ring->userRanks[num_hops];
      const ssize_t send_offset =
-          dst_rank * num_elems * elem_size + chunk_offset +
+          dst_rank * args->count + chunk_offset +
          (src_rank == dst_rank ? pivot_direction * chunk_size / 2 : 0);
      const ssize_t recv_offset =
-          src_rank * num_elems * elem_size + chunk_offset +
+          src_rank * args->count + chunk_offset +
          (src_rank == dst_rank ? pivot_direction * chunk_size / 2 : 0);
      const ssize_t send_recv_size =
          src_rank == dst_rank ?
@@ -16,20 +16,19 @@ namespace {
  __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
 #endif
    const int tid = threadIdx.x;
-    const int nthreads = args->nWarps*WARP_SIZE;
-    const int bid = args->bid;
-    const int nChannels = args->nChannels;
+    const int nthreads = (int)args->nWarps * WARP_SIZE;
    ncclRing *ring = &ncclShmem.channel.ring;
-    const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? BROADCAST_CHUNKSTEPS : 1));
-    const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
-    const ssize_t loopSize = nChannels*chunkSize;
-    const ssize_t size = args->count;
    const int rank = ring->userRanks[0];
    const int nextRank = ring->userRanks[1];
    const int root = args->root;
+    const size_t chunkCount = args->chunkCount;
+    const size_t channelCount = args->workCount;
+    const size_t gridOffset = args->workOffset;
+    size_t offset;
+    int nelem;

 #if defined(ENABLE_NPKIT)
-    int npKitCtxIdx = bid;
+    int npKitCtxIdx = gridOffset / channelCount;
 #endif

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
@@ -58,20 +57,9 @@ namespace {
    }
 #endif

-    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-      ssize_t realChunkSize;
-      if (Proto::Id == NCCL_PROTO_SIMPLE) {
-        realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels));
-        realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-      }
-      else if (Proto::Id == NCCL_PROTO_LL)
-        realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
-      else if (Proto::Id == NCCL_PROTO_LL128)
-        realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128);
-      realChunkSize = int(realChunkSize);
-
-      ssize_t offset = gridOffset + int(bid*realChunkSize);
-      int nelem = min(realChunkSize, size-offset);
+    for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
+      offset = gridOffset + elemOffset;
+      nelem = min(chunkCount, channelCount - elemOffset);

      if (rank == root) {
        if (inputBuf == outputBuf) {
@@ -108,4 +96,4 @@ struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL
  __device__ __forceinline__ void run(ncclWorkElem *args) {
    runRing<T, RedOp, ProtoLL128>(args);
  }
-};
+};
@@ -114,6 +114,7 @@ struct ncclShmemGroup {
  union {
    unpackGroupShmem unpack;
  } devicePlugin;
+  int32_t dstSizes[NCCL_MAX_NVLS_ARITY+1];
 };

 #define LDS_NUM_EVENTS 64
@@ -30,11 +30,11 @@ inline __device__ int loadInt(int* ptr) {
 template<typename RedFn, typename T, int Unroll, int BytePerPack,
         int MultimemSrcs, int MinSrcs, int MaxSrcs,
         int MultimemDsts, int MinDsts, int MaxDsts, int PreOpSrcs,
-         typename IntBytes>
+         typename IntBytes, typename SrcPtrFn, typename DstPtrFn>
 __device__ __forceinline__ void reduceCopyPacks(
    int nThreads, int &thread,
    uint64_t redArg, uint64_t *preOpArgs, bool postOp,
-    int nSrcs, void **srcPtrs, int nDsts, void **dstPtrs,
+    int nSrcs, SrcPtrFn const &srcPtrFn, int nDsts, DstPtrFn const &dstPtrFn,
    IntBytes &nBytesBehind, IntBytes &nBytesAhead
  ) {
  static_assert(std::is_signed<IntBytes>::value, "IntBytes must be a signed integral type.");
@@ -68,10 +68,10 @@ __device__ __forceinline__ void reduceCopyPacks(
  uintptr_t minDsts[MinDsts + !MinDsts];
  #pragma unroll
  for (int s=0; s < MinSrcs; s++)
-    minSrcs[s] = cvta_to_global(srcPtrs[s]) + threadBytesBehind;
+    minSrcs[s] = cvta_to_global(srcPtrFn(s)) + threadBytesBehind;
  #pragma unroll
  for (int d=0; d < MinDsts; d++)
-    minDsts[d] = cvta_to_global(dstPtrs[d]) + threadBytesBehind;
+    minDsts[d] = cvta_to_global(dstPtrFn(d)) + threadBytesBehind;

  // We dictate loop termination condition according to whether partial hunks
  // can be handled or not.
@@ -116,7 +116,7 @@ __device__ __forceinline__ void reduceCopyPacks(
    }

    for (int s=MinSrcs; (MinSrcs < MaxSrcs) && (s < MaxSrcs) && (s < nSrcs); s++) {
-      uintptr_t src = cvta_to_global(srcPtrs[s]) + threadBytesBehind;
+      uintptr_t src = cvta_to_global(srcPtrFn(s)) + threadBytesBehind;
      BytePack<BytePerPack> tmp[Unroll];
      RedFn preFn(s < PreOpSrcs ? preOpArgs[s] : 0);
      #pragma unroll Unroll
@@ -151,7 +151,7 @@ __device__ __forceinline__ void reduceCopyPacks(
      }
    }
    for (int d=MinDsts; (MinDsts < MaxDsts) && (d < MaxDsts) && (d < nDsts); d++) {
-      uintptr_t dst = cvta_to_global(dstPtrs[d]) + threadBytesBehind;
+      uintptr_t dst = cvta_to_global(dstPtrFn(d)) + threadBytesBehind;
      #pragma unroll Unroll
      for (int u=0; u < Unroll; u++) {
        st_global<BytePerPack>(dst, acc[u]);
@@ -185,11 +185,11 @@ __device__ __forceinline__ void reduceCopyPacks(
 template<int Unroll, typename RedFn, typename T,
         int MultimemSrcs, int MinSrcs, int MaxSrcs,
         int MultimemDsts, int MinDsts, int MaxDsts, int PreOpSrcs,
-         typename IntBytes>
+         typename IntBytes, typename SrcPtrFn, typename DstPtrFn>
 __device__ __forceinline__ void reduceCopy(
    int thread, int nThreads,
    uint64_t redArg, uint64_t *preOpArgs, bool postOp,
-    int nSrcs, void **srcPtrs, int nDsts, void **dstPtrs,
+    int nSrcs, SrcPtrFn const &srcPtrFn, int nDsts, DstPtrFn const &dstPtrFn,
    IntBytes nElts
  ) {
  static_assert(MultimemSrcs <= MinSrcs && MultimemDsts <= MinDsts, "Multimem pointers cannot exceed respective Min values.");
@@ -200,6 +200,9 @@ __device__ __forceinline__ void reduceCopy(
  // is supported for this redfn/type.
  constexpr int BigPackSize = (MultimemSrcs == 0) ? 16 : LoadMultimem_BigPackSize<RedFn>::BigPackSize;

+  if (MaxDsts==0) return;
+  if (MinDsts==0 && nDsts==0) return;
+
  IntBytes nBytesBehind = 0;
  IntBytes nBytesAhead = nElts*sizeof(T);

@@ -210,27 +213,27 @@ __device__ __forceinline__ void reduceCopy(
  #endif
    // Check that all pointers are BigPackSize aligned.
    bool aligned = true;
-    if (lane < nSrcs) aligned &= 0 == cvta_to_global(srcPtrs[lane]) % (BigPackSize + !BigPackSize);
-    if (lane < nDsts) aligned &= 0 == cvta_to_global(dstPtrs[lane]) % (BigPackSize + !BigPackSize);
+    if (lane < nSrcs) aligned &= 0 == cvta_to_global(srcPtrFn(lane)) % (BigPackSize + !BigPackSize);
+    if (lane < nDsts) aligned &= 0 == cvta_to_global(dstPtrFn(lane)) % (BigPackSize + !BigPackSize);
    aligned = !(__any(!aligned));
    if (aligned) {
 #if defined(__gfx90a__)
      reduceCopyPacks<RedFn, T, ((MinSrcs > 1) ? 2 : Unroll), BigPackSize,
        MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
        (nThreads, thread, redArg, preOpArgs, postOp,
-         nSrcs, srcPtrs, nDsts, dstPtrs, nBytesBehind, nBytesAhead);
+         nSrcs, srcPtrFn, nDsts, dstPtrFn, nBytesBehind, nBytesAhead);
 #else
      reduceCopyPacks<RedFn, T, Unroll*((MinSrcs == 1 && MinDsts == 1) ? 2 : 1), BigPackSize,
        MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
        (nThreads, /*&*/thread, redArg, preOpArgs, postOp,
-         nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
+         nSrcs, srcPtrFn, nDsts, dstPtrFn, /*&*/nBytesBehind, /*&*/nBytesAhead);
 #endif
      if (nBytesAhead == 0) return;

      reduceCopyPacks<RedFn, T, /*Unroll=*/1, BigPackSize,
        MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
        (nThreads, /*&*/thread, redArg, preOpArgs, postOp,
-         nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
+         nSrcs, srcPtrFn, nDsts, dstPtrFn, /*&*/nBytesBehind, /*&*/nBytesAhead);
      if (nBytesAhead == 0) return;
    }
  }
@@ -240,25 +243,43 @@ __device__ __forceinline__ void reduceCopy(
    reduceCopyPacks<RedFn, T, Unroll/2*(16/sizeof(T))/2, sizeof(T),
    MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
    (nThreads, thread, redArg, preOpArgs, postOp,
-     nSrcs, srcPtrs, nDsts, dstPtrs, nBytesBehind, nBytesAhead);
+     nSrcs, srcPtrFn, nDsts, dstPtrFn, nBytesBehind, nBytesAhead);
  } else {
    reduceCopyPacks<RedFn, T, Unroll*(16/sizeof(T))/2, /*BytePerPack=*/sizeof(T),
    MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
    (nThreads, /*&*/thread, redArg, preOpArgs, postOp,
-     nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
+     nSrcs, srcPtrFn, nDsts, dstPtrFn, /*&*/nBytesBehind, /*&*/nBytesAhead);
  }
 #else
  reduceCopyPacks<RedFn, T, Unroll*(16/sizeof(T))/2, /*BytePerPack=*/sizeof(T),
    MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
    (nThreads, /*&*/thread, redArg, preOpArgs, postOp,
-     nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
+     nSrcs, srcPtrFn, nDsts, dstPtrFn, /*&*/nBytesBehind, /*&*/nBytesAhead);
 #endif
  if (nBytesAhead == 0) return;

  reduceCopyPacks<RedFn, T, /*Unroll=*/1, /*BytePerPack=*/sizeof(T),
    MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
    (nThreads, /*&*/thread, redArg, preOpArgs, postOp,
-     nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
+     nSrcs, srcPtrFn, nDsts, dstPtrFn, /*&*/nBytesBehind, /*&*/nBytesAhead);
 }

-#endif // COMMON_KERNEL_H_
+template<int Unroll, typename RedFn, typename T,
+         int MultimemSrcs, int MinSrcs, int MaxSrcs,
+         int MultimemDsts, int MinDsts, int MaxDsts, int PreOpSrcs,
+         typename IntBytes>
+__device__ __forceinline__ void reduceCopy(  // pointer-array overload: forwards to the functor-based reduceCopy
+    int thread, int nThreads,
+    uint64_t redArg, uint64_t *preOpArgs, bool postOp,
+    int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs,
+    IntBytes nElts
+  ) {
+  reduceCopy<Unroll, RedFn, T,  // SrcPtrFn/DstPtrFn are left to be deduced from the lambdas below
+             MultimemSrcs, MinSrcs, MaxSrcs,
+             MultimemDsts, MinDsts, MaxDsts, PreOpSrcs, IntBytes>
+    (thread, nThreads, redArg, preOpArgs, postOp,
+     nSrcs, [=]__device__(int i) { return srcPtrs[i]; },  // source i comes from srcPtrs[i]
+     nDsts, [=]__device__(int i) { return dstPtrs[i]; }, nElts);  // destination i comes from dstPtrs[i]
+}
+
+#endif // COMMON_KERNEL_H_
@@ -0,0 +1,405 @@
+#!/usr/bin/env python3
+import os
+import sys
+
+# Order of redops, tys, protos, algos must match src/include/device.h
+all_colls =  ["Broadcast","Reduce","AllGather","ReduceScatter","AllReduce","SendRecv"]
+all_redops = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"]
+all_tys =    ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16"]
+all_protos = ["LL","LL128","SIMPLE"]
+all_algos =  ["TREE","RING","COLLNET_DIRECT","COLLNET_CHAIN","NVLS","NVLS_TREE"]
+
+################################################################################
+# The first command line argument is the path to the directory to generate and
+# populate.
+
+gensrc = sys.argv[1]
+# Start from a clean directory: delete whatever is already there, or create it.
+if os.path.exists(gensrc):
+  for name in os.listdir(gensrc):
+    os.remove(os.path.join(gensrc, name))
+    #os.truncate(os.path.join(gensrc, name), 0)
+else:
+  os.mkdir(gensrc)
+
+################################################################################
+# The second command line argument is used as a regex to filter the functions
+# which make it into libnccl. This is helpful for reducing the binary when
+# developing device code. The regex supports non-space containing globs '*',
+# parentheses '(x)', and union 'a|b'. The string representing the function has
+# one of the forms:
+#
+# SendRecv
+# (AllGather|Broadcast) <algo> <proto>
+# (AllReduce|Reduce|ReduceScatter) <redop> <type> <algo> <proto>
+#
+# The possible values for redop, type, algo, proto can be found in the all_<foo>
+# lists at the top of this file.
+#
+# Since the Makefile forwards this from the ONLY_FUNCS variable, useful command
+# line examples are given:
+"""
+# Only send/recv:
+make ONLY_FUNCS="SendRecv"
+
+# Only non-reductions:
+make ONLY_FUNCS="AllGather * *|Broadcast * *|SendRecv"
+
+# Only AllReduce sum f32 (but all algos, protos)
+make ONLY_FUNCS="AllReduce Sum f32 * *"
+
+# Only AllReduce minmax i32 NVLS (but all protos)
+make ONLY_FUNCS="AllReduce MinMax i32 NVLS *"
+
+# AllReduce sum f32 RING LL128
+make ONLY_FUNCS="AllReduce Sum f32 RING LL128"
+"""
+
+# Join every argument that is not None into one string, separated by `sep`.
+def paste(sep, *args):
+  return sep.join([piece for piece in args if piece is not None])
+
+func_pattern = sys.argv[2:3]  # optional second argument; empty list when absent
+if func_pattern and func_pattern[0]:
+  import re
+  func_pattern = func_pattern[0]
+  func_pattern = func_pattern.replace("*", "[^ ]*")  # glob '*' = any run of non-space characters
+  func_pattern = "(?:%s)$" % func_pattern  # group first so '$' anchors every 'a|b' alternative, not only the last
+  def func_filter(*fn):
+    return None is not re.match(func_pattern, paste(" ", *fn), flags=re.IGNORECASE)
+else:
+  def func_filter(coll, redop, ty, algo, proto):
+    return True
+
+################################################################################
+
+algos_of_coll = {  # algorithms enumerated for each collective
+  "AllGather":     ["RING","COLLNET_DIRECT","NVLS"],
+  "AllReduce":     all_algos,
+  "Broadcast":     ["RING"],
+  "Reduce":        ["RING"],
+  "ReduceScatter": ["RING","COLLNET_DIRECT","NVLS"],
+  "SendRecv":      [None]
+}
+# CamelCase collective name -> snake_case stem used in generated file and header names.
+coll_camel_to_lower = {
+  "AllGather":     "all_gather",
+  "AllReduce":     "all_reduce",
+  "Broadcast":     "broadcast",
+  "Reduce":        "reduce",
+  "ReduceScatter": "reduce_scatter",
+  "SendRecv":      "sendrecv"
+}
+coll_lower_to_camel = {coll_camel_to_lower[x]: x for x in coll_camel_to_lower}  # inverse mapping
+
+################################################################################
+
+# Returns pair of minimum required values for (CUDART_VERSION, __CUDA_ARCH__)
+# or None if function is never supported. Note that (0, 0) encodes universal
+# support.
+def required_cuda(coll, redop, ty, algo, proto):
+  cudart, arch = 0, 0
+  # kernels mapped to by coll="Nop" functions have coll="Generic"
+  if coll in ("SendRecv", "Generic", "Nop"): return (cudart, arch)
+  # The LL and LL128 protocols are only generated for the RING and TREE algorithms.
+  if proto!="SIMPLE" and algo not in ("RING","TREE"): return None
+  # Reductions: SumPostDiv is integer-only; bf16 raises the CUDART_VERSION floor to 11000.
+  if coll in ("AllReduce","Reduce","ReduceScatter"):
+    if redop=="SumPostDiv" and ty[0] not in ("i","u"): return None
+    if ty=="bf16": cudart = max(cudart, 11000)
+  # NVLS algorithms need CUDART_VERSION >= 12010 and __CUDA_ARCH__ >= 900.
+  if "NVLS" in algo:
+    if coll in ("AllReduce","Reduce","ReduceScatter"):
+      # Must match ncclNvlsSupported() in src/include/device.h
+      nvls_ok = ((ty in ("i32","u32","i64","u64") and redop in ("Sum","MinMax")) or
+                 (ty in ("f32","f64") and redop=="Sum") or
+                 (ty in ("f16","bf16") and redop in ("Sum","MinMax")))
+      if not nvls_ok: return None
+    cudart = max(cudart, 12010)
+    arch = max(arch, 900)
+
+  return (cudart, arch)
+
+# Collapse a function onto the representative of its equivalence class, e.g.
+# (sum, signed int) is implemented by the (sum, unsigned int) function.
+def equivalent_primary(coll, redop, ty, algo, proto):
+  if coll in ("AllReduce", "Reduce", "ReduceScatter"):
+    # Signed integer sum/prod/premulsum always share the unsigned function;
+    # signed integer min/max shares it only outside the NVLS algorithms.
+    # Operand order below keeps the original short-circuit behaviour.
+    any_algo = redop in ("Sum","Prod","PreMulSum")
+    if (any_algo or redop=="MinMax") and ty[0]=="i" and (any_algo or "NVLS" not in algo):
+      ty = "u"+ty[1:]
+  return (coll, redop, ty, algo, proto)
+
+# Pick the kernel that should run a given func. Each distinct tuple returned
+# here instantiates one ncclDevKernel specialized to run that func without
+# function call overhead.
+def best_kernel(coll, redop, ty, algo, proto):
+  generic = ("Generic", None, None, None, None)
+  # Modify this selection to control how many kernels are specialized.
+  if coll=="Nop": candidate = generic
+  elif coll=="SendRecv": candidate = ("SendRecv", None, None, None, None)
+  elif coll in ("AllGather","Broadcast"): candidate = (coll, None, None, "RING", "LL")
+  else: candidate = (coll, "Sum", ty, ("TREE" if algo=="TREE" else "RING"), "LL")
+  # The kernel must be specialized for a primary function ...
+  kfn = equivalent_primary(*candidate)
+  # ... and must not have been filtered out.
+  if func_filter(*kfn): return kfn
+  return generic
+
+# The order in which rows are enumerated must match the formula of `ncclDevFuncId()`:
+def enumerate_func_rows():
+  yield ("SendRecv", None, None, None, None)
+  # AllGather and Broadcast rows carry no redop or type.
+  for coll in ("AllGather", "Broadcast"):
+    for algo in algos_of_coll[coll]:
+      for proto in all_protos:
+        yield (coll, None, None, algo, proto)
+  # Reductions: proto varies fastest, then algo, then type, then redop.
+  for coll in ("AllReduce", "Reduce", "ReduceScatter"):
+    for redop in all_redops:
+      for ty in all_tys:
+        for algo in algos_of_coll[coll]:
+          for proto in all_protos:
+            yield (coll, redop, ty, algo, proto)
+
+################################################################################
+
+def is_built(coll, redop, ty, algo, proto):
+  fn = (coll, redop, ty, algo, proto)
+  # None when the func is never supported, otherwise the filter's verdict.
+  return required_cuda(*fn) and func_filter(*fn)
+
+# Returns None when required_cuda(...) says the func is never supported, the
+# coll="Nop" function when the developer's filter excludes it, and otherwise
+# the func exactly as it was given.
+def validate(coll, redop, ty, algo, proto):
+  fn = (coll, redop, ty, algo, proto)
+  if not required_cuda(*fn):
+    return None
+  if func_filter(*fn): return fn
+  return ("Nop", None, None, None, None)
+
+# One entry per enumerated row; corresponds to ncclDevFuncRowToId[]
+func_rows = [validate(*row) for row in enumerate_func_rows()]
+
+# Distinct primaries in sorted order; corresponds to ncclDevFuncTable[]
+primary_funcs = sorted({equivalent_primary(*row) for row in func_rows if row is not None})
+
+# Inverse lookup: primary_to_index[primary_funcs[i]] == i
+primary_to_index = {fn: i for (i, fn) in enumerate(primary_funcs)}
+# Distinct kernels chosen by best_kernel over all primary funcs.
+kernel_funcs = sorted({best_kernel(*fn) for fn in primary_funcs})
+
+################################################################################
+
+# Generate <gensrc>/device_table.cu
+with open(os.path.join(gensrc, "device_table.cu"), "w") as f:
+  out = f.write
+  out('#include "common.h"\n')
+  out("\n")
+  # Forward-declare every primary func, guarded by its CUDA runtime/arch requirement.
+  for fn in primary_funcs:
+    sym = paste("_", "ncclDevFunc", *fn)
+    cudart, arch = required_cuda(*fn)
+    if (cudart, arch) != (0, 0):
+      out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch))
+    out("__device__ void %s();\n" % sym)
+    if (cudart, arch) != (0, 0):
+      out("#endif\n")
+  out("\n")
+  # Emit the table; a guarded func gets a nullptr entry in its slot when the guard fails.
+  out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable[] = {\n");
+  index = 0
+  for fn in primary_funcs:
+    sym = paste("_", "ncclDevFunc", *fn)
+    cudart, arch = required_cuda(*fn)
+    if (cudart, arch) != (0, 0):
+      out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart ,arch))
+    out("/*%4d*/ %s,\n" % (index, sym))
+    if (cudart, arch) != (0, 0):
+      out("#else\n" "/*%4d*/ nullptr,\n" "#endif\n" % index)
+    index += 1
+  out("nullptr};\n")
+  out("\n")
+
+  out("// Workaround for https://reviews.llvm.org/D55580\n"
+      "__device__ void ncclWorkaroundClangD55580() {}\n")
+
+# Generate <gensrc>/host_table.cc
+with open(os.path.join(gensrc, "host_table.cc"), "w") as f:
+  out = f.write
+  out('#include "device.h"\n')
+  out("\n")
+
+  # Maps each function row to its primary function id; rows with no function get -1.
+  out("extern int const ncclDevFuncRowToId[] = {\n")
+  index = 0
+  for fn in func_rows:
+    fn_id, comment = -1, ""
+    if fn is not None:
+      fn_id = primary_to_index[equivalent_primary(*fn)]
+      comment = " // " + paste(" ", *fn)
+    out("/*%4d*/ %d,%s\n" % (index, fn_id, comment))
+    index += 1
+  out("-1};\n")
+  out("\n")
+
+  # Forward declarations of kernels, guarded by the CUDA runtime version each one needs.
+  for kfn in kernel_funcs:
+    cudart, _ = required_cuda(*kfn)
+    sym = paste("_", "ncclDevKernel", *kfn)
+    if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart)
+    out("__global__ void %s(struct ncclDevComm*, uint64_t, struct ncclWork*);\n" % sym)
+    if cudart != 0: out("#endif\n")
+  out("\n")
+
+  # List of all kernel function pointers (nullptr where the version guard fails).
+  out("extern int const ncclDevKernelCount = %d;\n" % len(kernel_funcs))
+  out("extern void* const ncclDevKernelList[] = {\n")
+  index = 0
+  for kfn in kernel_funcs:
+    cudart, _ = required_cuda(*kfn)
+    sym = paste("_", "ncclDevKernel", *kfn)
+    if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart)
+    out("/*%4d*/ (void*)%s,\n" % (index, sym));
+    if cudart != 0: out("#else\n" "/*%4d*/ nullptr,\n" "#endif\n" % index)
+    index += 1
+  out("nullptr};\n")
+  out("\n")
+
+  # Maps primary id to the kernel function pointer chosen by best_kernel().
+  out("extern void* const ncclDevKernelForFunc[] = {\n")
+  index = 0
+  for fn in primary_funcs:
+    kfn = best_kernel(*fn)
+    sym = paste("_", "ncclDevKernel", *kfn)
+    cudart, _ = required_cuda(*kfn)
+    if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart)
+    out("/*%4d*/ (void*)%s,\n" % (index, sym))
+    if cudart != 0: out("#else\n" "/*%4d*/ nullptr,\n" "#endif\n" % index)
+    index += 1
+  out("nullptr};\n")
+  out("\n")
+
+  # Per primary id: 1 if the kernel in the prior map is specialized for exactly that func, else 0.
+  out("extern bool const ncclDevKernelForFuncIsSpecialized[] = {\n")
+  index = 0
+  for fn in primary_funcs:
+    kfn = best_kernel(*fn)
+    specialized = "1" if fn == kfn else "0"
+    out("/*%4d*/ %s,\n" % (index, specialized))
+    index += 1
+  out("0};\n")
+
+# Names the .cu file that implements a func. The one requirement is that the
+# name reflects "coll": formally, two funcs with different coll's never share
+# a filename.
+def impl_filename(coll, redop, ty, algo, proto):
+  return paste("_", coll_camel_to_lower[coll], redop and redop.lower(), ty) + ".cu"
+
+# Group funcs (or kernels) by the .cu file that implements them. The result
+# maps each filename to a (coll, [func tuples...]) pair.
+def partition_by_name(fns):
+  by_file = {}
+  for fn in fns:
+    filename = impl_filename(*fn)
+    # The first func seen for a file fixes the coll recorded for it.
+    entry = by_file.setdefault(filename, (fn[0], []))
+    members = entry[1]
+    members.append(fn)
+  return by_file
+
+name_to_funcs = partition_by_name(fn for fn in primary_funcs if fn[0]!="Nop")
+name_to_kernels = partition_by_name(kfn for kfn in kernel_funcs if kfn[0]!="Generic")
+
+# Generate <gensrc>/rules.mk: the LIB_OBJS_GEN object list plus one compile rule per implementation file.
+with open(os.path.join(gensrc, "rules.mk"), "w") as f:
+  out = f.write
+  impl_names = sorted(name_to_funcs.keys())
+  names = impl_names + ["host_table.cc", "device_table.cu"]
+  out("LIB_OBJS_GEN = $(patsubst %, $(OBJDIR)/genobj/%.o, {names})\n"
+      .format(names=" ".join(names)))
+  out("\n")
+
+  # For each <coll>_<op>_<ty>.cu compile to a .cu.o file. Notice the dependencies
+  # come from the suffix-erased file (e.g. 'gensrc/all_reduce.cu')
+  for name in impl_names:
+    coll = name_to_funcs[name][0]
+    out(
+      "$(OBJDIR)/genobj/{name}.o: $(OBJDIR)/gensrc $(OBJDIR)/genobj/{lower_coll}.cu.d\n"
+      "\t" "$(call COMPILE,$@,$(OBJDIR)/gensrc/{name})\n"
+      "\n"
+      .format(name=name, lower_coll=coll_camel_to_lower[coll])
+    )
+
+# Register the suffix-erased .cu's, which are used only for dependency scraping.
+for coll in {fn[0] for fn in primary_funcs if fn[0]!="Nop"}:
+  name = impl_filename(coll, None, None, None, None)
+  # setdefault leaves a file that already has funcs assigned untouched.
+  name_to_funcs.setdefault(name, (coll, []))
+
+redop_to_cxx = {  # redop name -> C++ functor name passed to the DEFINE_ macros
+  None: "FuncCopy",  # used when a func has no redop
+  "Sum": "FuncSum",
+  "Prod": "FuncProd",
+  "MinMax": "FuncMinMax",
+  "PreMulSum": "FuncPreMulSum",
+  "SumPostDiv": "FuncSumPostDiv"
+}
+
+ty_to_cxx = {  # type name -> C++ element type passed to the DEFINE_ macros
+  None: "int8_t",  # used when a func has no type
+  "i8": "int8_t",
+  "u8": "uint8_t",
+  "i32": "int32_t",
+  "u32": "uint32_t",
+  "i64": "int64_t",
+  "u64": "uint64_t",
+  "f16": "half",
+  "f32": "float",
+  "f64": "double",
+  "bf16": "__nv_bfloat16"
+}
+
+# Generate each <gensrc>/<impl>.cu: two includes, then the kernel and func definitions assigned to it.
+for name in name_to_funcs.keys():
+  (coll, fns) = name_to_funcs[name]
+  with open(os.path.join(gensrc, name), "w") as f:
+    out = f.write
+    out(
+      '#include "common.h"\n'
+      '#include "{lower_coll}.h"\n'
+      .format(lower_coll=coll_camel_to_lower[coll])
+    )
+    # Emit the specialized kernels assigned to this file, if any.
+    (_, kfns) = name_to_kernels.get(name) or (None, [])
+    for kfn in kfns:
+      (coll, redop, ty, algo, proto) = kfn
+      sym = paste("_", coll, redop, ty, algo, proto)
+      fn_id = primary_to_index[kfn]
+      cudart, arch = required_cuda(*kfn)
+      if (cudart, arch) != (0, 0):
+        out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch))
+      out(
+        "DEFINE_ncclDevKernel({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto}, {fn_id})\n"
+        .format(sym=sym, coll=coll, redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty],
+                algo=(algo or "RING"), proto=(proto or "SIMPLE"), fn_id=fn_id)
+      )
+      if (cudart, arch) != (0, 0):
+        out("#endif\n")
+    # Then one DEFINE_ncclDevFunc per primary func assigned to this file.
+    for fn in fns:
+      (coll, redop, ty, algo, proto) = fn
+      sym = paste("_", coll, redop, ty, algo, proto)
+      cudart, arch = required_cuda(*fn)
+      if (cudart, arch) != (0, 0):
+        out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch))
+      out(
+        "DEFINE_ncclDevFunc({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto})\n"
+        .format(sym=sym, coll=coll, redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty],
+                algo=(algo or "RING"), proto=(proto or "SIMPLE"))
+      )
+      if (cudart, arch) != (0, 0):
+        out("#endif\n")
@@ -44,7 +44,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
  uint64_t recvConnHead;

  struct ncclConnInfo* sendConn = NULL;
-  volatile int* sendConnFifoPtr = NULL;
+  volatile struct ncclConnFifo* sendConnFifo = NULL;
  volatile uint64_t* sendConnHeadPtr = NULL;
  uint64_t sendConnHead;
  uint64_t sendConnHeadCache; // Cache last seen value
@@ -114,10 +114,9 @@ private:
        sendConnHeadCache = atomicAdd((unsigned long long *)sendConnHeadPtr, 0);
        if (checkAbort(spins, 1)) break;
      }
-      __asm__ __volatile__("s_wakeup");
-      if (sendConnFifoPtr) {
+      if (sendConnFifo) {
        int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? stepLines*sizeof(union ncclLLFifoLine) : nbytes;
-        __atomic_store_n(sendConnFifoPtr+sendConnHead%NCCL_STEPS, (size), __ATOMIC_RELAXED);
+        sendConnFifo[sendConnHead%NCCL_STEPS].size = size;
      }
      sendConnHead += 1;
    }
@@ -586,7 +585,7 @@ private:
      sendConnHeadPtr = sendConn->head;
      sendConnHeadCache = *sendConnHeadPtr;
      sendConnHead = sendConn->step;
-      sendConnFifoPtr = sendConn->sizesFifo;
+      sendConnFifo = sendConn->connFifo;
    }
  }

@@ -594,7 +593,7 @@ private:
  __device__  Primitives(
      const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
      void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
-      uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, int stepSize_=0
+      uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, struct ncclWorkElemP2p* p2p = nullptr, int stepSize_=0
    ):
    redOp(redOpArg),
    tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group),
@@ -772,4 +771,4 @@ private:
  __device__ void localCopy(T* srcs, T* dsts, int eltN) {
    return mscclGenericOp<0,1,0,0>(&srcs, 1, &dsts, 1, eltN);
  }
-};
+};
@@ -42,7 +42,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
  uint64_t recvConnHead;

  struct ncclConnInfo* sendConn = NULL;
-  volatile int* sendConnFifoPtr = NULL;
+  volatile struct ncclConnFifo* sendConnFifo = NULL;
  volatile uint64_t* sendConnTailPtr = NULL;
  uint64_t sendConnTail;
  volatile uint64_t* sendConnHeadPtr = NULL;
@@ -102,9 +102,8 @@ private:
        sendConnHeadCache = __atomic_load_n(sendConnHeadPtr, __ATOMIC_RELAXED);
        if (checkAbort(spins, wid, 1)) break;
      }
-      __asm__ __volatile__("s_wakeup");
-      if (sendConnFifoPtr) {
-        __atomic_store_n(sendConnFifoPtr+sendStep[wid]%NCCL_STEPS, nbytes, __ATOMIC_RELAXED);
+      if (sendConnFifo) {
+        sendConnFifo[sendStep[wid]%NCCL_STEPS].size = nbytes;
      }
      sendConnHead += 1;
    }
@@ -487,10 +486,10 @@ private:
      sendConnHeadPtr = sendConn->head;
      sendConnHeadCache = *sendConnHeadPtr;
      sendConnHead = sendConn->step;
-      sendConnFifoPtr = sendConn->sizesFifo;
+      sendConnFifo = sendConn->connFifo;
    }
    if (tid >= nthreads-WARP_SIZE && wid<fan.nsend()) {
-      if (sendConn->sizesFifo) {
+      if (sendConn->connFifo) {
        sendConnTailPtr = sendConn->tail;
        sendConnTail = sendConn->step;
      }
@@ -581,4 +580,4 @@ public:
  __device__ void localCopy(T* srcs, T* dsts, int eltN) {
    return mscclGenericOp<0,1,0,0>(&srcs, 1, &dsts, 1, eltN);
  }
-};
+};
@@ -27,8 +27,8 @@ class Primitives<
                       RolePostSend = 0x10,
                       RolePostRecv = 0x20,
                       Aborted = 0x40,
-                       OffsFifoEnabled = 0x80,
-                       SizesFifoEnabled = 0x100,
+                       UserBufferMode = 0x80,
+                       ConnFifoEnabled = 0x100,
                       DirectWrite = 0x200,
                       DirectRead = 0x400,
                       ThreadsSynced = 0x800,
@@ -46,15 +46,12 @@ class Primitives<
  int flags;
  int group;
  uint64_t step;
-  int *connOffsFifoPtr;   // (flags & OffsFifoEnabled)
+  struct ncclConnFifo* connFifo = NULL;
  union {
    T *userBuff;            // (flags & (RoleInput|RoleOutput))
    T *connEltsFifo;        // !(flags & (RoleInput|RoleOutput))
  };
-  union {
-    int volatile *connSizesFifoPtr; //  (flags & SizesFifoEnabled)
-    T *directBuff;                  // !(flags & SizesFifoEnabled)
-  };
+  T *directBuff;
  uint64_t *connStepPtr;
  uint64_t connStepCache; // Cache last seen value of (*connStepPtr)
  uint64_t* barriers;
@@ -129,14 +126,16 @@ private:
    }

    if (flags & (Recv*RoleWaitRecv | Send*RoleWaitSend)) {
-      if (isSendNotRecv && (flags & SizesFifoEnabled))
-        __atomic_store_n(connSizesFifoPtr+step%NCCL_STEPS, nelts*sizeof(T), __ATOMIC_RELAXED);
+      if (flags & ConnFifoEnabled)
+        connFifo[step%NCCL_STEPS].size = nelts*sizeof(T);

      void **ptrs = isSendNotRecv ? (ncclShmem.groups[group].dsts + Dst)
                                  : (ncclShmem.groups[group].srcs + Src);
-      if (flags & OffsFifoEnabled)
-        ptrs[index] = connEltsFifo + loadInt(connOffsFifoPtr + (step%NCCL_STEPS))/sizeof(T);
-      else if (isSendNotRecv && DirectSend) {
+      if (flags & UserBufferMode) {
+         // Do nothing
+      } else if ((flags & ConnFifoEnabled) && connFifo[step%NCCL_STEPS].mode == NCCL_MODE_OFFSET) {
+        ptrs[index] = connEltsFifo + loadInt(&connFifo[step%NCCL_STEPS].offset)/sizeof(T);
+      } else if (isSendNotRecv && DirectSend) {
        if (flags & (DirectWrite | NvlsDirectWrite)) {
          ptrs[index] = directBuff + dstIx + offset;
        } else if (flags & DirectRead) {  // empty send
@@ -196,7 +195,7 @@ private:
    int slice = 0;
    int offset = 0;

-    if (tid < nworkers && offset < nelem) {
+    if (tid < nworkers && offset < nelem && ((flags & UserBufferMode) == 0)) {
      // Worker-only loop for non-empty slices. Non-workers and empty slices are
      // processed in the loop following this if block. The benefit of splitting
      // the loop like this is we pull two branches out of the critical path.
@@ -422,6 +421,55 @@ private:
    barrier();
  }

+public:
+  template<int Recv, int Send, typename Fn>  // Recv/Send (0 or 1) select which connection roles take part; Fn is a functor invoked once per slice
+  __device__ __forceinline__ void process(Fn &&fn) {  // per slice: wait for the peer step, publish slot pointers, run fn, then post the new step
+    #pragma unroll 1
+    for (int slice=0; slice < SlicePerChunk; slice++) {
+      if (tid < nworkers) {  // only worker threads wait on the connection and run fn
+        if (flags & (Recv*RoleWaitRecv | Send*RoleWaitSend)) {  // threads holding a wait role for an enabled direction
+          bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;  // with both directions enabled the role flag decides
+          int spins = 0;
+          while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) {  // the send side is given NCCL_STEPS of slack
+            connStepCache = loadStepValue(connStepPtr);  // refresh the cached step value
+            if (checkAbort(spins)) break;  // leave the spin loop when checkAbort returns true
+          }
+          void **ptrs = isSendNotRecv ? ncclShmem.groups[group].dsts  // senders fill the dsts table
+                                      : ncclShmem.groups[group].srcs;  // receivers fill the srcs table
+          if ((flags & ConnFifoEnabled) && connFifo[step%NCCL_STEPS].mode == NCCL_MODE_OFFSET) {
+            int offset = loadInt(&connFifo[step%NCCL_STEPS].offset);  // divided by sizeof(T) below, i.e. treated as a byte offset
+            ptrs[index] = connEltsFifo + offset/sizeof(T);
+          } else {
+            ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;  // slot (step % NCCL_STEPS), stepSize elements per slot
+          }
+        }
+        subBarrier();  // NOTE(review): helper defined outside this hunk; presumably syncs the workers so ptrs[] is visible before fn runs - confirm
+        fn.template operator()<SlicePerChunk, 0, Recv*MaxRecv, 0, Send*MaxSend>  // explicit template arguments: slice count, then min/max sources and min/max destinations
+          (tid, nworkers, slice, stepSize*StepPerSlice,
+           fan.nrecv(), ncclShmem.groups[group].srcs,
+           fan.nsend(), ncclShmem.groups[group].dsts, ncclShmem.groups[group].dstSizes);
+      }
+      barrier();  // precedes the dstSizes read below
+      int32_t dstSize = 0;
+      if (flags & Send*RolePostSend) {  // send-posting thread (only when Send is enabled)
+        dstSize = ncclShmem.groups[group].dstSizes[index];  // element count stored in dstSizes for this index
+        ncclShmem.groups[group].dstSizes[index] = 0;  // reset the entry to zero
+        if (flags & ConnFifoEnabled) connFifo[step%NCCL_STEPS].size = dstSize*sizeof(T);  // record the size in bytes in the fifo slot
+      }
+      barrier();
+      if (flags & (Recv*(RoleWaitRecv|RolePostRecv) | Send*(RoleWaitSend|RolePostSend))) {
+        step += StepPerSlice;  // every thread holding a role for an enabled direction advances its step
+      }
+      if (flags & (Recv*RolePostRecv | Send*RolePostSend)) {  // posting threads store the new step
+        if (Send && (!Recv || (flags & RolePostSend)) && (dstSize!=0 || (flags&ConnFifoEnabled))) {
+          fence_acq_rel_sys();  // issued before the step store when this is a send post with data or with the conn fifo enabled
+        }
+        st_relaxed_sys_global(connStepPtr, step);  // write the advanced step to connStepPtr
+      }
+    }
+  }
+
+private:
  // Scatter/Gather generic op
  // skip: my own rank order in the buffer chunks
  // shift: peer offset to avoid all ranks sending to or receiving from same peer
@@ -507,8 +555,11 @@ private:
        flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
        connStepPtr = conn->tail;
        connStepCache = loadStepValue(connStepPtr);
-        flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
-        if (Direct) {
+        connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
+        if (conn->connFifo != nullptr) {
+          flags |= ConnFifoEnabled;
+          connFifo = conn->connFifo;
+        } else if (Direct) {
          // User buffers have been registered
          if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
            if (connIndex == 1 && P2p == 0) {
@@ -530,9 +581,6 @@ private:
            flags |= NvlsDirectRead;
          }
        }
-        if (flags & OffsFifoEnabled)
-          connOffsFifoPtr = conn->offsFifo;
-        connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
      }
    }
  }
@@ -542,6 +590,10 @@ private:
      auto *conn = &peer->send[connIndex];
      step = conn->step;
      step = roundUp(step, SlicePerChunk*StepPerSlice);
+
+      connFifo = conn->connFifo;
+      if (connFifo != nullptr) flags |= ConnFifoEnabled;
+
      if (flags & RolePostSend) {
        connStepPtr = conn->tail;
 	      next_hdp_reg = conn->next_hdp_reg;
@@ -552,15 +604,8 @@ private:
        flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
        connStepPtr = conn->head;
        connStepCache = loadStepValue(connStepPtr);
-        flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
-        if (flags & OffsFifoEnabled)
-          connOffsFifoPtr = conn->offsFifo;
        connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
-
-        if (conn->sizesFifo != nullptr) {
-          flags |= SizesFifoEnabled;
-          connSizesFifoPtr = conn->sizesFifo;
-        } else if (Direct) {
+        if (connFifo == nullptr && Direct) {
          // User buffers have been registered
          if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
            if (connIndex == 1 && P2p == 0) {
@@ -590,7 +635,7 @@ private:
  __forceinline__ __device__ Primitives(
      int tid, int nthreads, int const *recvPeers, int const *sendPeers,
      void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
-      uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr, int stepSize_=0
+      uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr, struct ncclWorkElemP2p* p2p = nullptr, int stepSize_=0
    ):
    tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group),
    stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) {
@@ -631,6 +676,8 @@ private:
    loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e);
    loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e);

+    if (p2p && p2p->reg) flags |= UserBufferMode;
+
    // if (barrierAny(flags & NetDeviceUnpack)) {
    //   flags |= AnyNetDeviceUnpack;
    //   // g == 0 is the first ThreadPerSync # of threads of this warp
@@ -657,10 +704,21 @@ private:
      auto *conns = (flags & RolePostSend) ? ncclShmem.groups[group].sendConns : ncclShmem.groups[group].recvConns;
      conns[index]->step = step;
    }
-    
+    if ((flags & UserBufferMode) && (flags & RoleWaitSend)) {
+      // Make sure we wait until the proxy has sent data before we return.
+      // We don't want the next CUDA kernel to overwrite the send buffer which
+      // was accessed directly.
+      uint64_t prevStep = step - StepPerSlice;
+      volatile ssize_t* ptr = &(connFifo[prevStep%NCCL_STEPS].size);
+      while (*ptr != -1);
+    }
+
    if ((flags & (AnyNetDeviceUnpack)) && (flags & (RoleWaitRecv))) {
      ncclNetDeviceSaveHead(netDeviceHandle, group);
    }
+
+    // Make sure all threads are done writing back conn->step and done using
+    // ncclShmem.groups[group]
    barrier();
  }

@@ -860,4 +918,4 @@ private:
  __device__ __forceinline__ void localCopy(T* srcs, T* dsts, int eltN) {
    return mscclGenericOp<0,1,0,0>(&srcs, 1, &dsts, 1, eltN);
  }
-};
+};
@@ -17,56 +17,39 @@ namespace {
  __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
 #endif
    const int tid = threadIdx.x;
-    const int nthreads = args->nWarps*WARP_SIZE;
-    const int bid = args->bid;
-    const int nChannels = args->nChannels;
+    const int nthreads = (int)args->nWarps * WARP_SIZE;
    ncclRing *ring = &ncclShmem.channel.ring;
-    const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCE_CHUNKSTEPS : 1));
-    const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
    const int nranks = ncclShmem.comm.nRanks;
-    const ssize_t loopSize = nChannels*chunkSize;
-    const ssize_t size = args->count;
    const int rank = ncclShmem.comm.rank;
    const int prevRank = ring->userRanks[nranks-1];
    const int root = args->root;
+    const size_t chunkCount = args->chunkCount;
+    const size_t channelCount = args->workCount;
+    const size_t gridOffset = args->workOffset;
+    size_t offset;
+    int nelem;

    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
      prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, 0, args->connIndex, args->connIndex);

-    auto calcChunkSize = [&]__device__(ssize_t gridOffset)->int {
-      int realChunkSize;
-      if (Proto::Id == NCCL_PROTO_SIMPLE) {
-        realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels));
-        realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-      }
-      else if (Proto::Id == NCCL_PROTO_LL)
-        realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
-      else if (Proto::Id == NCCL_PROTO_LL128)
-        realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize);
-      return realChunkSize;
-    };
-
    if (prevRank == root) {
-      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        int realChunkSize = calcChunkSize(gridOffset);
-        ssize_t offset = gridOffset + bid*realChunkSize;
-        int nelem = min(realChunkSize, size-offset);
+      for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
+        offset = gridOffset + elemOffset;
+        nelem = min(chunkCount, channelCount - elemOffset);
        prims.send(offset, nelem);
      }
    }
    else if (rank == root) {
-      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        int realChunkSize = calcChunkSize(gridOffset);
-        ssize_t offset = gridOffset + bid*realChunkSize;
-        int nelem = min(realChunkSize, size-offset);
+      for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
+        offset = gridOffset + elemOffset;
+        nelem = min(chunkCount, channelCount - elemOffset);
        prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true);
      }
    }
    else {
-      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        int realChunkSize = calcChunkSize(gridOffset);
-        ssize_t offset = gridOffset + bid*realChunkSize;
-        int nelem = min(realChunkSize, size-offset);
+      for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
+        offset = gridOffset + elemOffset;
+        nelem = min(chunkCount, channelCount - elemOffset);
        prims.recvReduceSend(offset, nelem);
      }
    }
@@ -93,4 +76,4 @@ struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128
  __device__ __forceinline__ void run(ncclWorkElem *args) {
    runRing<T, RedOp, ProtoLL128>(args);
  }
-};
+};
@@ -663,7 +663,7 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
      BytePack<2*sizeof(T)> tmp; \
      asm("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
        : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
-        : "l"(addr & -uintptr_t(sizeof(T)))); \
+        : "l"(addr & -uintptr_t(2*sizeof(T)))); \
      return tmp.half[(addr/sizeof(T))%2]; \
    } \
  };
@@ -676,11 +676,11 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
      if (fn.isMinNotMax) { \
        asm("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
          : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
-          : "l"(addr & -uintptr_t(sizeof(T)))); \
+          : "l"(addr & -uintptr_t(2*sizeof(T)))); \
      } else { \
        asm("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
          : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
-          : "l"(addr & -uintptr_t(sizeof(T)))); \
+          : "l"(addr & -uintptr_t(2*sizeof(T)))); \
      } \
      return tmp.half[(addr/sizeof(T))%2]; \
    } \
@@ -17,56 +17,43 @@ namespace {
  __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
 #endif
    const int tid = threadIdx.x;
-    const int nthreads = args->nWarps*WARP_SIZE;
-    const int bid = args->bid;
-    const int nChannels = args->nChannels;
+    const uint32_t nthreads = (uint32_t)args->nWarps * WARP_SIZE;
    ncclRing *ring = &ncclShmem.channel.ring;
    int const *ringRanks = ring->userRanks;
-    const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCESCATTER_CHUNKSTEPS : 1));
-    // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
-    const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
+    const size_t chunkCount = args->chunkCount;
    const int nranks = ncclShmem.comm.nRanks;
-    const ssize_t loopSize = nChannels*chunkSize;
-    const ssize_t size = args->count;
+    size_t channelCount = args->workCount;
+    size_t gridOffset = args->workOffset;
+    size_t offset;
+    size_t dataOffset;
+    size_t count = args->count;
+    uint32_t nelem;
+    int rankDest;

    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
      prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, 0, args->connIndex, args->connIndex);

-    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-      ssize_t realChunkSize;
-      if (Proto::Id == NCCL_PROTO_SIMPLE) {
-        realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels));
-        realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-      }
-      else if (Proto::Id == NCCL_PROTO_LL)
-        realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
-      else if (Proto::Id == NCCL_PROTO_LL128)
-        realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize);
-      realChunkSize = int(realChunkSize);
-
-      ssize_t chunkOffset = gridOffset + bid*int(realChunkSize);
+    for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
+      nelem = min(chunkCount, channelCount - elemOffset);

+      dataOffset = gridOffset + elemOffset;
      /////////////// begin ReduceScatter steps ///////////////
-      ssize_t offset;
-      int nelem = min(realChunkSize, size-chunkOffset);
-      int rankDest;
-
      // step 0: push data to next GPU
      rankDest = ringRanks[nranks-1];
-      offset = chunkOffset + rankDest * size;
+      offset = dataOffset + rankDest * count;
      prims.send(offset, nelem);

      // k-2 steps: reduce and copy to next GPU
      for (int j=2; j<nranks; ++j) {
        rankDest = ringRanks[nranks-j];
-        offset = chunkOffset + rankDest * size;
+        offset = dataOffset + rankDest * count;
        prims.recvReduceSend(offset, nelem);
      }

      // step k-1: reduce this buffer and data, which will produce the final result
      rankDest = ringRanks[0];
-      offset = chunkOffset + rankDest * size;
-      prims.recvReduceCopy(offset, chunkOffset, nelem, /*postOp=*/true);
+      offset = dataOffset + rankDest * count;
+      prims.recvReduceCopy(offset, dataOffset, nelem, /*postOp=*/true);
    }
  }
 }
@@ -97,14 +84,15 @@ template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
  __device__ __forceinline__ void run(ncclWorkElem *args) {
    const int tid = threadIdx.x;
-    const int bid = args->bid;
-    const int nChannels = args->nChannels;
    struct ncclNvls* nvls = &ncclShmem.channel.nvls;
-    const ssize_t chunkSize = int(args->lastChunkSize);
-    const ssize_t size = args->count;
-    const ssize_t loopSize = nChannels*chunkSize;
+    const size_t chunkCount = args->chunkCount;
+    const size_t count = args->count;
    const int rank = ncclShmem.comm.rank;
    const int nranks = ncclShmem.comm.nRanks;
+    size_t gridOffset = args->workOffset;
+    size_t channelCount = args->workCount;
+    size_t offset;
+    int nelem;

    /* if we are direct NVLS, we only need to allocate 1 warp to scatter for sync; 
     * if not, based on #ranks, we allocate 7 or 5 warps to reduce to saturate bandwidth
@@ -121,10 +109,10 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
        Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
          prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
            args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + bid * chunkSize;
-          int nelem = min(chunkSize, size - offset);
-          prims.scatter(offset, nvls->nHeads * size, nelem, size, -1, 0);
+        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
+          offset = gridOffset + elemOffset;
+          nelem = min(chunkCount, channelCount - elemOffset);
+          prims.scatter(offset, nvls->nHeads * count, nelem, count, -1, 0);
        }
      } else if (tid < tidEndReduce) {
        // Reduce through NVLS
@@ -132,9 +120,9 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
        Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
          prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff,
            args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + bid * chunkSize;
-          int nelem = min(chunkSize, size - offset);
+        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
+          offset = gridOffset + elemOffset;
+          nelem = min(chunkCount, channelCount - elemOffset);
          prims.recv(offset, nelem);
        }
      }
@@ -145,7 +133,7 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
        Primitives<T, RedOp, FanSymmetric<NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
          prims(tid, nThreadsScatter, nvls->up, nvls->up, NULL, NULL,
            args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
          prims.scatter(0, 0, 0, 0, -1, 0);
        }

@@ -157,10 +145,10 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
          prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->down, NULL, args->recvbuff,
            args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
-        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t outOffset = gridOffset + bid * chunkSize;
-          ssize_t inpOffset = outOffset + rank * size;
-          int nelem = min(chunkSize, size - outOffset);
+        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
+          size_t outOffset = gridOffset + elemOffset;
+          size_t inpOffset = outOffset + rank * count;
+          nelem = min(chunkCount, channelCount - elemOffset);
          prims.directRecvCopy(inpOffset, outOffset, nelem);
        }

@@ -170,3 +158,146 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
    }
  }
 };
+
+template<typename T, typename RedOp>  // ReduceScatter work element specialized for NCCL_ALGO_COLLNET_DIRECT with NCCL_PROTO_SIMPLE
+struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
+  template<bool ReduceSendNotRecv>  // run() uses true in phase 1 and false in phase 2
+  struct Scatterer {  // functor handed to Primitives::process() by run()
+    struct ncclWorkElem* args;
+    int chunkSize;  // elements per channel chunk; assigned by run() before each process() call
+    ssize_t railGridOffset;  // start of the current grid-wide stride; assigned by run()
+
+    template<int SlicePerChunk, int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts>
+    __device__ __forceinline__ void operator()(
+        int tid, int tn, int slice, int maxSliceSize,
+        int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes
+      ) {
+      static_assert(SlicePerChunk==1, "require: SlicePerChunk==1");
+      static_assert(MaxDsts<=1 || MaxSrcs<=1, "require: MaxDsts<=1 || MaxSrcs<=1");
+
+      struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
+      int nNodes = ncclShmem.comm.nNodes;
+      int nRails = direct->nHeads;  // the head count is used as the number of rails
+      int bid = args->bid;
+      void* inbuf = (void*)args->sendbuff;
+      ssize_t sizePerRank = args->count;
+
+      ssize_t railAllBeg = min(railGridOffset + bid*chunkSize, nNodes*sizePerRank);  // this channel's chunk start, clamped to nNodes*sizePerRank
+      ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank);  // chunk end, clamped the same way
+      int railAllSize = railAllEnd - railAllBeg;
+      if (tid < nDsts) dstSizes[tid] = railAllSize;  // one thread per destination records the chunk's element count
+
+      int dst = 0;
+      int rail;
+      if (!ReduceSendNotRecv) {
+        rail = direct->headRank;  // start at this rank's own head index
+      } else {
+        rail = direct->headRank+1;  // start at the rail after this rank's head index
+        if (rail == nRails) rail = 0;  // wrap around
+      }
+      do {  // one pass per destination pointer
+        int node = railAllBeg/sizePerRank;  // node whose segment contains the chunk start
+        int railAllOffset = 0;
+        while (railAllOffset < railAllSize) {  // walk the chunk, splitting at node-segment boundaries
+          ssize_t railOneBeg = node*sizePerRank;
+          ssize_t railOneEnd = railOneBeg + sizePerRank;
+          ssize_t railOneOffset = (railAllBeg+railAllOffset) - railOneBeg;  // position inside this node's segment
+          int delta = min(railAllEnd, railOneEnd) - (railAllBeg+railAllOffset);  // elements up to the chunk end or the segment end, whichever is first
+          int rank = ncclShmem.comm.collNetDenseToUserRank[node*nRails + rail];  // table lookup keyed by (node, rail)
+          ssize_t userOneBeg = rank*sizePerRank + railOneOffset;  // element offset into inbuf for that rank's segment
+          reduceCopy<ncclCollUnroll(), RedOp, T,
+                     /*MultimemSrcs=*/0, 1+MinSrcs, 1+MaxSrcs,
+                     /*MultimemDsts,MinDsts,MaxDsts=*/0,1,1,
+                     /*PreOpSrcs=*/1>
+            (tid, tn, args->redOpArg, &args->redOpArg, false,
+             /*nSrcs=*/1+nSrcs, [=]__device__(int s) {  // source 0 is the local input buffer, the remaining sources come from srcPtrs
+               return s==0 ? (T*)inbuf + userOneBeg
+                           : (T*)srcPtrs[s-1] + railAllOffset;
+             },
+             /*nDsts=*/1, [=]__device__(int d/*==0*/) {  // single destination: dstPtrs[dst] at the same chunk offset
+               return (T*)dstPtrs[dst] + railAllOffset;
+             },
+             delta);
+          railAllOffset += delta;
+          node += 1;  // the next piece starts at the following node's segment
+        }
+        dst += 1;
+        rail += 1;
+        if (rail == nRails) rail = 0;
+      } while (ReduceSendNotRecv && dst < nRails-1);  // repeats only when ReduceSendNotRecv is true, while dst < nRails-1
+    }
+  };
+
+  __device__ __forceinline__ void run(ncclWorkElem *args) {  // splits the block's warps into three groups, one per phase below
+    int tid = threadIdx.x;
+    const int nChannels = args->nChannels;
+    struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
+    int const &nNodes = ncclShmem.comm.nNodes;
+    ssize_t chunkSize = int(args->chunkCount);
+    ssize_t sizePerRank = args->count;
+
+    // if (direct->out == -1) __trap();
+    bool isMultiRail = (direct->nHeads > 1);
+    int nWarps1 = (isMultiRail ? 2 : 0);  // relative warp weights: 2:2:1 with several heads, 0:1:1 with a single head
+    int nWarps2 = (isMultiRail ? 2 : 1);
+    int nWarps3 = 1;
+    float denom = float(args->nWarps)/float(nWarps1+nWarps2+nWarps3);  // scale factor from weights to actual warps
+    nWarps3 = int(denom*nWarps3);
+    nWarps2 = int(denom*nWarps2);
+    nWarps1 = args->nWarps - (nWarps2+nWarps3);  // phase 1 receives whatever warps remain
+
+    using Proto = ProtoSimple<1, 1>;
+
+    int tn = nWarps1*WARP_SIZE;
+    if (tid < tn) {
+      // Phase 1: Scatter inputs to peers
+      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
+        prims(tid, tn, nullptr, direct->heads+1, nullptr, nullptr,
+              args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
+      for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {  // stride of nChannels*chunkSize elements
+        Scatterer</*ReduceSendNotRecv=*/true> scat;
+        scat.args = args;
+        scat.chunkSize = chunkSize;
+        scat.railGridOffset = railGridOffset;
+        prims.template process</*Recv=*/0, /*Send=*/1>(scat);
+      }
+      return;  // threads of this group take no part in the later phases
+    }
+    tid -= tn;  // rebase tid so the next group's threads start at 0
+
+    tn = nWarps2*WARP_SIZE;
+    if (tid < tn) {
+      // Phase 2: Reduce from peers + local input -> send to network
+      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
+        prims(tid, tn, direct->heads+1, &direct->out, nullptr, nullptr,
+              args->redOpArg, 1*Proto::MaxGroupWidth, 1, 1);
+      for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
+        Scatterer</*ReduceSendNotRecv=*/false> scat;
+        scat.args = args;
+        scat.chunkSize = chunkSize;
+        scat.railGridOffset = railGridOffset;
+        prims.template process</*Recv=*/1, /*Send=*/1>(scat);
+      }
+      return;
+    }
+    tid -= tn;
+
+    tn = nWarps3*WARP_SIZE;
+    if (tid < tn) {
+      // Phase 3: recv from network
+      Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
+        prims(tid, tn, &direct->out, nullptr, nullptr, args->recvbuff,
+              args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
+      for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
+        ssize_t railAllBeg = railGridOffset + args->bid*chunkSize;  // this channel's chunk for the current stride
+        ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank);
+        ssize_t railOneBeg = ncclShmem.comm.node*sizePerRank;  // this node's segment is [railOneBeg, railOneEnd)
+        ssize_t railOneEnd = railOneBeg + sizePerRank;
+        ssize_t beg = max(railAllBeg, railOneBeg);  // overlap of the chunk with this node's segment
+        ssize_t end = min(railAllEnd, railOneEnd);
+        prims.recv(beg-railOneBeg, max(ssize_t(0), end-beg), /*postOp=*/true);  // the element count is clamped to 0 when there is no overlap
+      }
+      return;
+    }
+  }
+};
@@ -85,7 +85,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
      if (args->proto == NCCL_PROTO_LL) chunkSize /= 2;
      int const peer = args->peer;
      Primitives<T, RedOp, FanAsymmetric<0, 1>, 0, Proto, 1> prims
-        (tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, args->connIndex, args->connIndex, nullptr, ncclShmem.comm.p2pChunkSize/sizeof(T));
+        (tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, args->connIndex, args->connIndex, nullptr, args, ncclShmem.comm.p2pChunkSize/sizeof(T));

 #if defined(ENABLE_NPKIT)
      if (isNpKitThread) {
@@ -106,7 +106,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
        int nelem = min(size_t(chunkSize), count-offset);
        prims.directSend(offset, offset, nelem);
        offset += nelem;
-      } while(offset < count);
+      } while(offset < count && args->reg == 0);

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_SEND_EXIT)
      if (isNpKitThread) {
@@ -147,7 +147,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
      if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; // This is to account for chunkEffectiveSize
      int const peer = args->peer;
      Primitives<T, RedOp, FanAsymmetric<1, 0>, 0, Proto, 1> prims
-        (tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, args->connIndex, args->connIndex, nullptr, ncclShmem.comm.p2pChunkSize/sizeof(T));
+        (tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, args->connIndex, args->connIndex, nullptr, args, ncclShmem.comm.p2pChunkSize/sizeof(T));

 #if defined(ENABLE_NPKIT)
      if (isNpKitThread) {
@@ -168,7 +168,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
        int nelem = min(size_t(chunkSize), count-offset);
        prims.directRecv(offset, nelem);
        offset += nelem;
-      } while(offset < count);
+      } while(offset < count && args->reg == 0);

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_RECV_EXIT)
      if (isNpKitThread) {
@@ -230,4 +230,4 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
      }
    }
  }
-};
+};
@@ -22,6 +22,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs
  int localRanks = comm->topo->nodes[GPU].count;
  int nChannels = comm->nChannels;

+  topoRanks->nvlsHeadNum = 0;
  for (int c=0; c<nChannels; c++) {
    struct ncclChannel* channel = comm->channels+c;
    channel->ring.prev = channel->ring.next = -1;
@@ -33,20 +34,20 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs
    channel->collnetDirect.headRank = -1;
    channel->collnetDirect.nHeads = 0;
    channel->collnetDirect.shift = 0;
+    for (int i=0; i<NCCL_MAX_DIRECT_ARITY+1; i++) channel->collnetDirect.heads[i] = -1;
    for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.up[i] = -1;
    for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.down[i] = -1;

    int* ringIntra = graphs[NCCL_ALGO_RING]->intra+c*localRanks;
    int* treeIntra = graphs[NCCL_ALGO_TREE]->intra+c*localRanks;
    int* collNetIntra = graphs[NCCL_ALGO_COLLNET_CHAIN]->intra+c*localRanks;
-    int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra+c*localRanks;

    for (int i=0; i<localRanks; i++) {
      if (ringIntra[i] == rank) {
        topoRanks->ringRecv[c] = ringIntra[0];
        topoRanks->ringSend[c] = ringIntra[localRanks-1];
-        channel->ring.prev = (i == 0) ? -1 : ringIntra[i-1];
-        channel->ring.next = (i == localRanks-1) ? -1 : ringIntra[i+1];
+        topoRanks->ringPrev[c] = (i == 0) ? -1 : ringIntra[i-1];
+        topoRanks->ringNext[c] = (i == localRanks-1) ? -1 : ringIntra[i+1];
      }
      if (treeIntra[i] == rank) {
        int parentIndex = 0;
@@ -64,14 +65,28 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs
        channel->collnetChain.down[0] = i == localRanks-1 ? -1 : collNetIntra[i+1];
      }
    }
-    topoRanks->ringPrev[c] = channel->ring.prev;
-    topoRanks->ringNext[c] = channel->ring.next;
-    topoRanks->nvlsHeads[c] = nvlsIntra[0];
  }
-  // Duplicate channels rings/trees
+  // Duplicate channels trees
  struct ncclChannel* channel0 = comm->channels;
  struct ncclChannel* channel1 = (nChannels > MAXCHANNELS/2) ? 0 : channel0+nChannels;
  if (channel1) memcpy(channel1, channel0, nChannels*sizeof(struct ncclChannel));
+
+  // Get nvls heads and the number of heads. Duplicate head is not allowed.
+  for (int c = 0; c < graphs[NCCL_ALGO_NVLS]->nChannels; ++c) {
+    bool addHead = true;
+    int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * localRanks;
+
+    for (int dup = 0; dup < topoRanks->nvlsHeadNum; dup++) {
+      if (topoRanks->nvlsHeads[dup] == nvlsIntra[0]) {
+        addHead = false;
+        break;
+      }
+    }
+    if (addHead) {
+      topoRanks->nvlsHeads[topoRanks->nvlsHeadNum++] = nvlsIntra[0];
+    }
+  }
+  
  return ncclSuccess;
 }

@@ -203,26 +218,14 @@ static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ring
    int* send = ringSend+c*comm->nNodes;
    int* prev = ringPrev+c*comm->nRanks;
    int* next = ringNext+c*comm->nRanks;
-    struct ncclChannel* channel0 = comm->channels+c;
-    struct ncclChannel* channel1 = (nChannels > MAXCHANNELS/2) ? 0 : channel0+nChannels;
    for (int n=0; n<nNodes; n++) {
      int recvRank = recv[n];
      int prevSendRank = send[(n-1+nNodes)%nNodes];
      prev[recvRank] = prevSendRank;
-      if (comm->rank == recvRank) {
-        channel0->ring.prev = prevSendRank;
-        if (channel1) channel1->ring.prev = prevSendRank;
-      }
      int sendRank = send[n];
      int nextRecvRank = recv[(n+1)%nNodes];
      next[sendRank] = nextRecvRank;
-      if (comm->rank == sendRank) {
-        channel0->ring.next = nextRecvRank;
-        if (channel1) channel1->ring.next = nextRecvRank;
-      }
    }
-    TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, channel0->ring.prev, comm->rank, channel0->ring.next);
-    if (channel1) TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c+nChannels, channel1->ring.prev, comm->rank, channel1->ring.next);
  }

  // [RCCL] Print off the recv/send local ranks per node, per channel
@@ -404,6 +407,15 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
      channel->collnetDirect.up[nUp++] = heads[h];
      sprintf(line+strlen(line), " %d ", heads[h]);
    }
+    sprintf(line+strlen(line), "heads ");
+    { // heads[] is the list of heads ordered in head order starting with self
+      int h0 = (channel->collnetDirect.headRank == -1) ? 0 : channel->collnetDirect.headRank;
+      for (int h1=0; h1 < nHeads; h1++) {
+        int h = (h0+h1)%nHeads;
+        channel->collnetDirect.heads[h1] = heads[h];
+        sprintf(line+strlen(line), " %d ", heads[h]);
+      }
+    }
    channel->collnetDirect.nHeads = nHeads;
    channel->collnetDirect.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously
    channel->collnetDirect.depth = (nUp == 0 && nDown == 0) ? 1 : 2;
@@ -412,27 +424,22 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
    INFO(NCCL_GRAPH, "%s", line);
    channel->collnetChain.depth = comm->nRanks/comm->nNodes;
  }
-  for (int c=0; c<comm->nvlsChannels; c++) {
-    struct ncclChannel* channel = comm->channels+c;
-    if (channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks;
-  }
  free(heads);
  return ncclSuccess;
 }

-static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, struct ncclTopoGraph* nvlsGraph) {
-  int nHeads = nvlsGraph->nChannels;
+static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHeads) {
  int headRank = -1;
-  for (int h=0; h<nHeads; h++) {
-    if (nvlsGraph->intra[h*comm->localRanks] == comm->rank) headRank = h;
-  }
-
  if (nHeads == 0) {
    comm->nvlsChannels = 0;
    return ncclSuccess;
  }

-  for (int c=0; c<comm->nvlsChannels; c++) {
+  for (int h = 0; h < nHeads; h++) {
+    if (nvlsHeads[h * comm->nNodes + comm->node] == comm->rank) headRank = h;
+  }
+
+  for (int c=0; c<comm->nChannels; c++) {
    struct ncclChannel* channel = comm->channels+c;
    channel->nvls.nHeads = nHeads;
    for (int h=0; h<nHeads; h++) channel->nvls.up[h] = comm->nRanks+1+h;
@@ -443,8 +450,10 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, struct nc
    channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1;
    channel->nvls.node = comm->node;
    channel->nvls.nNodes = comm->nNodes;
+    if (comm->collNetSupport && channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks;
  }
-  if (comm->nNodes == 1) return ncclSuccess;
+  // MNNVL: NVLS not yet supported
+  if (comm->nNodes == 1 || comm->MNNVL) return ncclSuccess;

  // Connect Trees
  int tree0Parent, tree0Child0, tree0Child1, tree1Parent, tree1Child0, tree1Child1;
@@ -485,7 +494,7 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, struct nc
  }
  // Set prev/next in all channels (NVLS compute channels work
  // orthogonally to NVLS search channels).
-  for (int c=0; c<comm->nvlsChannels; c++) {
+  for (int c=0; c<comm->nChannels; c++) {
    struct ncclChannel* channel = comm->channels+c;
    channel->nvls.treeUp = treeUp[c%2];
    channel->nvls.treeDown[0] = channel->nvls.down;
@@ -543,12 +552,19 @@ static int copyChannels(struct ncclComm* comm, int start, int end, int* ringPrev
  return c;
 }

+void exchangeValues(int* v0, int* v1) {
+  int tmp = *v1;
+  *v1 = *v0;
+  *v0 = tmp;
+}
+
 ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, int nc) {
  // Gather data from all ranks
  int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads;
  int nranks = comm->nRanks;
  int nNodes = comm->nNodes;
  int nChannels = comm->nChannels;
+  int minHeadNum = INT_MAX;
  NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&ringSend, nNodes*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
@@ -557,6 +573,22 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
  NCCLCHECK(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS));
+
+  // Alternate rings to avoid crossing rails
+  if (graphs[NCCL_ALGO_RING]->crossNic && (comm->nNodes % 2) == 0 && (nChannels % 2) == 0) {
+    for (int r=0; r<comm->nRanks; r++) {
+      if (comm->rankToNode[r] % 2 == 1) {
+        // Exchange rings
+        for (int c=0; c<nChannels; c+=2) {
+          exchangeValues(allTopoRanks[r]->ringRecv+c, allTopoRanks[r]->ringRecv+(c^1));
+          exchangeValues(allTopoRanks[r]->ringSend+c, allTopoRanks[r]->ringSend+(c^1));
+          exchangeValues(allTopoRanks[r]->ringPrev+c, allTopoRanks[r]->ringPrev+(c^1));
+          exchangeValues(allTopoRanks[r]->ringNext+c, allTopoRanks[r]->ringNext+(c^1));
+        }
+      }
+    }
+  }
+
  for (int c=0; c<nChannels;c++) {
    for (int n=0; n<nNodes; n++) {
      int r = firstRanks[n];
@@ -571,17 +603,23 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
      ringNext[c*nranks+r] = allTopoRanks[r]->ringNext[c];
    }
  }
-  for (int c=0; c<graphs[NCCL_ALGO_NVLS]->nChannels; c++) {
-    for (int n=0; n<nNodes; n++) {
+
+  for (int n = 0; n < nNodes; n++) {
+    int r = firstRanks[n];
+    if (minHeadNum > allTopoRanks[r]->nvlsHeadNum)
+      minHeadNum = allTopoRanks[r]->nvlsHeadNum;
+  }
+
+  for (int c = 0; c < minHeadNum; c++) {
+    for (int n = 0; n < nNodes; n++) {
      int r = firstRanks[n];
-      nvlsHeads[c*nNodes+n] = allTopoRanks[r]->nvlsHeads[c];
+      nvlsHeads[c * nNodes + n] = allTopoRanks[r]->nvlsHeads[c];
    }
  }

  // Connect rings and trees. This should also duplicate the channels.
  NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext));
  NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns));
-  NCCLCHECK(connectNvls(comm, nvlsHeads, graphs[NCCL_ALGO_NVLS]));

  // Only use full MAXCHANNELS for gfx94x
  int maxChannels = IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ? MAXCHANNELS : (MAXCHANNELS/2);
@@ -595,6 +633,14 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
  nc = std::min(maxNchannels/comm->nChannels, nc);
  nc *= comm->nChannels;

+  // Set ring prev/next for my rank
+  for (int c=0; c<nChannels; c++) {
+    struct ncclChannel* channel0 = comm->channels+c;
+    struct ncclChannel* channel1 = channel0+nChannels;
+    channel0->ring.prev = channel1->ring.prev = ringPrev[c*nranks+comm->rank];
+    channel0->ring.next = channel1->ring.next = ringNext[c*nranks+comm->rank];
+  }
+
  // Duplication should be complete now
  nChannels = comm->nChannels = std::min(maxChannels, (nChannels <= maxChannels/2) ? nChannels*2 : nChannels);

@@ -633,6 +679,13 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
    nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(minNchannels, std::max(nc, comm->config.minCTAs)), ringPrev, ringNext);
  }

+  comm->collChannels = comm->nChannels;
+  // Support maximal channel usage for aggregation
+  if (comm->nChannels < comm->nvlsChannels) {
+    nChannels = comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext);
+  }
+  NCCLCHECK(connectNvls(comm, nvlsHeads, minHeadNum));
+
  // Create rings array and check all is fine
  NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));

@@ -646,4 +699,4 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
  free(nvlsHeads);

  return ncclSuccess;
-}
+}
@@ -349,6 +349,23 @@ compare:
  return ncclSuccess;
 }

+// MNNVL: Check whether peers are in the same fabric cluster and clique
+ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret) {
+  *ret = 0;
+
+  nvmlGpuFabricInfoV_t *fabricInfo1 = &info1->fabricInfo;
+  nvmlGpuFabricInfoV_t *fabricInfo2 = &info2->fabricInfo;
+  // A zero UUID means we don't have MNNVL fabric info
+  if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) return ncclSuccess;
+  if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) &&
+      (fabricInfo1->cliqueId == fabricInfo2->cliqueId)) {
+    INFO(NCCL_NET, "MNNVL matching peer 0x%lx UUID %lx.%lx cliqueId 0x%x",
+         info2->busId, ((long *)fabricInfo2->clusterUuid)[0], ((long *)fabricInfo2->clusterUuid)[1], fabricInfo2->cliqueId);
+    *ret = 1;
+  }
+  return ncclSuccess;
+}
+
 NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
 int ncclTopoUserGdrLevel = -1;

@@ -779,7 +796,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
  }

  comm->localRanks = system->nodes[GPU].count;
-  if (system->nodes[GPU].count == comm->nRanks && remove) {
+  if ((system->nodes[GPU].count == comm->nRanks && remove) || comm->MNNVL) {
    for (int n=system->nodes[NET].count-1; n>=0; n--)
      NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
  }
@@ -794,11 +811,12 @@ void ncclTopoFree(struct ncclTopoSystem* system) {
  free(system);
 }

-NCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", 1);
+NCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", -1);
 NCCL_PARAM(NChannelsPerPeer, "NCHANNELS_PER_PEER", -2);

-static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /*local gpu index*/, int peerRank, int* nChannels) {
+static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gpu index*/, int peerRank, int* nChannels) {
  int peer;
+  struct ncclTopoSystem* system = comm->topo;
  struct ncclTopoLinkList* path = NULL;
  if (ncclTopoRankToIndex(system, peerRank, &peer) == ncclSuccess) {
    // Same rank
@@ -814,9 +832,28 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /*
    } else {
      *nChannels = 2;
    }
+  } else if (comm->MNNVL) {
+    // MNNVL: assume all GPUs are connected via NVLink
+    path = system->nodes[GPU].nodes[g].paths[GPU]+((g+1)%system->nodes[GPU].count);
+    float nvlBw = ncclTopoNVLinkBw(system->nodes[GPU].nodes[g].gpu.cudaCompCap);
+    *nChannels = 2*std::max(1, (int)(path->bw / nvlBw));
  } else {
    // Remote rank, use network
-    *nChannels = ncclParamNChannelsPerNetPeer();
+    int nNetChannels = ncclParamNChannelsPerNetPeer();
+    if (nNetChannels == -1) {
+       //start from 2 channels per NIC and reduce with scale
+       nNetChannels = 2;
+
+       // check if we need to use more than one NIC, hence more than one channel
+       int netCountByBw = 1, nChannelsMax = nNetChannels;
+       NCCLCHECK(getLocalNetCountByBw(system, g, &netCountByBw));
+       // Avoid overloading channels with 8+ operations as we lose the sync warp, hence a bit of bandwidth.
+       while (nChannelsMax*comm->nRanks > comm->p2pnChannels*4 && nChannelsMax > 1) nChannelsMax /= 2;
+
+       // Allow up to the number of channels required to drive the NICs
+       nNetChannels = std::max(netCountByBw, nChannelsMax);
+    }
+    *nChannels = nNetChannels;
  }
  return ncclSuccess;
 }
@@ -845,7 +882,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
  for (int g=0; g<comm->topo->nodes[GPU].count; g++) {
    for (int r=0; r<comm->nRanks; r++) {
      int nChannels;
-      NCCLCHECK(ncclTopoGetNchannels(comm->topo, g, r, &nChannels));
+      NCCLCHECK(ncclTopoGetNchannels(comm, g, r, &nChannels));
      if (nChannels >= 0) minChannels = std::min(minChannels, nChannels);
    }
  }
@@ -907,4 +944,4 @@ int ncclTopoPathAllNVLink(struct ncclTopoSystem* system) {
    }
  }
  return minPath >= PATH_PIX ? 0 : 1;
-}
+}
@@ -404,13 +404,12 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
    return ncclSuccess;
  }
  // 2. Try to get better bandwidth
-  // Give a 15% perf bonus to paths not crossing nics
-  float target = 1.0 - (refGraph->crossNic - graph->crossNic) * .15;
-  if (graph->nChannels*graph->bwIntra > refGraph->nChannels*refGraph->bwIntra*target) {
+  // NOTE: no crossNic perf bonus is applied in this comparison any more; raw bandwidths are compared
+  if (graph->nChannels*graph->bwIntra > refGraph->nChannels*refGraph->bwIntra) {
    *copy = 1;
    return ncclSuccess;
  }
-  if (graph->nChannels*graph->bwIntra < refGraph->nChannels*refGraph->bwIntra*target) return ncclSuccess;
+  if (graph->nChannels*graph->bwIntra < refGraph->nChannels*refGraph->bwIntra) return ncclSuccess;

  // 3. Less hops
  if (graph->pattern == refGraph->pattern && graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) *copy = 1;
@@ -520,6 +519,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
        struct ncclTopoNode* net = system->nodes[NET].nodes+n;
        if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric
        if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
+        if (graph->crossNic && (graph->nChannels & 1) && net->id != graph->inter[(graph->nChannels-1)*2]) continue;

        // Balanced Tree : count half of the bandwidth on first two GPUs
        int nextBackToNet = -1;
@@ -591,6 +591,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
    struct ncclTopoNode* net = system->nodes[NET].nodes+n;
    if (graph->collNet && net->net.collSupport == 0) continue;
    if (net->net.bw < bw) continue;
+    if (graph->crossNic && (graph->nChannels & 1) && net->id != graph->inter[(graph->nChannels-1)*2+1]) continue;

    graph->inter[graph->nChannels*2] = net->id;
    graph->latencyInter = net->net.latency;
@@ -1180,16 +1181,29 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru

 #include "comm.h"
 // NVLS channels aren't compute channels. Find which NIC corresponds to our rank being the head
-ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, int* dev) {
+ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, int channelId, int* dev) {
+  ncclResult_t ret = ncclSuccess;
  int localRanks = comm->topo->nodes[GPU].count;
-  for (int c=0; c<graph->nChannels; c++) {
-    if (graph->intra[c*localRanks] == comm->rank) {
-      *dev = graph->inter[c*2];
-      return ncclSuccess;
+  int netNum = 0;
+  int net[MAXCHANNELS];
+
+  for (int c = 0; c < graph->nChannels; c++) {
+    if (graph->intra[c * localRanks] == comm->rank) {
+      net[netNum++] = graph->inter[c * 2];
    }
  }
+  if (netNum) {
+    *dev = net[channelId % netNum];
+  } else {
+    ret = ncclInternalError;
+    goto fail;
+  }
+
+exit:
+  return ret;
+fail:
  WARN("Could not find NIC for rank %d in NVLS graph\n", comm->rank);
-  return ncclInternalError;
+  goto exit;
 }

 // 0: don't use PXN for P2P, 1: use PXN if needed, 2: use PXN as much as possible to maximize aggregation
@@ -1204,7 +1218,7 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG
    if (graph->pattern != NCCL_TOPO_PATTERN_NVLS) {
      *dev = graph->inter[channel*2+index];
    } else {
-      NCCLCHECK(getNvlsNetDev(comm, graph, dev));
+      NCCLCHECK(getNvlsNetDev(comm, graph, channelId, dev));
    }
    NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
  } else if (peerRank == -1) {
@@ -186,12 +186,17 @@ ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode
 // even though they're supposed to sustain full BW across all ports.
 // Flatten the switch as this extra level can break the search and make
 // NCCL take wrong topology decisions.
+int getBcmGen(uint64_t id, int level) {
+  if ((id & 0xfffffffffffff000) == 0x1000c0101000a000) return 4;
+  if ((id & 0xfffffffffffff000) == (0x1000c03010000000 | level*0x1000)) return 5;
+  return 0;
+}
 ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
  for (int s=0; s<system->nodes[PCI].count; s++) {
    struct ncclTopoNode* pciSwitch = system->nodes[PCI].nodes+s;
-    uint64_t device = pciSwitch->pci.device;
-    // Only flatten PEX Gen 4 switches in base mode
-    if ((device & 0xfffffffffffff000) == 0x1000c0101000a000) {
+    int gen = getBcmGen(pciSwitch->pci.device, 0);
+    // Flatten Gen4 and Gen5 PEX switches in base mode
+    if (gen) {
      // Find sub switches with the same device ID.
      int64_t* subSwIds;
      NCCLCHECK(ncclCalloc(&subSwIds, pciSwitch->nlinks));
@@ -199,7 +204,7 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
      for (int l=0; l<pciSwitch->nlinks; l++) {
        struct ncclTopoNode* sub = pciSwitch->links[l].remNode;
        // Only fuse sub switches with the same device ID.
-        if (sub->type != PCI || sub->pci.device != device) continue;
+        if (sub->type != PCI || getBcmGen(sub->pci.device, 1) != gen) continue;
        // Save sub switch for later
        subSwIds[subs++] = sub->id;
        // Remove link to that sub switch
@@ -231,8 +236,8 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
        }
        NCCLCHECK(ncclTopoRemoveNode(system, PCI, index));
      }
-      // Set subdevice to 0x0000 to make sure we don't merge this switch again.
-      pciSwitch->pci.device = 0x1000c01010000000;
+      // Set subdevice to 0xffff to make sure we don't merge this switch again.
+      pciSwitch->pci.device |= 0xffff;
      free(subSwIds);
      // Restart, as system->nodes[PCI].nodes has changed.
      s = 0;
@@ -816,6 +821,30 @@ ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index
  return ncclSuccess;
 }

+ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count) {
+  int localNetCount = 0, netCountByBw = 0;
+  int* localNets;
+  float totalNetBw = 0, gpuBw = 0;
+
+  for (int l=0; l<system->nodes[GPU].nodes[gpu].nlinks; l++) {
+    //assuming BW to CPU reflects the GPU bandwidth via P2P or C2C
+    //caveat, this could be wrong if there is a PCIe switch,
+    //and a narrower link to the CPU
+    if (system->nodes[GPU].nodes[gpu].links[l].remNode->type == CPU) {
+       gpuBw = system->nodes[GPU].nodes[gpu].links[l].bw;
+    }
+  }
+
+  NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL));
+  for (int l=0; (l < localNetCount) && (totalNetBw < gpuBw); l++, netCountByBw++) {
+     totalNetBw += system->nodes[GPU].nodes[gpu].paths[NET][localNets[l]].bw;
+  }
+  *count = netCountByBw;
+
+  free(localNets);
+  return ncclSuccess;
+}
+
 ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id) {
  int gpu;
  NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu));
@@ -845,17 +874,25 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch
 }

 ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex) {
+  int netIndex;
+  NCCLCHECK(ncclTopoIdToIndex(system, NET, net, &netIndex));
+  int* localGpus = NULL;
+  int localGpuCount;
+  NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, &localGpus, &localGpuCount, NULL));
  for (int c=0; c<MAXCHANNELS; c++) {
-    for (int g=0; g<system->nodes[GPU].count; g++) {
+    for (int lg=0; lg<localGpuCount; lg++) {
+      int g = localGpus[lg];
      struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
      int id;
      NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id));
      if (net == id) {
        *gpuIndex = g;
+        free(localGpus);
        return ncclSuccess;
      }
    }
  }
+  free(localGpus);
  *gpuIndex = -1;
  return ncclSuccess;
 }
@@ -963,4 +1000,4 @@ ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int*
  if (ccMin) *ccMin = min;
  if (ccMax) *ccMax = max;
  return ncclSuccess;
-}
+}
@@ -17,7 +17,7 @@
 #define SM60_NVLINK_BW 18.0
 #define SM70_NVLINK_BW 20.0
 #define SM80_NVLINK_BW 20.0
-#define SM90_NVLINK_BW 20.0
+#define SM90_NVLINK_BW 20.6
 #define SM86_NVLINK_BW 12.0
 #define PCI_BW 12.0           // PCI Gen3 x16
 #define QPI_BW 6.0
@@ -246,6 +246,17 @@ static float ncclTopoXGMISpeed(const char* gcn) {
  #define ncclGetKernelIndex(p_comm) (0)
 #endif

+// Returns NVLink bw in GB/s
+static float ncclTopoNVLinkBw(int cudaCompCap) {
+  return
+    cudaCompCap >= 90 ? SM90_NVLINK_BW :
+    cudaCompCap == 86 ? SM86_NVLINK_BW :
+    cudaCompCap >= 80 ? SM80_NVLINK_BW :
+    cudaCompCap >= 70 ? SM70_NVLINK_BW :
+    cudaCompCap >= 60 ? SM60_NVLINK_BW :
+    SM80_NVLINK_BW;
+}
+
 // Mirror bits
 static bool isPow2(int val) {
  return (val & (val-1)) == 0;
@@ -325,7 +325,8 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
    getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);
 #endif

-  int nNodes = comm->nNodes;
+  // MNNVL support - treat as a single NVLink connected node
+  int nNodes = comm->MNNVL ? 1 : comm->nNodes;
  int nRanks = comm->nRanks;
  if (nRanks <= 1) return ncclSuccess;

@@ -358,8 +359,8 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
      if (coll == ncclFuncBroadcast && a != NCCL_ALGO_RING) continue;
      if (coll == ncclFuncReduce && a != NCCL_ALGO_RING) continue;
-      if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS) continue;
-      if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS) continue;
+      if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
+      if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;

      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && comm->topo->nodes[GPU].count == comm->topo->nRanks) continue;
@@ -388,20 +389,39 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
        if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0;  // Not used
        if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0;  // Not used
        if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) {
-          // Collnet+Direct requires all GPUs to have a local NIC to work at full speed
-          float factor = ppn / (1.0*graphs[a]->nChannels); // GPU/NIC ratio
-          factor -= (factor-1)/2;
-          busBw /= factor;
+          if (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) {
+            busBw = ppn * bw;
+            // AllGather/ReduceScatter requires 1:1 GPU:NIC
+            int nicPerNode = comm->collNetHeadsUniqueNum;
+            if (coll == ncclFuncAllGather && comm->nNodes > 1) {
+              if (!comm->ncclCollNet || !comm->ncclCollNet->iallgather || ppn > nicPerNode) busBw = 0;
+            }
+            if (coll == ncclFuncReduceScatter && comm->nNodes > 1) {
+              if (!comm->ncclCollNet || !comm->ncclCollNet->ireducescatter || ppn > nicPerNode) busBw = 0;
+            }
+            // Measured corrective ratio needed at 1 ppn and 8ppn. Here we hackishly
+            // interpolate the two.
+            float w = (ppn-1)/(8-1);
+            busBw *= w*0.85 + (1-w)*0.95;
+          } else {
+            // Collnet+Direct requires all GPUs to have a local NIC to work at full speed
+            float factor = ppn / (1.0*graphs[a]->nChannels); // GPU/NIC ratio
+            factor -= (factor-1)/2;
+            busBw /= factor;
+            if (minCompCap >= 90) busBw *= .85;
+          }
        }
 #endif
-        if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE && minCompCap >= 90) busBw *= .85;

        // Convert bus BW to algorithm BW
-        float ratio;
-        if (a == NCCL_ALGO_RING) ratio = (1.0 * nRanks) / nsteps;
-        else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio = 5.0/6.0;
-        else ratio = .5;
-        comm->bandwidths[coll][a][p] = busBw * ratio;
+        if (!(a == NCCL_ALGO_COLLNET_DIRECT && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) {
+          float ratio = 1.0f;
+          if (a == NCCL_ALGO_RING) ratio *= (1.0 * nRanks) / nsteps;
+          else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio *= 5.0/6.0;
+          else ratio *= .5;
+          busBw *= ratio;
+        }
+        comm->bandwidths[coll][a][p] = busBw;
        /* Ring bandwidth backup */
        if (a == NCCL_ALGO_RING)
          comm->ringbdw[coll][p] = comm->bandwidths[coll][NCCL_ALGO_RING][p];
@@ -464,18 +484,19 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
    NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
  }

-  if (comm->nNodes == 1) algoEnable[NCCL_ALGO_NVLS_TREE] = 0;
+  // MNNVL: NVLS not yet supported
+  if (comm->nNodes == 1 || comm->MNNVL) algoEnable[NCCL_ALGO_NVLS_TREE] = 0;

  // Disable CollNet if it is not supported
  if (comm->collNetSupport == 0) {
    algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0;
    algoEnable[NCCL_ALGO_COLLNET_CHAIN] = 0;
-    if (comm->nNodes > 1) algoEnable[NCCL_ALGO_NVLS] = 0;
+    // MNNVL: NVLS not yet supported
+    if (comm->nNodes > 1 || comm->MNNVL) algoEnable[NCCL_ALGO_NVLS] = 0;
    // If user has hard set NCCL_ALGO=COLLNET, ignore it
    if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0 &&
        algoEnable[NCCL_ALGO_NVLS] == 0 && algoEnable[NCCL_ALGO_NVLS_TREE] == 0) {
      algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = 1;
-      if (comm->rank == 0) WARN("CollNet is not supported or fails to initialize, ignoring NCCL_ALGO=COLLNET");
    }
  } else {
    // Disable CollNet+Direct if not on an NVSwitch system
@@ -611,9 +632,9 @@ static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = {
 };

 ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time, bool* backup) {
-  float bw = info->comm->bandwidths[info->coll][algorithm][protocol]; 
+  float bw = info->comm->bandwidths[info->coll][algorithm][protocol];
  float lat = info->comm->latencies[info->coll][algorithm][protocol];
-  
+
  if (backup) {
    *backup = false;
    if (algorithm == NCCL_ALGO_RING && bw == 0.0f) {
@@ -640,7 +661,7 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto
 #else
  if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
  if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels;
-  if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1
+  if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && (!info->comm->MNNVL && info->comm->nNodes > 1)
      && info->coll == ncclFuncAllReduce && info->nBytes/(info->comm->nChannels*info->comm->nRanks) >= 64) {
    lat *= info->comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring
  }
@@ -649,4 +670,4 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto
  int latCount = algorithm == NCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, NCCL_MAX_WORK_ELEMENTS);
  *time = lat * latCount + (info->nBytes) / (1000 * bw);
  return ncclSuccess;
-}
+}
@@ -155,7 +155,7 @@ static ncclResult_t doLaunches(struct ncclComm* head) {
    }

    while (true) { // Iterate rounds of launches for clique.
-      bool moreRounds;
+      bool moreRounds = false;
      comm = cliqueHead;
      do { // Iterate clique members.
        struct ncclComm* next = comm->groupNext;
@@ -163,7 +163,7 @@ static ncclResult_t doLaunches(struct ncclComm* head) {
          // Barrier reduction result tells us if this was the final round.
          moreRounds = 0 != ncclCommIntraBarrierOut(comm);
        } else {
-          moreRounds = comm->unlaunchedPlansHead != nullptr;
+          moreRounds |= comm->unlaunchedPlansHead != nullptr;
        }
        if (moreRounds) {
          // Pop next unlaunched kernel
@@ -248,9 +248,9 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
    // Reset comm->tasks to empty.
    comm->tasks.nTasksColl = 0;
    comm->tasks.nTasksP2p = 0;
+    comm->tasks.workBytesTotal = 0;
    comm->tasks.streams = nullptr;
    ncclIntruQueueConstruct(&comm->tasks.collQueue);
-    comm->tasks.collBytesTotal = 0;
    for (int i = 0; i < comm->nRanks; i++) {
      ncclIntruQueueConstruct(&comm->tasks.peers[i].sendQueue);
      ncclIntruQueueConstruct(&comm->tasks.peers[i].recvQueue);
@@ -334,9 +334,9 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
          assert(state == ncclGroupJobJoined);
        }

-        if (*groupAbortFlag == true || errorJobAbortFlag == true) {
-          *job->abortFlag = 1;
-          if (job->childAbortFlag) *job->childAbortFlag = 1;
+        if (__atomic_load_n(groupAbortFlag, __ATOMIC_RELAXED) || errorJobAbortFlag == true) {
+          __atomic_store_n(job->abortFlag, 1, __ATOMIC_RELAXED);
+          if (job->childAbortFlag) __atomic_store_n(job->childAbortFlag, 1, __ATOMIC_RELAXED);
        }

        job = job->next;
@@ -455,7 +455,7 @@ ncclResult_t ncclGroupJobComplete(struct ncclGroupJob* groupJob) {

 ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob) {
  if (groupJob && groupJob->initialized) {
-    *groupJob->abortFlagPtr = true;
+    __atomic_store_n(groupJob->abortFlagPtr, true, __ATOMIC_RELAXED);
    NCCLCHECK(ncclGroupJobComplete(groupJob));
  }
  return ncclSuccess;
@@ -109,13 +109,14 @@ static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHand
  CUmemAllocationProp prop = {};
  CUmemAccessDesc accessDesc = {};
  CUmemGenericAllocationHandle handle;
+  CUmemAllocationHandleType type = ncclCuMemHandleType;
  int cudaDev;
  int flag = 0;
  CUDACHECK(cudaGetDevice(&cudaDev));
  CUCHECK(cuDeviceGet(&currentDev, cudaDev));
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-  prop.requestedHandleTypes = NCCL_P2P_HANDLE_TYPE; // So it can be exported
+  prop.requestedHandleTypes = type;
  prop.location.id = currentDev;
  // Query device to see if RDMA support is available
  CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev));
@@ -12,5 +12,6 @@

 ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
 ncclResult_t ArgsCheck(struct ncclInfo* info);
+ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname);

 #endif
@@ -19,9 +19,9 @@ static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNet
 static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; }
 static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; }
 static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; }
-static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; }
+static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, size_t size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; }
 /* DMA-BUF support */
-static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* comm, void* collComm, void* data, int size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; }
+static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* comm, void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; }
 static ncclResult_t collNetDeregMr(struct ncclComm* comm, void* collComm, void* mhandle) { NCCLCHECK(comm->ncclCollNet->deregMr(collComm, mhandle)); return ncclSuccess; }
 static ncclResult_t collNetIallreduce(struct ncclComm* comm, void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle,  void** request) {
  NCCLCHECK(comm->ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; }
@@ -53,4 +53,15 @@ inline int ncclTypeSize(ncclDataType_t type) {
  }
 }

+#include <sys/types.h>
+
+#define NCCL_MODE_NORMAL 0
+#define NCCL_MODE_OFFSET 1
+#define NCCL_MODE_PTR    2
+struct ncclConnFifo {
+  int mode;
+  int offset;
+  ssize_t size;
+  void* ptr;
+};
 #endif
@@ -15,6 +15,7 @@
 #include "proxy.h"
 #include "strongstream.h"
 #include "nccl_net.h"
+#include "register.h"

 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #define HIPRT_CB
@@ -59,6 +60,7 @@ struct ncclRecvMem {
    struct {
      uint64_t tail;
      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
+      struct ncclConnFifo connFifo[NCCL_STEPS];
      int sizesFifo[NCCL_STEPS];
      int offsFifo[NCCL_STEPS];
      int flush; // For GDRCopy-based flush
@@ -174,7 +176,6 @@ struct ncclKernelPlan {
  // A kernel plan is also a callback that reclaims itself. Hence this must
  // be the first member.
  struct ncclCommCallback reclaimer;
-  struct ncclMemoryPool memPool_ncclProxyOp; // memory to return to comm in cleanup

  struct ncclComm* comm;
  struct ncclKernelPlan* next;
@@ -205,23 +206,7 @@ struct ncclKernelPlan {
    struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> workQueue;
    struct ncclIntruQueue<struct ncclProxyOp, &ncclProxyOp::enqNext> proxyOpQueue;
  } channels[MAXCHANNELS];
-};
-
-struct ncclRegRequest {
-  uintptr_t buff;
-  size_t size;
-  struct ncclRegRequest *next;
-};
-
-struct ncclRegRecord {
-  uintptr_t buff;
-  size_t size;
-  CUdeviceptr regAddr;
-  size_t regSize;
-  int dev;
-  CUmemGenericAllocationHandle mcHandle;
-  uintptr_t *addrs; /* use to check if NVLS buffers match among intra-node ranks */
-  struct ncclRegRecord *next;
+  size_t maxBytesPerChannel;
 };

 struct ncclComm {
@@ -268,6 +253,7 @@ struct ncclComm {
  int* localRankToRank;
  // localRanks and localRanktoRank for all nodes
  struct ncclNodeRanks* nodeRanks;
+  int MNNVL; // MNNVL: Multi-Node NVLink

  bool checkPointers;
  bool dmaBufSupport;
@@ -276,8 +262,9 @@ struct ncclComm {
  uint64_t opCount;

  // Channels for collectives
-  int nChannels;
-  int nvlsChannels;
+  int nChannels; // connection nChannels
+  int collChannels; // enqueue nChannels
+  int nvlsChannels; // enqueue nChannels
  int collNetChannels;
  // Channels (per peer) for p2p
  int p2pnChannels;
@@ -345,6 +332,9 @@ struct ncclComm {
  int intraHighestTransportType;
  int* collNetHeads;
  int collNetHeadsNum;
+  int collNetHeadsUniqueNum;
+  int* collNetDenseToUserRank;
+  int* collNetUserToDenseRank;
  /* sharable collNet proxy progress resource. */
  struct ncclCollNetSharedRes* collNetSharedRes;

@@ -354,8 +344,6 @@ struct ncclComm {
  /* sharable NVLS resource. */
  struct ncclNvlsSharedRes* nvlsResources;

-  ssize_t channelSize; // User requested work size (bytes) for channel partitions
-
  // pools backed by comm->memPermanent
  struct ncclMemoryPool memPool_ncclProxyOp;
  struct ncclMemoryPool memPool_ncclKernelPlan;
@@ -406,13 +394,10 @@ struct ncclComm {
  // group job to support multi-thread FT
  struct ncclGroupJob *groupJob;

-  /* store to buffer register request */
-  struct ncclIntruQueue<struct ncclRegRequest, &ncclRegRequest::next> regRequestQueue;
-  /* store registered buffer */
-  struct ncclIntruQueue<struct ncclRegRecord, &ncclRegRecord::next> regRecordQueue;
-
  // Tuning plugin
  ncclTuner_t* tuner;
+  // buffer registration cache
+  struct ncclRegCache regCache;
 };

 enum ncclLaunchMode {
@@ -496,4 +481,4 @@ static inline ncclRedOp_t ncclUserRedOpMangle(ncclComm *comm, ncclRedOp_t op) {
 ncclResult_t ncclCommEnsureReady(ncclComm_t comm);
 ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState);

-#endif
+#endif
@@ -16,6 +16,10 @@ extern int ncclCuMemEnable();

 #if CUDART_VERSION >= 11030
 #include <cudaTypedefs.h>
+
+// Handle type used for cuMemCreate()
+extern CUmemAllocationHandleType ncclCuMemHandleType;
+
 #else
 typedef CUresult (CUDAAPI *PFN_cuInit_v2000)(unsigned int Flags);
 typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion_v2020)(int *driverVersion);
@@ -106,8 +106,7 @@ struct ncclConnInfo {
  void **ptrExchange; // Pointer exchange for direct communication
  uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case

-  int *sizesFifo;     // Sizes fifo from GPU to proxy
-  int *offsFifo;      // Buffer fifo from proxy to GPU
+  struct ncclConnFifo* connFifo; // Used for GPU - Proxy communication

  uint64_t step;      // Keep where we are
  uint64_t llLastCleaning;
@@ -167,6 +166,9 @@ struct ncclDirect {
  int nHeads;   // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down
  int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC)
  int shift;    // Shuffling of send/recv for scatter/gather operations, basically localRank%nHeads
+  // The heads[...] are guaranteed to be in rotated order, starting with self:
+  //   headRank, (headRank+1)%nHeads, (headRank+2)%nHeads, ...
+  int heads[NCCL_MAX_DIRECT_ARITY+1];
  int up[NCCL_MAX_DIRECT_ARITY];
  int down[NCCL_MAX_DIRECT_ARITY];
 };
@@ -229,30 +231,32 @@ struct ncclWorkElem {
  union {
    uint8_t flagBits;
    struct {
-      uint8_t isUsed:1, redOpArgIsPtr:1, regUsed:1, nWarps:5;
+      uint8_t isUsed:1, redOpArgIsPtr:1, regUsed:1, oneNode:1;
    };
  };
+  uint8_t nWarps;
  uint8_t direct;
-  uint8_t bid;
-  uint8_t nChannels;
-  struct {
-    uint32_t root:28;
-    uint32_t pad_0:2;
-    uint32_t connIndex:2;
-  };

-  const void * sendbuff;
-  void * recvbuff;
+  uint32_t root:30, connIndex:2;
+  const void *sendbuff;
+  void *recvbuff;

-  size_t count;
-  union {
-    size_t lastChunkSize;
-    // Pivot A2A kernel computes chunk size itself.
-    // Instead, it needs the number of bidirectional rings.
-    size_t pivotA2ANumBiRings;
-  };
+  uint64_t count:39, opCount:25;
  uint64_t redOpArg;
-  uint64_t opCount;
+  uint64_t chunkCount:25, workCount:39;
+  union {
+    struct {
+      uint64_t lastChunkCount:25;
+      uint64_t workOffset:39;
+    };
+    struct {
+      uint32_t nChannels;
+      uint16_t bid;
+      // Pivot A2A kernel computes chunk size itself.
+      // Instead, it needs the number of bidirectional rings.
+      uint16_t pivotA2ANumBiRings;
+    };
+  };
 };

 static_assert((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElem)))/sizeof(ncclWorkElem) == 4, "Sanity check: NCCL_MAX_WORK_ELEMENTS == 4");
@@ -265,15 +269,16 @@ struct ncclWorkElemP2p {
    int32_t proto:2;
  };
  union {
-    uint16_t flagBits;
+    uint16_t flagBit;
    struct {
      enum ncclWorkP2PType p2pType:4;
-      uint16_t nWarps:4;
-      uint16_t warpStart:4;
-      uint16_t ngroups:4;
+      uint8_t nWarps:4;
+      uint8_t warpStart:4;
+      uint8_t ngroups:4;
    };
  };
-  uint16_t opCount;
+  uint8_t reg:1;
+  uint16_t opCount:12;
  // Important not to use any fields with greater than 4-byte alignment since
  // we need sizeof(ncclWorkElemP2p)==28, but that would be padded up to 32 if
  // there were 8-byte fields.
@@ -398,6 +403,8 @@ struct alignas(16) ncclDevChannel {
 struct ncclDevComm {
  int rank;
  int nRanks;
+  int node;
+  int nNodes;
  int buffSizes[NCCL_NUM_PROTOCOLS];
  int p2pChunkSize;

@@ -405,6 +412,8 @@ struct ncclDevComm {
  int workFifoDepth;
  struct ncclWork* workFifoHeap; // may be cudaHost or GDR memory

+  int* collNetDenseToUserRank;
+
  // Flag to ask NCCL kernels to abort
  volatile uint32_t* abortFlag;

@@ -526,56 +535,55 @@ extern int const ncclDevFuncRowToId[];
 // `ncclFuncIndex()` needs to be in sync with 'ALL_COLLS' in Generate.cmake
 inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto) {
  int row = 0;
+  do {
+    // RING / <all_protos> / Sum / int8_t
+    if (coll == ncclFuncAllGather) {
+      row += proto;
+      break;
+    }
+    row += NCCL_NUM_PROTOCOLS;

-  // RING / <all_protos> / Sum / int8_t
-  if (coll == ncclFuncAllGather) {
-    row += proto;
-    goto have_row;
-  }
-  row += NCCL_NUM_PROTOCOLS;
+    // <all_algos> / <all_protos> / <all_redops> / <all_types>
+    if (coll == ncclFuncAllReduce) {
+      row += (((algo * NCCL_NUM_PROTOCOLS + proto) * ncclNumDevRedOps + devRedOp) * ncclNumTypes + type) - NCCL_NUM_FLOATS * (algo * NCCL_NUM_PROTOCOLS + proto);
+      break;
+    }
+    row += (NCCL_NUM_ALGORITHMS - 2) * NCCL_NUM_PROTOCOLS * (ncclNumDevRedOps * ncclNumTypes - NCCL_NUM_FLOATS);

-  // <all_algos> / <all_protos> / <all_redops> / <all_types>
-  if (coll == ncclFuncAllReduce) {
-    row += (((algo * NCCL_NUM_PROTOCOLS + proto) * ncclNumDevRedOps + devRedOp) * ncclNumTypes + type) - /*floats for each SumPostDiv*/ 6 * (algo * NCCL_NUM_PROTOCOLS + proto);
-    goto have_row;
-  }
-  row += (NCCL_NUM_ALGORITHMS - 2) * NCCL_NUM_PROTOCOLS * (ncclNumDevRedOps * ncclNumTypes - /*floats for each SumPostDiv*/ 6);
+    // RING / SIMPLE / Sum / int8_t
+    if (coll == ncclFuncAllToAllPivot) break;
+    row += 1;

-  // RING / SIMPLE / Sum / int8_t
-  if (coll == ncclFuncAllToAllPivot) goto have_row;
-  row += 1;
+    // RING / <all_protos> / Sum / int8_t
+    if (coll == ncclFuncBroadcast) {
+      row += proto;
+      break;
+    }
+    row += NCCL_NUM_PROTOCOLS;

-  // RING / <all_protos> / Sum / int8_t
-  if (coll == ncclFuncBroadcast) {
-    row += proto;
-    goto have_row;
-  }
-  row += NCCL_NUM_PROTOCOLS;
+    // RING / <all_protos> / <all_redops> / <all_types>
+    if (coll == ncclFuncReduce) {
+      row += ((proto * ncclNumDevRedOps + devRedOp) * ncclNumTypes + type) - NCCL_NUM_FLOATS * proto; 
+      break;
+    }
+    row += NCCL_NUM_PROTOCOLS * (ncclNumDevRedOps * ncclNumTypes - NCCL_NUM_FLOATS);

-  // RING / <all_protos> / <all_redops> / <all_types>
-  if (coll == ncclFuncReduce) {
-    row += ((proto * ncclNumDevRedOps + devRedOp) * ncclNumTypes + type) - /*floats for each SumPostDiv*/ 6 * proto; 
-    goto have_row;
-  }
-  row += NCCL_NUM_PROTOCOLS * (ncclNumDevRedOps * ncclNumTypes - /*floats for each SumPostDiv*/ 6);
+    // RING / <all_protos> / <all_redops> / <all_types>
+    if (coll == ncclFuncReduceScatter) {
+      row += ((proto * ncclNumDevRedOps + devRedOp) * ncclNumTypes + type) - NCCL_NUM_FLOATS * proto;
+      break;
+    }
+    row += NCCL_NUM_PROTOCOLS * (ncclNumDevRedOps * ncclNumTypes - NCCL_NUM_FLOATS);

-  // RING / <all_protos> / <all_redops> / <all_types>
-  if (coll == ncclFuncReduceScatter) {
-    row += ((proto * ncclNumDevRedOps + devRedOp) * ncclNumTypes + type) - /*floats for each SumPostDiv*/ 6 * proto;
-    goto have_row;
-  }
-  row += NCCL_NUM_PROTOCOLS * (ncclNumDevRedOps * ncclNumTypes - /*floats for each SumPostDiv*/ 6);
+    // RING / SIMPLE / Sum / int8_t
+    if (coll == ncclFuncSendRecv) break;
+    row += 1;

-  // RING / SIMPLE / Sum / int8_t
-  if (coll == ncclFuncSendRecv) goto have_row;
-  row += 1;
+  } while (false);

-have_row:
  return ncclDevFuncRowToId[row];
 }

-inline int ncclDevFuncId_P2p() { return ncclDevFuncRowToId[FUNC_INDEX_P2P]; }
+inline int ncclDevFuncId_P2p() { return ncclDevFuncRowToId[FUNC_INDEX_TOTAL - NCCL_NUM_ONERANK - 1]; }

-inline int ncclDevFuncId_AllToAllPivot() { return ncclDevFuncRowToId[FUNC_INDEX_ALLTOALL_PIVOT]; }
-
-#endif
+#endif
@@ -12,8 +12,10 @@
 #include "collectives.h"
 #include "utils.h"

-#define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64)
-#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */
+#define NCCL_LL_ALIGNMENT_PER_THREAD sizeof(uint64_t)
+#define NCCL_LL128_ALIGNMENT_PER_WARP 480
+#define NCCL_SIMPLE_ALIGNMENT (WARP_SIZE * 8LL * 16LL)
+#define NCCL_BYTES_ALIGNMENT 16

 ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize);
 ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
@@ -34,6 +34,7 @@ int ncclTopoPathAllNVLink(struct ncclTopoSystem* system);
 // Query topology
 ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank);
 ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
+ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret);
 ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
 #define MAX_XGMI_INTER_GPUS 4
 ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int type, int* dev);
@@ -59,10 +60,11 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
 #define NCCL_TOPO_CPU_TYPE_YONGFENG 1
 ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
 ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count);
-ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
+ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
 ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
 ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id);
 ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex);
+ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count);

 #define NCCL_TOPO_MAX_NODES 256

@@ -111,6 +113,7 @@ struct ncclTopoRanks {
  int treeToChild0[MAXCHANNELS];
  int treeToChild1[MAXCHANNELS];
  int nvlsHeads[MAXCHANNELS];
+  int nvlsHeadNum;
 };

 ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks);
@@ -14,6 +14,7 @@
 #include "core.h"
 #include "utils.h"
 #include "strongstream.h"
+#define NCCL_MAX_LOCAL_RANKS 64

 typedef enum : uint8_t {
  ncclPatternRing,
@@ -31,6 +32,13 @@ typedef enum : uint8_t {
  ncclPatternRecv
 } ncclPattern_t;

+enum ncclRegBufferType {
+  NCCL_REGULAR_BUFFER = 0,
+  NCCL_IPC_REG_BUFFER = 1,
+  NCCL_NVLS_REG_BUFFER = 2,
+  NCCL_REG_BUFFER_NUM = 3
+};
+
 // Used to pass NCCL call information between functions
 struct ncclInfo {
  ncclFunc_t coll;
@@ -49,37 +57,46 @@ struct ncclInfo {
  int sliceSteps;
  // Computed later
  ncclDevRedOpFull opFull;
-  int algorithm;
-  int protocol;
  ncclPattern_t pattern;
-  int nChannels;
-  int nThreads;
  size_t nBytes;
+  size_t aggnBytes;
+  size_t workBytes;
  size_t sendbuffSize;
  size_t recvbuffSize;
-  int nstepsPerLoop;
-  int nchunksPerLoop;
+  int stepSize;
+  int chunkCount;
  int chunkSize;
  int channelId;
+  int workFuncIndex;
+  ncclRegBufferType regBufType;
+  void* regBufSend[NCCL_MAX_LOCAL_RANKS];
+  void* regBufRecv[NCCL_MAX_LOCAL_RANKS];
+  // Need to initialize
+  int nThreads;
+  int nChannels;
+  int algorithm;
+  int protocol;
+  bool userTuned;
+  struct ncclInfo *next;
 };

 inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) {
-  info->nBytes = info->count * ncclTypeSize(info->datatype);
+  info->nBytes = info->workBytes = info->count * ncclTypeSize(info->datatype);
  if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast || info->coll == ncclFuncAllToAllPivot) {
-    info->count = info->nBytes;
+    info->count = info->workBytes;
    info->datatype = ncclInt8;
  }
  if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= nRanks; // count is per rank

  /* compute buffer size for NVLS buffer registration */
  if (info->coll == ncclFuncAllGather) {
-    info->sendbuffSize = info->count * ncclTypeSize(info->datatype);
+    info->sendbuffSize = info->workBytes;
    info->recvbuffSize = info->sendbuffSize * nRanks;
  } else if (info->coll == ncclFuncReduceScatter) {
-    info->recvbuffSize = info->count * ncclTypeSize(info->datatype);
+    info->recvbuffSize = info->workBytes;
    info->sendbuffSize = info->recvbuffSize * nRanks;
  } else {
-    info->sendbuffSize = info->recvbuffSize = info->count * ncclTypeSize(info->datatype);
+    info->sendbuffSize = info->recvbuffSize = info->workBytes;
  }
  return ncclSuccess;
 }
@@ -94,6 +111,7 @@ struct ncclTaskColl {
  ncclDataType_t datatype;
  ncclDevRedOpFull op;
  int chunkSteps, sliceSteps;
+  struct ncclInfo info;
 };
 struct ncclTaskP2p {
  ncclTaskP2p *next;
@@ -114,8 +132,16 @@ struct ncclTasks {
    struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> sendQueue;
    struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> recvQueue;
  };
-  struct ncclIntruQueue<ncclTaskColl, &ncclTaskColl::next> collQueue;
-  size_t collBytesTotal;
+  struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collQueue;
+  // Queue for user-tuned executed collectives
+  struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collTunedQueue;
+  // Queue for continuous bytes distribution (CBD) collectives
+  struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collCBDQueue;
+  // Queue for collnet
+  struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collnetQueue;
+  size_t workBytesTotal;
+  int usableChannels;
+  bool sorted;
  struct Peer* peers/*[nRanks]*/;
  int *p2pSendOrder, *p2pRecvOrder;
  int p2pOrderSteps;
@@ -134,4 +160,4 @@ struct ncclTasks {
  struct ncclCudaGraph capturingGraph;
 };

-#endif
+#endif
@@ -35,4 +35,7 @@ ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd);
 ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd);
 ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash);

+ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, const int sendFd, int rank, uint64_t hash);
+ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, int *recvFd);
+
 #endif /* NCCL_IPCSOCKET_H */
@@ -12,12 +12,22 @@ typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCC

 typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

-#define NCCL_NUM_FUNCTIONS 5 // Send/Recv and AllToAllPivot not included for now
-typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv,  ncclFuncAllToAllPivot, ncclNumFuncs} ncclFunc_t;
+#define NCCL_NUM_ONERANK 12
+#define FUNC_INDEX_TOTAL (980 + NCCL_NUM_ONERANK) // parenthesized so the sum stays intact inside larger expressions

-#define FUNC_INDEX_P2P 979
-#define FUNC_INDEX_ALLTOALL_PIVOT 651
-#define FUNC_INDEX_TOTAL 992
+#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
+typedef enum {
+  ncclFuncBroadcast = 0,
+  ncclFuncReduce = 1,
+  ncclFuncAllGather = 2,
+  ncclFuncReduceScatter = 3,
+  ncclFuncAllReduce = 4,
+  ncclFuncSendRecv = 5,
+  ncclFuncSend = 6,
+  ncclFuncRecv = 7,
+  ncclFuncAllToAllPivot = 8,
+  ncclNumFuncs = 9
+} ncclFunc_t;

 #define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
 #define NCCL_ALGO_UNDEF -1
@@ -34,4 +44,6 @@ typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncRed
 #define NCCL_PROTO_LL128 1
 #define NCCL_PROTO_SIMPLE 2

-#endif
+#define NCCL_NUM_FLOATS 6 // floats subtracted for each SumPostDiv in ncclDevFuncId: half/float/double/rccl_bfloat16; NOTE(review): only four types named for a count of 6 - confirm the other two
+
+#endif
@@ -21,6 +21,140 @@
 // Maximum number of requests per comm object
 #define NCCL_NET_MAX_REQUESTS 32

+typedef struct {
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int regIsGlobal;                 // regMr is not tied to a particular comm
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+} ncclNetProperties_v8_t;
+
+typedef ncclNetProperties_v8_t ncclNetProperties_t;
+
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+} ncclNet_v8_t;
+
+typedef ncclNet_v8_t ncclNet_t;
+
+#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v8
+
+typedef struct {
+  void* mhandle;
+  void* address;
+  uint32_t size;
+} ncclNetSGE_v8_t;
+
+typedef struct {
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. handles have been created
+  // using listen() above. rank indicates caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Returns whether a reduction operation on a data type is supported.
+  // 1 for supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Performs an asynchronous allreduce operation on the collective group.
+  // May return request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+  ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts,
+                             size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+                             void* sendMhandle, void** request);
+  ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData,
+                                 size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+                                 ncclDataType_t dataType, ncclRedOp_t redOp,
+                                 void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free collective comm objects
+  ncclResult_t (*closeColl)(void* collComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclCollNet_v8_t;
+
+typedef ncclCollNet_v8_t ncclCollNet_t;
+
+#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v8
+
 typedef struct {
  char* name;                      // Used mostly for logging.
  char* pciPath;                   // Path to the PCI device in /sys.
@@ -36,8 +170,6 @@ typedef struct {
  int netDeviceVersion;            // Version number for network offload
 } ncclNetProperties_v7_t;

-typedef ncclNetProperties_v7_t ncclNetProperties_t;
-
 typedef struct {
  // Name of the network (mainly for logs)
  const char* name;
@@ -93,11 +225,45 @@ typedef struct {
  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
 } ncclNet_v7_t;

-typedef ncclNet_v7_t ncclNet_t;
-
-#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v7
-
-#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v7
+typedef struct {
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. handles have been created
+  // using listen() above. rank indicates caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Returns whether a reduction operation on a data type is supported.
+  // 1 for supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Performs an asynchronous allreduce operation on the collective group.
+  // May return request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free collective comm objects
+  ncclResult_t (*closeColl)(void* collComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclCollNet_v7_t;

 #define NCCL_NET_MAX_REQUESTS_V6 8

@@ -162,49 +328,6 @@ typedef struct {
  ncclResult_t (*closeListen)(void* listenComm);
 } ncclNet_v6_t;

-typedef struct {
-  // Name of the collective network (mainly for logs)
-  const char* name;
-  // Initialize the collective network.
-  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
-  // Return the number of adapters capable of doing collective operations.
-  // If ndev returns 0, all other functions might be set to NULL.
-  ncclResult_t (*devices)(int* ndev);
-  // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
-  // Create a receiving object and provide a handle to connect to it. The
-  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
-  // between ranks to create connections.
-  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
-  // Create a group for collective operations. handles have been created
-  // using listen() above. rank indicates caller's rank in the collective network.
-  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
-  // Returns whether a reduction operation on a data type is supported.
-  // 1 for supported, 0 otherwise.
-  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
-  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
-  ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
-  /* DMA-BUF support */
-  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
-  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
-  // Performs an asynchronous allreduce operation on the collective group.
-  // May return request == NULL if the call cannot be performed (or would block).
-  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
-      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
-  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
-  // visible to the GPU
-  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
-  // Test whether a request is complete. If size is not NULL, it returns the
-  // number of bytes sent/received.
-  ncclResult_t (*test)(void* request, int* done, int* size);
-  // Close and free collective comm objects
-  ncclResult_t (*closeColl)(void* collComm);
-  ncclResult_t (*closeListen)(void* listenComm);
-} ncclCollNet_v7_t;
-
-typedef ncclCollNet_v7_t ncclCollNet_t;
-
-// v6 struct for backwards compatibility
 typedef struct {
  // Name of the collective network (mainly for logs)
  const char* name;
@@ -24,6 +24,7 @@ typedef struct {
  int needsProxyProgress;
 } ncclNetDeviceHandle_v7_t;

-typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t;
+typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
+typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_t;

 #endif
@@ -20,6 +20,12 @@
 // Dynamically handle dependencies on NVML

 /* Extracted from nvml.h */
+
+#define NVML_API_VERSION            12
+
+#define NVML_STRUCT_VERSION(data, ver) (unsigned int)(sizeof(nvml ## data ## _v ## ver ## _t) | \
+                                                      (ver << 24U))
+
 typedef struct nvmlDevice_st* nvmlDevice_t;
 #define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE   16

@@ -181,6 +187,72 @@ typedef struct nvmlFieldValue_st
    nvmlValue_t value;          //!< Value for this field. This is only valid if nvmlReturn == NVML_SUCCESS
 } nvmlFieldValue_t;

+
+#define NVML_GPU_FABRIC_UUID_LEN 16
+
+#define NVML_GPU_FABRIC_STATE_NOT_SUPPORTED 0
+#define NVML_GPU_FABRIC_STATE_NOT_STARTED   1
+#define NVML_GPU_FABRIC_STATE_IN_PROGRESS   2
+#define NVML_GPU_FABRIC_STATE_COMPLETED     3
+
+typedef unsigned char nvmlGpuFabricState_t;
+
+typedef struct {
+    unsigned char        clusterUuid[NVML_GPU_FABRIC_UUID_LEN]; //!< Uuid of the cluster to which this GPU belongs
+    nvmlReturn_t         status;                                //!< Error status, if any. Must be checked only if state returns "complete".
+    unsigned int         cliqueId;                              //!< ID of the fabric clique to which this GPU belongs
+    nvmlGpuFabricState_t state;                                 //!< Current state of GPU registration process
+} nvmlGpuFabricInfo_t;
+
+#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_NOT_SUPPORTED 0
+#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_TRUE          1
+#define NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_FALSE         2
+
+#define NVML_GPU_FABRIC_HEALTH_MASK_SHIFT_DEGRADED_BW 0
+#define NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_DEGRADED_BW 0x11
+
+/**
+ * GPU Fabric Health Status Mask for various fields can be obtained
+ * using the below macro.
+ * Ex - NVML_GPU_FABRIC_HEALTH_GET(var, _DEGRADED_BW)
+ */
+#define NVML_GPU_FABRIC_HEALTH_GET(var, type)             \
+    (((var) >> NVML_GPU_FABRIC_HEALTH_MASK_SHIFT##type) & \
+     (NVML_GPU_FABRIC_HEALTH_MASK_WIDTH##type))
+
+/**
+ * GPU Fabric Health Status Mask for various fields can be tested
+ * using the below macro.
+ * Ex - NVML_GPU_FABRIC_HEALTH_TEST(var, _DEGRADED_BW, _TRUE)
+ */
+#define NVML_GPU_FABRIC_HEALTH_TEST(var, type, val) \
+    (NVML_GPU_FABRIC_HEALTH_GET(var, type) ==       \
+     NVML_GPU_FABRIC_HEALTH_MASK##type##val)
+
+/**
+* GPU Fabric information (v2).
+*
+* Version 2 adds the \ref nvmlGpuFabricInfo_v2_t.version field
+* to the start of the structure, and the \ref nvmlGpuFabricInfo_v2_t.healthMask
+* field to the end. This structure is not backwards-compatible with
+* \ref nvmlGpuFabricInfo_t.
+*/
+typedef struct {
+    unsigned int         version;                               //!< Structure version identifier (set to \ref nvmlGpuFabricInfo_v2)
+    unsigned char        clusterUuid[NVML_GPU_FABRIC_UUID_LEN]; //!< Uuid of the cluster to which this GPU belongs
+    nvmlReturn_t         status;                                //!< Error status, if any. Must be checked only if state returns "complete".
+    unsigned int         cliqueId;                              //!< ID of the fabric clique to which this GPU belongs
+    nvmlGpuFabricState_t state;                                 //!< Current state of GPU registration process
+    unsigned int         healthMask;                            //!< GPU Fabric health Status Mask
+} nvmlGpuFabricInfo_v2_t;
+
+typedef nvmlGpuFabricInfo_v2_t nvmlGpuFabricInfoV_t;
+
+/**
+* Version identifier value for \ref nvmlGpuFabricInfo_v2_t.version.
+*/
+#define nvmlGpuFabricInfo_v2 NVML_STRUCT_VERSION(GpuFabricInfo, 2)
+
 /* End of nvml.h */
 #endif // NCCL_NVML_DIRECT

@@ -210,5 +282,6 @@ ncclResult_t ncclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int
 ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
 ncclResult_t ncclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus);
 ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values);
+ncclResult_t ncclNvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo);

 #endif // End include guard
@@ -9,10 +9,22 @@
 #ifndef NCCL_P2P_H_
 #define NCCL_P2P_H_

-#define NCCL_P2P_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
+#include <cuda.h>

-typedef struct {
+#if CUDART_VERSION < 12030
+// MNNVL: FABRIC handle support lifted from CUDA 12.3
+#define CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED ((CUdevice_attribute)128)
+#define CU_MEM_HANDLE_TYPE_FABRIC ((CUmemAllocationHandleType)0x8ULL)
+#define CU_IPC_HANDLE_SIZE 64
+typedef struct CUmemFabricHandle_st {
+    unsigned char data[CU_IPC_HANDLE_SIZE];
+} CUmemFabricHandle_v1;
+typedef CUmemFabricHandle_v1 CUmemFabricHandle;
+#endif
+
+typedef union {
  uint64_t data; // Needs to hold a CUmemGenericAllocationHandle for UDS fd support
+  CUmemFabricHandle handle;
 } ncclCuDesc;

 typedef union {
@@ -27,36 +27,43 @@ typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyState*, struct ncclP
 #define NCCL_PROXY_MAX_SUBS MAXCHANNELS
 static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements");

+union ncclProxyOpSpecifics {  // extra per-operation fields, embedded as `specifics` in ncclProxyOp and ncclProxyArgs
+  struct {
+    size_t sizePerRank;
+    int nNodes, node;
+  } collnetDirect;  // the only variant declared in this union
+};
+
 struct ncclProxyOp {
  struct ncclProxyConnection* connection;
-  int channelId;
-  int nsteps;
+  void* buffer;
  ssize_t nbytes;
-  struct {
-    int root:30;
-    uint32_t connIndex:2;
-  };
-  int next;
-
  uint64_t opCount;
-  int sliceSteps;
-  int chunkSteps;
+  int root:30;
+  uint32_t connIndex:2;
+  int next;
+  int nsteps;
  int chunkSize;
+  uint8_t sliceSteps;
+  uint8_t chunkSteps;
+  uint8_t channelId;
  uint8_t /*ncclDataType_t*/ dtype;
  uint8_t /*ncclDevRedOp_t*/ redOp;
+  uint8_t /*ncclFunc_t*/ coll;
  uint8_t /*ncclPattern_t*/ pattern;
  uint8_t protocol;
+  uint8_t reg;

-  union {
-    uint64_t unused;
-    // For use by enqueue.cc
-    struct ncclProxyOp *enqNext;
-  };
+  union ncclProxyOpSpecifics specifics;
+
+  struct ncclProxyOp *enqNext;
 };
-static_assert(sizeof(struct ncclProxyOp) == 64, "Keep ProxyOp aligned with cache lines for effective prefetch");

 struct ncclProxySubArgs {
  struct ncclProxyConnection* connection;
+  int reg;
+  void* buffer;
+  void* mhandle;
  int channelId;
  int nsteps;
  ssize_t nbytes;
@@ -93,6 +100,7 @@ struct ncclProxyArgs {
  uint8_t /*ncclDataType_t*/ dtype;
  uint8_t /*ncclDevRedOp_t*/ redOp;
  uint8_t /*ncclPattern_t*/ pattern;
+  uint8_t /*ncclFunc_t*/ coll;
  uint8_t protocol;
  int state;
  char* sharedBuff[NCCL_STEPS];
@@ -105,6 +113,8 @@ struct ncclProxyArgs {
  struct ncclProxyArgs* next;
  struct ncclProxyArgs* nextPeer;
  struct ncclProxyArgs** proxyAppendPtr;
+
+  union ncclProxyOpSpecifics specifics;
 };
 #define NCCL_MAX_NETDEVS 128

@@ -112,7 +122,7 @@ struct ncclProxyArgs {
 // Make sure we have enough to store two full rounds of operations on all channels.
 // Otherwise we'd be unable to post half of them to free new elements.
 #define MAX_OPS_PER_PEER (2*MAXCHANNELS*NCCL_MAX_WORK_ELEMENTS_P2P)
-#define NCCL_MAX_LOCAL_RANKS 64
+
 struct ncclProxyOpsPool {
  struct ncclProxyOp ops[MAX_OPS_PER_PEER*NCCL_MAX_LOCAL_RANKS];
  volatile int nextOps;
@@ -205,6 +215,16 @@ struct ncclProxyRpcResponseHeader {
  int respSize;
 };

+// UDS support
+struct ncclIpcHdr {
+  int type;
+  int rank;
+  int reqSize;
+  int respSize;
+  void *opId;
+  uint64_t data[16]; // 16 * sizeof(uint64_t) = 128 bytes
+};
+
 struct ncclProxyState {
  int refCount;
  int tpRank;
@@ -220,9 +240,11 @@ struct ncclProxyState {
  ncclNet_t* ncclNet;
  ncclCollNet_t* ncclCollNet;
  volatile uint32_t* abortFlag;
-  // Service thread
+  // Service threads
  pthread_t thread;
+  pthread_t threadUDS;
  struct ncclSocket* listenSock;
+  struct ncclIpcSocket ipcSock;
  int stop;
  CUcontext cudaCtx;
  ncclResult_t asyncResult;
@@ -233,6 +255,7 @@ struct ncclProxyState {
  struct ncclProxyOps* proxyOps;
  void** sharedDevMems;
  struct ncclIpcSocket peerIpcSock; // cuMEM API support (UDS)
+  uint64_t *peerAddressesUDS; // cuMem API support (UDS)

  // Progress thread
  struct ncclProxyProgressState progressState;
@@ -274,9 +297,9 @@ enum proxyMode {
 };

 ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* proxyOp, bool *justInquire);
-ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp);
+ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp, int reg);
 ncclResult_t ncclProxyStart(struct ncclComm* comm);
-ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses);
+ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses, uint64_t *peerAddressesUDS);
 ncclResult_t ncclProxyCreate(struct ncclComm* comm);
 ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn);
 enum ncclProxyMsgType {
@@ -300,11 +323,12 @@ ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector
 ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
 ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId);

-ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void *handle, int* convertedFd);
+// UDS support
+ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int rank, void *handle, int* convertedFd);

 ncclResult_t ncclProxyStop(struct ncclComm* comm);
 ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm);
 ncclResult_t ncclProxyDestroy(struct ncclComm* comm);

 ncclResult_t mscclSaveProxy(struct ncclComm* comm, struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex);
-#endif
+#endif
@@ -0,0 +1,42 @@
+#ifndef NCCL_REGISTER_H_
+#define NCCL_REGISTER_H_
+
+enum {  // single-bit values (0x01..0x08), so they can be OR-ed together; presumably kept in ncclReg::state — TODO confirm
+  NET_REG_COMPLETE = 0x01,
+  NVLS_REG_COMPLETE = 0x02,
+  NVLS_REG_POSSIBLE = 0x04,
+  NVLS_REG_NO_SUPPORT = 0x08
+};
+
+struct ncclReg {  // entry type handed back by ncclRegFind(); fields are grouped as common / net / nvls below
+  // common attributes
+  size_t pages;
+  int refs;
+  uintptr_t addr;
+  uint32_t state;
+  // net reg
+  int nDevs;
+  int devs[MAXCHANNELS];
+  void** handles;
+  // nvls reg
+  uintptr_t baseAddr;
+  size_t baseSize;
+  CUdeviceptr regAddr;
+  size_t regSize;
+  int dev;
+  CUmemGenericAllocationHandle mcHandle;
+  uintptr_t caddrs[NCCL_MAX_LOCAL_RANKS]; /* used to check if NVLS buffers match among intra-node ranks */
+};
+
+struct ncclRegCache {
+  struct ncclReg **slots;     // presumably an array of pointers to ncclReg entries — TODO confirm
+  int capacity, population;   // presumably allocated vs. used length of slots — TODO confirm
+  uintptr_t pageSize;         // NOTE(review): commAlloc() stores sysconf(_SC_PAGESIZE) in comm->regCache.pageSize
+  void* sComms[MAXCHANNELS];  // one opaque pointer per channel; looks like send-side comms — TODO confirm
+  void* rComms[MAXCHANNELS];  // one opaque pointer per channel; looks like recv-side comms — TODO confirm
+};
+
+ncclResult_t ncclRegCleanup(struct ncclComm* comm);
+ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg);
+
+#endif
@@ -18,6 +18,7 @@ struct ncclShmemCollBuff {
  volatile size_t *cnt[2];
  volatile void *ptr[2];
  int round;
+  size_t maxTypeSize;
 };

 ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize);
@@ -44,6 +44,8 @@ struct ncclPeerInfo {
  int64_t busId;
  struct ncclComm* comm;
  int cudaCompCap;
+  // MNNVL support
+  nvmlGpuFabricInfoV_t fabricInfo;
 };

 #define CONNECT_SIZE 128
@@ -30,6 +30,11 @@ uint64_t getHostHash();
 uint64_t getPidHash();
 ncclResult_t getRandomData(void* buffer, size_t bytes);

+const char* ncclOpToString(ncclRedOp_t op);
+const char* ncclDatatypeToString(ncclDataType_t type);
+const char* ncclAlgoToString(int algo);
+const char* ncclProtoToString(int proto);
+
 struct netIf {
  char prefix[64];
  int port;
@@ -394,6 +399,36 @@ void ncclIntruQueueFreeAll(ncclIntruQueue<T,next> *me, ncclMemoryPool *pool) {
  }
 }

+/* Sorted insert of x into the queue. Elements cur with cmp(cur, x) > 0 stay ahead of x; x is linked in
+ * before the first element for which cmp(cur, x) <= 0 (ties included), or at the tail if there is none. */
+template<typename T, T *T::*next>
+inline void ncclIntruQueueSortEnqueue(ncclIntruQueue<T,next> *me, T *x, int (*cmp)(T *a, T *b)) {
+  T *cur = me->head;
+  T *prev = NULL;
+
+  if (cur == NULL) {  // empty queue: x becomes both head and tail
+    x->*next = nullptr;
+    me->tail = me->head = x;
+  } else {
+    while (cur) {
+      if (cmp(cur, x) > 0) {  // cur sorts ahead of x: keep walking
+        prev = cur;
+        cur = cur->*next;  // follow the link through the member pointer, not a field literally named "next"
+      } else {
+        break;
+      }
+    }
+
+    x->*next = cur;  // cur is the first element that must follow x (NULL when x goes last)
+    if (prev) {
+      prev->*next = x;
+      if (cur == NULL) me->tail = x;  // appended after the previous last element
+    } else {
+      me->head = x;  // x sorts ahead of every queued element
+    }
+  }
+}
+
 ////////////////////////////////////////////////////////////////////////////////

 constexpr ncclThreadSignal ncclThreadSignalStaticInitializer() {
@@ -329,6 +329,10 @@ static ncclResult_t commFree(ncclComm_t comm) {
   * resource cleanup in commFree(). */
  if (comm->proxyState && comm->proxyRefCountOld == 0 && comm->proxyState->thread) {
    pthread_join(comm->proxyState->thread, nullptr);
+    if (comm->proxyState->threadUDS) {
+      // UDS support
+      pthread_join(comm->proxyState->threadUDS, nullptr);;
+    }
  }

  delete[] comm->userRedOps;
@@ -417,17 +421,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
  free(comm->topParentRanks);
  free(comm->topParentLocalRanks);

-  while (!ncclIntruQueueEmpty(&comm->regRecordQueue)) {
-    struct ncclRegRecord* rec = ncclIntruQueueDequeue(&comm->regRecordQueue);
-    NCCLCHECK(ncclNvlsDeregBuffer(&rec->mcHandle, rec->regAddr, rec->dev, rec->regSize));
-    free(rec->addrs);
-    free(rec);
-  }
-
-  while (!ncclIntruQueueEmpty(&comm->regRequestQueue)) {
-    struct ncclRegRequest* req = ncclIntruQueueDequeue(&comm->regRequestQueue);
-    free(req);
-  }
+  NCCLCHECK(ncclRegCleanup(comm));

  commPoison(comm); // poison comm before free to avoid comm reuse.
  free(comm);
@@ -472,7 +466,7 @@ ncclResult_t ncclCommEnsureReady(ncclComm_t comm) {
  /* comm must be ready, or error will be reported */
  ncclResult_t ret = ncclSuccess;

-  if (*comm->abortFlag) {
+  if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED)) {
    ncclGroupJobAbort(comm->groupJob);
  } else {
    NCCLCHECK(ncclCommGetAsyncError(comm, &ret));
@@ -555,7 +549,6 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in

  comm->groupNext = reinterpret_cast<struct ncclComm*>(0x1);
  comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
-  comm->channelSize = ncclParamAggChannelSize();

  static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
  static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
@@ -588,9 +581,9 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
      comm->topParentRanks[i] = i;
  }

-  ncclIntruQueueConstruct(&comm->regRequestQueue);
-  ncclIntruQueueConstruct(&comm->regRecordQueue);
  ncclIntruQueueMpscConstruct(&comm->callbackQueue);
+
+  comm->regCache.pageSize = sysconf(_SC_PAGESIZE);
  return ncclSuccess;
 }

@@ -606,6 +599,8 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
  comm->devComm = &devCommAndChans->comm;
  tmpCommAndChans.comm.rank = comm->rank;
  tmpCommAndChans.comm.nRanks = nRanks;
+  tmpCommAndChans.comm.node = comm->node;
+  tmpCommAndChans.comm.nNodes = comm->nNodes;
  tmpCommAndChans.comm.abortFlag = comm->abortFlag;
  for (int p=0; p < NCCL_NUM_PROTOCOLS; p++) {
    tmpCommAndChans.comm.buffSizes[p] = comm->buffSizes[p];
@@ -638,6 +633,12 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
  comm->workFifoSent = 0;
  comm->workFifoAckdMin = 0;

+  if (comm->collNetDenseToUserRank != nullptr) {
+    NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.collNetDenseToUserRank, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail);
+    ncclCommPushCudaFree(comm, tmpCommAndChans.comm.collNetDenseToUserRank);
+    NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.collNetDenseToUserRank, comm->collNetDenseToUserRank, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail);
+  }
+
  for (int c=0; c < MAXCHANNELS; c++) {
    tmpCommAndChans.channels[c].peers = comm->channels[c].devPeers;
    tmpCommAndChans.channels[c].ring = comm->channels[c].ring;
@@ -731,6 +732,26 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u

  info->comm = comm;
  info->cudaCompCap = comm->minCompCap = comm->maxCompCap = comm->compCap;
+
+#if !defined(__HIP_PLATFORM_HCC__) && !defined(__HCC__) && !defined(__HIPCC__)
+  // MNNVL support
+  {
+    // MNNVL: Request the fabric UUID and partition info
+    char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+    nvmlDevice_t nvmlDev;
+    NCCLCHECK(int64ToBusId(info->busId, busId));
+    NCCLCHECK(ncclNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev));
+    info->fabricInfo.state = NVML_GPU_FABRIC_STATE_NOT_SUPPORTED;
+    (void) ncclNvmlDeviceGetGpuFabricInfoV(nvmlDev, &info->fabricInfo);
+    if (info->fabricInfo.state != NVML_GPU_FABRIC_STATE_NOT_SUPPORTED) {
+      INFO(NCCL_INIT, "MNNVL busId 0x%lx fabric UUID %lx.%lx cliqueId 0x%x state %d healthMask 0x%x",
+           info->busId,
+           ((long *)&info->fabricInfo.clusterUuid)[0], ((long *)&info->fabricInfo.clusterUuid)[1],
+           info->fabricInfo.cliqueId, info->fabricInfo.state, info->fabricInfo.healthMask);
+    }
+  }
+#endif
+
  return ncclSuccess;
 }

@@ -774,8 +795,9 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
    comm->buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p];
  }

-  if (comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize();
-  else if (ncclTopoPathAllNVLink(comm->topo)) comm->p2pChunkSize = ncclParamP2pNvlChunkSize();
+  // MNNVL support
+  if (!comm->MNNVL && comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize();
+  else if (comm->MNNVL || ncclTopoPathAllNVLink(comm->topo)) comm->p2pChunkSize = ncclParamP2pNvlChunkSize();
  else comm->p2pChunkSize = ncclParamP2pPciChunkSize();

  // Make sure P2P chunksize is not larger than coll chunksize.
@@ -805,6 +827,8 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n
  int highestTypes[NCCL_MAX_LOCAL_RANKS] = { TRANSPORT_P2P };
  // Find all head ranks
  int nHeads = collNetGraph->nChannels;
+  int nHeadsUnique = 0;
+  int headsUnique[NCCL_MAX_LOCAL_RANKS];
  int highestTransportType0, highestTransportType1;
  char line[1024];
  bool share;
@@ -816,13 +840,20 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n
  struct collnetShareInfo* infos = NULL;

  NCCLCHECKGOTO(ncclCalloc(&heads, nHeads), ret, fail);
-  // Head GPU index is always 0
-  for (int c = 0; c < nHeads; c++) {
-    heads[c] = collNetGraph->intra[c * comm->localRanks + 0];
+  { uint64_t mask = 0;
+    // Head GPU index is always 0
+    for (int c = 0; c < nHeads; c++) {
+      heads[c] = collNetGraph->intra[c * comm->localRanks + 0];
+      assert(comm->rankToNode[heads[c]] == comm->node);
+      uint64_t mask0 = mask;
+      mask |= 1ull<<comm->rankToLocalRank[heads[c]];
+      if (mask != mask0) headsUnique[nHeadsUnique++] = heads[c];
+    }
  }

  comm->collNetHeads = heads;
  comm->collNetHeadsNum = nHeads;
+  comm->collNetHeadsUniqueNum = nHeadsUnique;
  if (parent && parent->collNetSupport && parent->config.splitShare && parent->nNodes == comm->nNodes) {
    NCCLCHECKGOTO(ncclCalloc(&infos, comm->nRanks), ret, fail);
    /* check whether child can share collnet resources of parent. Since parent builds each collnet communicator
@@ -883,6 +914,26 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n
    NCCLCHECK(ncclCalloc(&comm->collNetSharedRes, 1));
    comm->collNetChannels = comm->collNetSharedRes->nChannels = comm->nChannels;
    comm->collNetSharedRes->buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE];
+
+    comm->collNetDenseToUserRank = ncclMemoryStackAlloc<int>(&comm->memPermanent, comm->nRanks);
+    comm->collNetUserToDenseRank = ncclMemoryStackAlloc<int>(&comm->memPermanent, comm->nRanks);
+    { // initialize collNetUserToDenseRank[rank]
+      uint64_t nonHeadMask = (1ull<<comm->localRanks)-1;
+      comm->collNetUserToDenseRank[rank] = -1;
+      for (int h=0; h < nHeadsUnique; h++) {
+        nonHeadMask ^= 1ull<<comm->rankToLocalRank[headsUnique[h]];
+        if (headsUnique[h] == rank) { comm->collNetUserToDenseRank[rank] = h; break; }
+      }
+      if (comm->collNetUserToDenseRank[rank] == -1) {
+        comm->collNetUserToDenseRank[rank] = __builtin_popcountll(nonHeadMask & ((1ull<<comm->localRank)-1));
+      }
+      comm->collNetUserToDenseRank[rank] += comm->node*comm->localRanks;
+    }
+    NCCLCHECK(bootstrapAllGather(comm->bootstrap, comm->collNetUserToDenseRank, sizeof(int)));
+    for (int r=0; r < comm->nRanks; r++) {
+      comm->collNetDenseToUserRank[comm->collNetUserToDenseRank[r]] = r;
+    }
+
    for (int c = 0; c < comm->collNetChannels; c++) {
      struct ncclChannel* channel = comm->channels + c;
      NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, false), ret, fail);
@@ -1000,6 +1051,9 @@ fail:
  goto exit;
 }

+// MNNVL: Flag to indicate whether to enable Multi-Node NVLink
+NCCL_PARAM(MNNVL, "MNNVL", -2);
+
 static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent = NULL) {
  // We use 2 AllGathers
  // 1. { peerInfo, comm, compCap}
@@ -1007,6 +1061,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
  ncclResult_t ret = ncclSuccess;
  int rank = comm->rank;
  int nranks = comm->nRanks;
+  int nNodes = 1;
  cpu_set_t affinitySave;
  struct ncclTopoGraph ringGraph;
  struct ncclTopoGraph treeGraph;
@@ -1054,6 +1109,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
  NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail);

  for (int i = 0; i < nranks; i++) {
+    if (comm->peerInfo[i].hostHash != comm->peerInfo[rank].hostHash) nNodes++;
    if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
      WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId);
      ret = ncclInvalidUsage;
@@ -1063,6 +1119,56 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p

  // AllGather1 - end

+#if CUDART_VERSION >= 11030
+
+#include <cuda.h>
+#include "cudawrap.h"
+
+  // MNNVL support
+  if (nNodes > 1) {
+    int cliqueSize = 0;
+    comm->MNNVL = 0;
+    // Determine the size of the MNNVL domain/clique
+    for (int i = 0; i < nranks; i++) {
+      nvmlGpuFabricInfoV_t *fabricInfo1 = &comm->peerInfo[rank].fabricInfo;
+      nvmlGpuFabricInfoV_t *fabricInfo2 = &comm->peerInfo[i].fabricInfo;
+      // Check that the Fabric state is fully initialized
+      if (fabricInfo2->state != NVML_GPU_FABRIC_STATE_COMPLETED) continue;
+      // Check that the cluster UUID and cliqueId match in each rank
+      // A zero UUID means we don't have MNNVL fabric info - disable MNNVL
+      if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) continue;
+      if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) &&
+          (fabricInfo1->cliqueId == fabricInfo2->cliqueId)) {
+        cliqueSize++;
+      }
+    }
+    // Determine whether this is a MNNVL system
+    comm->MNNVL = ncclParamMNNVL() < 0 ? cliqueSize == comm->nRanks : ncclParamMNNVL();
+    // MNNVL requires cuMem to be enabled
+    if (!ncclCuMemEnable()) comm->MNNVL = 0;
+    if (comm->MNNVL) {
+      // MNNVL also requires FABRIC handle support
+      int cudaDev;
+      int flag = 0;
+      CUdevice currentDev;
+      CUDACHECK(cudaGetDevice(&cudaDev));
+      CUCHECK(cuDeviceGet(&currentDev, cudaDev));
+      // Ignore error if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is not supported
+      (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));;
+      if (!flag)
+        comm->MNNVL = 0;
+      else
+        // Force the handle type to be FABRIC for MNNVL
+        ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_FABRIC;
+    }
+    if (ncclParamMNNVL() == 1 && !comm->MNNVL) {
+      WARN("MNNVL is not supported on this system");
+      ret = ncclSystemError;
+      goto fail;
+    }
+  }
+#endif
+
  do {
    // Compute intra-process ranks
    int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
@@ -1347,6 +1453,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
    goto fail;
  }

+  INFO(NCCL_INIT, "comm %p rank %d nRanks %d nNodes %d localRanks %d localRank %d MNNVL %d",
+       comm, rank, comm->nRanks, comm->nNodes, comm->localRanks, comm->localRank, comm->MNNVL);
+
  nChannelsOrig = comm->nChannels;
  NCCLCHECKGOTO(ncclCalloc(&allTopoRanks, comm->nRanks), ret, fail);
  int nc;
@@ -1439,7 +1548,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
  comm->topParentLocalRanks = topParentLocalRanks;

  // Launch proxy service thread, after this, the proxy calls can be used.
-  NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);
+  if (parent && parent->config.splitShare) {
+    comm->proxyState = parent->sharedRes->proxyState;
+    ncclAtomicRefCountIncrement(&parent->sharedRes->proxyState->refCount);
+  } else {
+    NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);
+  }

  // Connect with prev/next for each ring
  for (int c=0; c<comm->nChannels; c++) {
@@ -1476,8 +1590,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
  // Setup NVLS
  NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
  // And NVLS trees if needed
-  if (comm->nvlsSupport && comm->localRanks > 1) {
-    for (int c=0; c<comm->nvlsChannels; c++) {
+  if (comm->nvlsSupport && comm->nNodes > 1) {
+    for (int c=0; c<comm->nChannels; c++) {
      struct ncclChannel* channel = comm->channels+c;
      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 1, &channel->nvls.treeUp, 0), ret, fail);
      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->nvls.treeUp, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 0), ret, fail);
@@ -1496,7 +1610,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
  // Compute time models for algorithm and protocol combinations
  NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail);

-  INFO(NCCL_INIT, "%d coll channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
+  INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->collNetChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);

  do { // Setup p2p structures in comm->tasks
    struct ncclTasks* tasks = &comm->tasks;
@@ -2281,7 +2395,7 @@ static ncclResult_t commReclaim(ncclComm_t comm) {

  NCCLCHECKGOTO(ncclCommGetAsyncError(comm, &state), ret, fail);
  TRACE(NCCL_INIT, "commReclaim: reclaim comm %p rank %d state %d", comm, comm->rank, state);
-  if (state == ncclSuccess && *comm->abortFlag == 0 && comm->finalizeCalled == false) {
+  if (state == ncclSuccess && __atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED) == 0 && comm->finalizeCalled == false) {
    /* user does not call ncclCommFinalize and this is a normal comm destroy. ncclCommDestroy
     * should be nonblocking until last call of ncclCommDestroy. */
    NCCLCHECKGOTO(commFinalize(comm, false), ret, fail);
@@ -2406,9 +2520,9 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
  // Ask anything that might still be running on the device to quit
  childAbortFlag = __atomic_load_n(&comm->childAbortFlag, __ATOMIC_ACQUIRE);
  if (childAbortFlag != NULL) {
-    *childAbortFlag = 1;
+    __atomic_store_n(childAbortFlag, 1, __ATOMIC_RELAXED);
  }
-  *comm->abortFlag = 1;
+  __atomic_store_n(comm->abortFlag, 1, __ATOMIC_RELAXED);
  /* init thread must be joined before we destroy the comm,
   * and we should ignore the init error here. */
  ncclCommEnsureReady(comm);
@@ -2556,98 +2670,6 @@ ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
  return ncclSuccess;
 }

-NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1);
-
-NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle);
-ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) {
-  NVTX3_FUNC_RANGE_IN(nccl_domain);
-  ncclResult_t ret = ncclSuccess;
-
-#if CUDART_VERSION >= 12010
-  size_t granularity;
-  if (ncclParamLocalRegister()) {
-    if (comm == NCCL_COMM_NULL || buff == NULL || handle == NULL || size == 0) {
-      WARN("Invalid arguments comm %p, buff %p, size %ld, handle %p", comm, buff, size, handle);
-      ret = ncclInvalidArgument;
-    } else if (comm->nvlsSupport) {
-      CUmulticastObjectProp prop = comm->nvlsResources->properties;
-
-      prop.size = size;
-      CUCHECK(cuMulticastGetGranularity(&granularity, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
-
-      if ((uintptr_t)buff % comm->nvlsResources->ucGran == 0 && size % granularity == 0) {
-        /* we can direct register what user provide */
-        struct ncclRegRequest* req;
-        NCCLCHECK(ncclCalloc(&req, 1));
-        req->buff = (uintptr_t)buff;
-        req->size = size;
-        ncclIntruQueueEnqueue(&comm->regRequestQueue, req);
-        *handle = (void*)req;
-      } else {
-        void* base;
-        size_t baseSize;
-        /* Since we don't provide actually allocated buffer size for users by ncclMemAlloc,
-         * therefore, we need to get the full range of the buffer by cuMemGetAddressRange to
-         * register buffers. */
-        CUCHECK(cuMemGetAddressRange((CUdeviceptr*)&base, &baseSize, (CUdeviceptr)buff));
-        if ((uintptr_t)base % comm->nvlsResources->ucGran == 0 && baseSize % granularity == 0) {
-          struct ncclRegRequest* req;
-          NCCLCHECK(ncclCalloc(&req, 1));
-          req->buff = (uintptr_t)base;
-          req->size = baseSize;
-          ncclIntruQueueEnqueue(&comm->regRequestQueue, req);
-          *handle = (void*)req;
-        } else {
-          WARN("register fails, buffer %p (aligned %s, granularity %ld) and size %ld (aligned %s, granularity %ld) for registration", buff, (uintptr_t)buff % comm->nvlsResources->ucGran == 0 ? "TRUE" : "FALSE", comm->nvlsResources->ucGran, size, size % granularity == 0 ? "TRUE" : "FALSE", granularity);
-          ret = ncclInvalidArgument;
-        }
-      }
-    }
-  }
-#endif
-
-  return ret;
-}
-
-NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle);
-ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) {
-  ncclResult_t ret = ncclSuccess;
-
-#if CUDART_VERSION >= 12010
-  struct ncclRegRequest* dreq = (struct ncclRegRequest*)handle;
-  if (ncclParamLocalRegister()) {
-    if (comm == NCCL_COMM_NULL || handle == NULL) {
-      WARN("Invalid arguments comm %p, handle %p", comm, handle);
-      ret = ncclInvalidArgument;
-    } else {
-      struct ncclRegRecord* rec;
-
-      /* first release register record */
-      rec = ncclIntruQueueHead(&comm->regRecordQueue);
-
-      while (rec) {
-        if (rec->buff == dreq->buff && rec->size == dreq->size) {
-          NCCLCHECK(ncclNvlsDeregBuffer(&rec->mcHandle, rec->regAddr, rec->dev, rec->regSize));
-          ncclIntruQueueDelete(&comm->regRecordQueue, rec);
-          free(rec->addrs);
-          free(rec);
-          break;
-        }
-        rec = rec->next;
-      }
-
-      /* then free register request */
-      if (ncclIntruQueueDelete(&comm->regRequestQueue, dreq) == false) {
-        WARN("Invalid handle %p", handle);
-        ret = ncclInvalidArgument;
-      }
-    }
-  }
-#endif
-
-  return ret;
-}
-
 NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size);
 ncclResult_t  ncclMemAlloc(void **ptr, size_t size) {
  NVTX3_FUNC_RANGE_IN(nccl_domain);
@@ -2759,4 +2781,4 @@ exit:
  return ret;
 fail:
  goto exit;
-}
+}
@@ -7,7 +7,7 @@
 #include "argcheck.h"
 #include "comm.h"

-static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
+ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
  cudaPointerAttributes attr;
  cudaError_t err = cudaPointerGetAttributes(&attr, pointer);
  if (err != cudaSuccess || attr.devicePointer == NULL) {
@@ -14,6 +14,9 @@
 // This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage
 NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", -2);

+// Handle type used for cuMemCreate()
+CUmemAllocationHandleType ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
+
 static int ncclCuMemSupported = 0;

 // Determine whether CUMEM & VMM RDMA is supported on this platform
@@ -132,7 +132,7 @@ ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen,
      WARN("UDS: Receiving data over socket failed : %d", errno);
      return ncclSystemError;
    }
-    if (handle->abortFlag && *handle->abortFlag) return ncclInternalError;
+    if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError;
  }

  if (recvFd != NULL) {
@@ -221,7 +221,7 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen,
      WARN("UDS: Sending data over socket %s failed : %s (%d)", temp, strerror(errno), errno);
      return ncclSystemError;
    }
-    if (handle->abortFlag && *handle->abortFlag) return ncclInternalError;
+    if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError;
  }

  return ncclSuccess;
@@ -39,6 +39,8 @@ namespace {
  NCCL_NVML_FN(nvmlDeviceGetCudaComputeCapability, nvmlReturn_t, (nvmlDevice_t device, int* major, int* minor))
  NCCL_NVML_FN(nvmlDeviceGetP2PStatus, nvmlReturn_t, (nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus))
  NCCL_NVML_FN(nvmlDeviceGetFieldValues, nvmlReturn_t, (nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values))
+  // MNNVL support
+  NCCL_NVML_FN(nvmlDeviceGetGpuFabricInfoV, nvmlReturn_t, (nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo))

  std::mutex lock; // NVML has had some thread safety bugs
  bool initialized = false;
@@ -82,7 +84,9 @@ ncclResult_t ncclNvmlEnsureInitialized() {
      {(void**)&pfn_nvmlDeviceGetNvLinkCapability, "nvmlDeviceGetNvLinkCapability"},
      {(void**)&pfn_nvmlDeviceGetCudaComputeCapability, "nvmlDeviceGetCudaComputeCapability"},
      {(void**)&pfn_nvmlDeviceGetP2PStatus, "nvmlDeviceGetP2PStatus"},
-      {(void**)&pfn_nvmlDeviceGetFieldValues, "nvmlDeviceGetFieldValues"}
+      {(void**)&pfn_nvmlDeviceGetFieldValues, "nvmlDeviceGetFieldValues"},
+      // MNNVL support
+      {(void**)&pfn_nvmlDeviceGetGpuFabricInfoV, "nvmlDeviceGetGpuFabricInfoV"},
    };
    for(Symbol sym: symbols) {
      *sym.ppfn = dlsym(libhandle, sym.name);
@@ -269,3 +273,12 @@ ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount,
  NVMLTRY(nvmlDeviceGetFieldValues, device, valuesCount, values);
  return ncclSuccess;
 }
+
+// MNNVL support: wrapper that fills *gpuFabricInfo for `device` via nvmlDeviceGetGpuFabricInfoV and returns ncclSuccess when no check fails.
+ncclResult_t ncclNvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo) {
+  NCCLCHECK(ncclNvmlEnsureInitialized()); // the NVML symbols are resolved by ncclNvmlEnsureInitialized()
+  std::lock_guard<std::mutex> locked(lock); // NVML calls are serialized through the file-level `lock`
+  gpuFabricInfo->version = nvmlGpuFabricInfo_v2; // caller-side version tag is written before the NVML call; gpuFabricInfo must be non-NULL
+  NVMLTRY(nvmlDeviceGetGpuFabricInfoV, device, gpuFabricInfo); // NOTE(review): presumably returns an error code on NVML failure - confirm against the NVMLTRY macro
+  return ncclSuccess;
+}
@@ -169,7 +169,7 @@ ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff
  int curRound = shmem->round;
  size_t mycnt;

-  if (comm == NULL || shmem == NULL || sendbuff == NULL || recvbuff == NULL) {
+  if (comm == NULL || shmem == NULL || sendbuff == NULL || recvbuff == NULL || shmem->maxTypeSize < typeSize) {
    ret = ncclInvalidArgument;
    goto exit;
  }
@@ -184,7 +184,7 @@ ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff
    uint64_t t0 = clockNano();
    while(__atomic_load_n(shmem->cnt[curRound], __ATOMIC_ACQUIRE) != comm->localRanks + 1) {
      if (clockNano() - t0 >= 5 * 1000) sched_yield();
-      if (*comm->abortFlag == 1) {
+      if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED) == 1) {
        ret = ncclInternalError;
        goto exit;
      }
@@ -36,7 +36,7 @@ static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr
      }
    }
    (*offset) += bytes;
-    if (sock->abortFlag && *sock->abortFlag != 0) {
+    if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_RELAXED)) {
      INFO(NCCL_NET, "socketProgressOpt: abort called");
      return ncclInternalError;
    }
@@ -531,6 +531,8 @@ static ncclResult_t socketPollConnect(struct ncclSocket* sock) {
    sock->state = ncclSocketStateConnecting;
  } else if (ret != EINPROGRESS) {
    sock->state = ncclSocketStateError;
+    char line[SOCKET_NAME_MAXLEN+1];
+    WARN("socketPollConnect: Connect to %s returned %d(%s) errno %d(%s)", ncclSocketToString(&sock->addr, line), ret, strerror(ret), errno, strerror(errno));
    return ncclSystemError;
  }
  return ncclSuccess;
@@ -618,12 +620,12 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock) {
  do {
    NCCLCHECK(socketProgressState(sock));
  } while (sock->asyncFlag == 0 &&
-      (sock->abortFlag == NULL || *sock->abortFlag == 0) &&
+      (sock->abortFlag == NULL || __atomic_load_n(sock->abortFlag, __ATOMIC_RELAXED) == 0) &&
      (sock->state == ncclSocketStateConnecting ||
       sock->state == ncclSocketStateConnectPolling ||
       sock->state == ncclSocketStateConnected));

-  if (sock->abortFlag && *sock->abortFlag != 0) return ncclInternalError;
+  if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError;

  switch (sock->state) {
    case ncclSocketStateConnecting:
@@ -665,11 +667,11 @@ ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listen
  do {
    NCCLCHECKGOTO(socketProgressState(sock), ret, exit);
  } while (sock->asyncFlag == 0 &&
-      (sock->abortFlag == NULL || *sock->abortFlag == 0) &&
+      (sock->abortFlag == NULL || __atomic_load_n(sock->abortFlag, __ATOMIC_RELAXED) == 0) &&
      (sock->state == ncclSocketStateAccepting ||
       sock->state == ncclSocketStateAccepted));

-  if (sock->abortFlag && *sock->abortFlag != 0) return ncclInternalError;
+  if (sock->abortFlag && __atomic_load_n(sock->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError;

  switch (sock->state) {
    case ncclSocketStateAccepting:
@@ -30,25 +30,25 @@ ncclResult_t ncclLoadTunerPlugin(ncclTuner_t** tuner) {
    if (name) {
      INFO(NCCL_TUNING, "NCCL_TUNER_PLUGIN set to %s", name);
      tunerPluginLib = dlopen(name, RTLD_LAZY | RTLD_LOCAL);
-    }
-    if (tunerPluginLib == nullptr) {
-      // dlopen does not guarantee to set errno, but dlerror only gives us a
-      // string, so checking errno doesn't hurt to try to provide a better
-      // error message
-      if (errno == ENOENT) {
-        INFO(NCCL_TUNING, "Tuner: no plugin found '%s', using default tuner instead.", name);
+      if (tunerPluginLib == nullptr) {
+        // dlopen does not guarantee to set errno, but dlerror only gives us a
+        // string, so checking errno doesn't hurt to try to provide a better
+        // error message
+        if (errno == ENOENT) {
+          INFO(NCCL_TUNING, "Tuner: no plugin found '%s', using default tuner instead.", name);
+        } else {
+          INFO(NCCL_TUNING, "Tuner: plugin load '%s' returned error (%d : %s), using default tuner instead.", name, errno, dlerror());
+        }
      } else {
-        INFO(NCCL_TUNING, "Tuner: plugin load '%s' returned error (%d : %s), using default tuner instead.", name, errno, dlerror());
-      }
-    } else {
-      tunerSymbol = (ncclTuner_t*)dlsym(tunerPluginLib, NCCL_TUNER_PLUGIN_SYMBOL);
-      if (tunerSymbol == nullptr) {
-        INFO(NCCL_TUNING, "Tuner: failed to find " NCCL_TUNER_PLUGIN_SYMBOL " in plugin (%s), using default tuner instead.", name);
-        dlclose(tunerPluginLib);
-        tunerPluginLib = nullptr;
-      } else {
-        INFO(NCCL_TUNING, "Opened tuner: '%s'", tunerSymbol->name);
-        tunerPluginRefCount = 0;
+        tunerSymbol = (ncclTuner_t*)dlsym(tunerPluginLib, NCCL_TUNER_PLUGIN_SYMBOL);
+        if (tunerSymbol == nullptr) {
+          INFO(NCCL_TUNING, "Tuner: failed to find " NCCL_TUNER_PLUGIN_SYMBOL " in plugin (%s), using default tuner instead.", name);
+          dlclose(tunerPluginLib);
+          tunerPluginLib = nullptr;
+        } else {
+          INFO(NCCL_TUNING, "Opened tuner: '%s'", tunerSymbol->name);
+          tunerPluginRefCount = 0;
+        }
      }
    }
  }
@@ -291,3 +291,79 @@ void ncclMemoryStackDestruct(struct ncclMemoryStack* me) {
    h = h1;
  }
 }
+
+// Returns the enumerator spelling of a reduction operator as a constant
+// string. Only the five operators listed below are recognized; any other
+// value of ncclRedOp_t yields "Unknown".
+// The returned pointer refers to a string literal; callers must not free it.
+const char* ncclOpToString(ncclRedOp_t op) {
+  const char* label = "Unknown";
+  switch (op) {
+    case ncclSum:  label = "ncclSum";  break;
+    case ncclProd: label = "ncclProd"; break;
+    case ncclMax:  label = "ncclMax";  break;
+    case ncclMin:  label = "ncclMin";  break;
+    case ncclAvg:  label = "ncclAvg";  break;
+    default:       break;
+  }
+  return label;
+}
+
+// Returns the enumerator spelling of an NCCL data type as a constant string.
+// Values without a case below yield "Unknown". Alias enumerators (ncclChar,
+// ncclInt, ncclHalf, ncclFloat, ncclDouble) are noted as trailing comments;
+// the sized name is the one returned.
+// ncclUint8 is listed explicitly so that unsigned 8-bit data is not reported
+// as "Unknown".
+// ncclBfloat16 is only recognized when __CUDA_BF16_TYPES_EXIST__ is defined;
+// in other builds it falls through to the default case.
+// The returned pointer refers to a string literal; callers must not free it.
+const char* ncclDatatypeToString(ncclDataType_t type) {
+  switch (type) {
+    case ncclInt8:     return "ncclInt8";     // ncclChar
+    case ncclUint8:    return "ncclUint8";
+    case ncclInt32:    return "ncclInt32";    // ncclInt
+    case ncclUint32:   return "ncclUint32";
+    case ncclInt64:    return "ncclInt64";
+    case ncclUint64:   return "ncclUint64";
+    case ncclFloat16:  return "ncclFloat16";  // ncclHalf
+    case ncclFloat32:  return "ncclFloat32";  // ncclFloat
+    case ncclFloat64:  return "ncclFloat64";  // ncclDouble
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+    case ncclBfloat16: return "ncclBfloat16";
+#endif
+    default:           return "Unknown";
+  }
+}
+
+// Returns a short upper-case name for an NCCL_ALGO_* identifier.
+// The six identifiers listed in the switch are recognized; every other
+// integer value yields "Unknown".
+// The name is the identifier with its NCCL_ALGO_ prefix removed.
+// The returned pointer refers to a string literal; callers must not free it.
+const char* ncclAlgoToString(int algo) {
+  const char* label = "Unknown";
+  switch (algo) {
+    case NCCL_ALGO_TREE:           label = "TREE";           break;
+    case NCCL_ALGO_RING:           label = "RING";           break;
+    case NCCL_ALGO_COLLNET_DIRECT: label = "COLLNET_DIRECT"; break;
+    case NCCL_ALGO_COLLNET_CHAIN:  label = "COLLNET_CHAIN";  break;
+    case NCCL_ALGO_NVLS:           label = "NVLS";           break;
+    case NCCL_ALGO_NVLS_TREE:      label = "NVLS_TREE";      break;
+    default:                       break;
+  }
+  return label;
+}
+
+// Returns the short name of an NCCL_PROTO_* identifier ("LL", "LL128" or
+// "SIMPLE"); any other value yields "Unknown". The result is a string literal.
+const char* ncclProtoToString(int proto) {
+  const char* label = "Unknown";
+  switch (proto) {
+    case NCCL_PROTO_LL:     label = "LL";     break;
+    case NCCL_PROTO_LL128:  label = "LL128";  break;
+    case NCCL_PROTO_SIMPLE: label = "SIMPLE"; break;
+    default:                break;
+  }
+  return label;
+}
@@ -264,11 +264,7 @@ const char*  ncclGetErrorString(ncclResult_t result);
 const char* pncclGetErrorString(ncclResult_t result);
 /*! @endcond */

-/*! @brief      Returns mesage on last result that occured.
-    @details    Returns a human-readable message of the last error that occurred.
-    @return     String containing the last result
-
-    @param[in]  comm is currently unused and can be set to NULL */
+/* Returns a human-readable message of the last error that occurred. */
 const char*  ncclGetLastError(ncclComm_t comm);
 /*! @cond       include_hidden */
 const char* pncclGetLastError(ncclComm_t comm);
@@ -324,6 +320,18 @@ ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
 /*! @endcond */
 /*! @} */

+/* Register CUDA buffer for zero-copy operation */
+ncclResult_t  ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
+/*! @cond       include_hidden */
+ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
+/*! @endcond */
+
+/* Deregister CUDA buffer */
+ncclResult_t  ncclCommDeregister(const ncclComm_t comm, void* handle);
+/*! @cond       include_hidden */
+ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
+/*! @endcond */
+
 /*! @defgroup   rccl_api_enumerations API Enumerations
    @details    Enumerations used by collective communication calls
    @{ */
@@ -812,16 +820,8 @@ ncclResult_t pncclGroupEnd();
 /*! @endcond */
 /*! @} */

-/* Register CUDA buffer for zero-copy operation */
-ncclResult_t  ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
-ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
-
-/* Deregister CUDA buffer */
-ncclResult_t  ncclCommDeregister(const ncclComm_t comm, void* handle);
-ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
-
 #ifdef __cplusplus
 } // end extern "C"
 #endif

-#endif // end include guard
+#endif // end include guard
@@ -16,16 +16,67 @@
 //#include <sys/stat.h>
 //#include <unistd.h>

-static ncclNet_v7_t ncclNet_v5_as_v7;
-static ncclNet_v7_t ncclNet_v6_as_v7;
+static ncclNet_v8_t ncclNet_v5_as_v8;
+static ncclNet_v8_t ncclNet_v6_as_v8;
+static ncclNet_v8_t ncclNet_v7_as_v8;
 static ncclNet_v5_t *ncclNet_v5;
 static ncclNet_v6_t *ncclNet_v6;
-static ncclCollNet_v7_t ncclCollNet_v5_as_v7;
-static ncclCollNet_v7_t ncclCollNet_v6_as_v7;
+static ncclNet_v7_t *ncclNet_v7;
+static ncclCollNet_v8_t ncclCollNet_v5_as_v8;
+static ncclCollNet_v8_t ncclCollNet_v6_as_v8;
+static ncclCollNet_v8_t ncclCollNet_v7_as_v8;
 static ncclCollNet_v5_t *ncclCollNet_v5;
 static ncclCollNet_v6_t *ncclCollNet_v6;
+static ncclCollNet_v7_t *ncclCollNet_v7;

-static ncclResult_t ncclNet_v6_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) {
+static ncclResult_t ncclNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) { // presents a v7 plugin's device properties in the v8 struct
+  ncclNetProperties_v7_t legacy;
+  const ncclResult_t rc = ncclNet_v7->getProperties(dev, &legacy);
+  if (rc != ncclSuccess) return rc; // *props is left untouched when the v7 query fails
+  props->regIsGlobal = 0; // the only field not copied from the v7 struct; fixed to 0 here
+  props->name = legacy.name;
+  props->pciPath = legacy.pciPath;
+  props->guid = legacy.guid;
+  props->port = legacy.port;
+  props->ptrSupport = legacy.ptrSupport;
+  props->speed = legacy.speed;
+  props->latency = legacy.latency;
+  props->maxComms = legacy.maxComms;
+  props->maxRecvs = legacy.maxRecvs;
+  props->netDeviceType = legacy.netDeviceType;
+  props->netDeviceVersion = legacy.netDeviceVersion;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclNet_v7_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { // v8 (size_t size) -> v7 (int size) registration shim
+  if (size >= ((size_t)1 << 31)) return ncclInternalError; // shift in size_t: a plain 1<<31 overflows int and, once converted to size_t, never rejects sizes >= 2 GiB
+  return ncclNet_v7->regMr(comm, data, (int) size, type, mhandle); // the cast cannot truncate after the range check above
+}
+
+static ncclResult_t ncclNet_v7_as_v8_init(ncclDebugLogger_t logfn) { // runs the v7 plugin's init, then populates the v8-shaped table from it
+  NCCLCHECK(ncclNet_v7->init(logfn)); // the table is filled only after this check
+  ncclNet_v7_as_v8.getProperties = ncclNet_v7_as_v8_getProperties; // adapter: v7 properties -> v8 struct
+  ncclNet_v7_as_v8.regMr = ncclNet_v7_as_v8_regMr; // adapter: size_t size -> int size
+  ncclNet_v7_as_v8.name = ncclNet_v7->name; // every entry below is taken unchanged from the v7 table
+  ncclNet_v7_as_v8.devices = ncclNet_v7->devices;
+  ncclNet_v7_as_v8.listen = ncclNet_v7->listen;
+  ncclNet_v7_as_v8.connect = ncclNet_v7->connect;
+  ncclNet_v7_as_v8.accept = ncclNet_v7->accept;
+  ncclNet_v7_as_v8.regMrDmaBuf = ncclNet_v7->regMrDmaBuf;
+  ncclNet_v7_as_v8.deregMr = ncclNet_v7->deregMr;
+  ncclNet_v7_as_v8.getDeviceMr = ncclNet_v7->getDeviceMr;
+  ncclNet_v7_as_v8.isend = ncclNet_v7->isend;
+  ncclNet_v7_as_v8.irecv = ncclNet_v7->irecv;
+  ncclNet_v7_as_v8.irecvConsumed = ncclNet_v7->irecvConsumed;
+  ncclNet_v7_as_v8.iflush = ncclNet_v7->iflush;
+  ncclNet_v7_as_v8.test = ncclNet_v7->test;
+  ncclNet_v7_as_v8.closeSend = ncclNet_v7->closeSend;
+  ncclNet_v7_as_v8.closeRecv = ncclNet_v7->closeRecv;
+  ncclNet_v7_as_v8.closeListen = ncclNet_v7->closeListen;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
  ncclNetProperties_v6_t p6;
  ncclResult_t ans = ncclNet_v6->getProperties(dev, &p6);
  if (ans != ncclSuccess) return ans;
@@ -33,6 +84,7 @@ static ncclResult_t ncclNet_v6_as_v7_getProperties(int dev, ncclNetProperties_v7
  props->pciPath = p6.pciPath;
  props->guid = p6.guid;
  props->ptrSupport = p6.ptrSupport;
+  props->regIsGlobal = 0;
  props->speed = p6.speed;
  props->port = p6.port;
  props->maxComms = p6.maxComms;
@@ -43,38 +95,43 @@ static ncclResult_t ncclNet_v6_as_v7_getProperties(int dev, ncclNetProperties_v7
  return ncclSuccess;
 }

-static ncclResult_t ncclNet_v6_as_v7_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
+static ncclResult_t ncclNet_v6_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { // v8 (size_t size) -> v6 (int size) registration shim
+  if (size >= ((size_t)1 << 31)) return ncclInternalError; // shift in size_t: a plain 1<<31 overflows int and, once converted to size_t, never rejects sizes >= 2 GiB
+  return ncclNet_v6->regMr(comm, data, (int) size, type, mhandle); // the cast cannot truncate after the range check above
+}
+
+static ncclResult_t ncclNet_v6_as_v8_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
  return ncclNet_v6->connect(dev, handle, sendComm);
 }

-static ncclResult_t ncclNet_v6_as_v7_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
+static ncclResult_t ncclNet_v6_as_v8_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
  return ncclNet_v6->accept(listenComm, recvComm);
 }

-static ncclResult_t ncclNet_v6_as_v7_init(ncclDebugLogger_t logfn) {
+static ncclResult_t ncclNet_v6_as_v8_init(ncclDebugLogger_t logfn) {
  NCCLCHECK(ncclNet_v6->init(logfn));
-  ncclNet_v6_as_v7.name = ncclNet_v6->name;
-  ncclNet_v6_as_v7.devices = ncclNet_v6->devices;
-  ncclNet_v6_as_v7.getProperties = ncclNet_v6_as_v7_getProperties; // ncclNet_v5->getProperties;
-  ncclNet_v6_as_v7.listen = ncclNet_v6->listen;
-  ncclNet_v6_as_v7.connect = ncclNet_v6_as_v7_connect;
-  ncclNet_v6_as_v7.accept =  ncclNet_v6_as_v7_accept;
-  ncclNet_v6_as_v7.regMr = ncclNet_v6->regMr;
-  ncclNet_v6_as_v7.regMrDmaBuf = ncclNet_v6->regMrDmaBuf;
-  ncclNet_v6_as_v7.deregMr = ncclNet_v6->deregMr;
-  ncclNet_v6_as_v7.isend = ncclNet_v6->isend;
-  ncclNet_v6_as_v7.irecv = ncclNet_v6->irecv;
-  ncclNet_v6_as_v7.iflush = ncclNet_v6->iflush;
-  ncclNet_v6_as_v7.test = ncclNet_v6->test;
-  ncclNet_v6_as_v7.closeSend = ncclNet_v6->closeSend;
-  ncclNet_v6_as_v7.closeRecv = ncclNet_v6->closeRecv;
-  ncclNet_v6_as_v7.closeListen = ncclNet_v6->closeListen;
-  ncclNet_v6_as_v7.getDeviceMr = NULL;
-  ncclNet_v6_as_v7.irecvConsumed = NULL;
+  ncclNet_v6_as_v8.name = ncclNet_v6->name;
+  ncclNet_v6_as_v8.devices = ncclNet_v6->devices;
+  ncclNet_v6_as_v8.getProperties = ncclNet_v6_as_v8_getProperties; // ncclNet_v5->getProperties;
+  ncclNet_v6_as_v8.listen = ncclNet_v6->listen;
+  ncclNet_v6_as_v8.connect = ncclNet_v6_as_v8_connect;
+  ncclNet_v6_as_v8.accept =  ncclNet_v6_as_v8_accept;
+  ncclNet_v6_as_v8.regMr = ncclNet_v6_as_v8_regMr;
+  ncclNet_v6_as_v8.regMrDmaBuf = ncclNet_v6->regMrDmaBuf;
+  ncclNet_v6_as_v8.deregMr = ncclNet_v6->deregMr;
+  ncclNet_v6_as_v8.isend = ncclNet_v6->isend;
+  ncclNet_v6_as_v8.irecv = ncclNet_v6->irecv;
+  ncclNet_v6_as_v8.iflush = ncclNet_v6->iflush;
+  ncclNet_v6_as_v8.test = ncclNet_v6->test;
+  ncclNet_v6_as_v8.closeSend = ncclNet_v6->closeSend;
+  ncclNet_v6_as_v8.closeRecv = ncclNet_v6->closeRecv;
+  ncclNet_v6_as_v8.closeListen = ncclNet_v6->closeListen;
+  ncclNet_v6_as_v8.getDeviceMr = NULL;
+  ncclNet_v6_as_v8.irecvConsumed = NULL;
  return ncclSuccess;
 }

-static ncclResult_t ncclNet_v5_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) {
+static ncclResult_t ncclNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
  ncclNetProperties_v6_t p6;
  ncclResult_t ans = ncclNet_v5->getProperties(dev, &p6);
  if (ans != ncclSuccess) return ans;
@@ -82,6 +139,7 @@ static ncclResult_t ncclNet_v5_as_v7_getProperties(int dev, ncclNetProperties_v7
  props->pciPath = p6.pciPath;
  props->guid = p6.guid;
  props->ptrSupport = p6.ptrSupport;
+  props->regIsGlobal = 0;
  props->speed = p6.speed;
  props->port = p6.port;
  props->maxComms = p6.maxComms;
@@ -92,40 +150,45 @@ static ncclResult_t ncclNet_v5_as_v7_getProperties(int dev, ncclNetProperties_v7
  return ncclSuccess;
 }

-static ncclResult_t ncclNet_v5_as_v7_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
+static ncclResult_t ncclNet_v5_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { // v8 (size_t size) -> v5 (int size) registration shim
+  if (size >= ((size_t)1 << 31)) return ncclInternalError; // shift in size_t: a plain 1<<31 overflows int and, once converted to size_t, never rejects sizes >= 2 GiB
+  return ncclNet_v5->regMr(comm, data, (int) size, type, mhandle); // the cast cannot truncate after the range check above
+}
+
+static ncclResult_t ncclNet_v5_as_v8_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
  return ncclNet_v5->connect(dev, handle, sendComm);
 }

-static ncclResult_t ncclNet_v5_as_v7_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
+static ncclResult_t ncclNet_v5_as_v8_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
  return ncclNet_v5->accept(listenComm, recvComm);
 }

 // We use a wrapper around the v5 init to copy over the struct contents
 // post-init since they may not be initialized before hand.
-static ncclResult_t ncclNet_v5_as_v7_init(ncclDebugLogger_t logfn) {
+static ncclResult_t ncclNet_v5_as_v8_init(ncclDebugLogger_t logfn) {
  NCCLCHECK(ncclNet_v5->init(logfn));
-  ncclNet_v5_as_v7.name = ncclNet_v5->name;
-  ncclNet_v5_as_v7.devices = ncclNet_v5->devices;
-  ncclNet_v5_as_v7.getProperties = ncclNet_v5_as_v7_getProperties;
-  ncclNet_v5_as_v7.listen = ncclNet_v5->listen;
-  ncclNet_v5_as_v7.connect = ncclNet_v5_as_v7_connect;
-  ncclNet_v5_as_v7.accept =  ncclNet_v5_as_v7_accept;
-  ncclNet_v5_as_v7.regMr = ncclNet_v5->regMr;
-  ncclNet_v5_as_v7.regMrDmaBuf = NULL;
-  ncclNet_v5_as_v7.deregMr = ncclNet_v5->deregMr;
-  ncclNet_v5_as_v7.isend = ncclNet_v5->isend;
-  ncclNet_v5_as_v7.irecv = ncclNet_v5->irecv;
-  ncclNet_v5_as_v7.iflush = ncclNet_v5->iflush;
-  ncclNet_v5_as_v7.test = ncclNet_v5->test;
-  ncclNet_v5_as_v7.closeSend = ncclNet_v5->closeSend;
-  ncclNet_v5_as_v7.closeRecv = ncclNet_v5->closeRecv;
-  ncclNet_v5_as_v7.closeListen = ncclNet_v5->closeListen;
-  ncclNet_v5_as_v7.getDeviceMr = NULL;
-  ncclNet_v5_as_v7.irecvConsumed = NULL;
+  ncclNet_v5_as_v8.name = ncclNet_v5->name;
+  ncclNet_v5_as_v8.devices = ncclNet_v5->devices;
+  ncclNet_v5_as_v8.getProperties = ncclNet_v5_as_v8_getProperties;
+  ncclNet_v5_as_v8.listen = ncclNet_v5->listen;
+  ncclNet_v5_as_v8.connect = ncclNet_v5_as_v8_connect;
+  ncclNet_v5_as_v8.accept =  ncclNet_v5_as_v8_accept;
+  ncclNet_v5_as_v8.regMr = ncclNet_v5_as_v8_regMr;
+  ncclNet_v5_as_v8.regMrDmaBuf = NULL;
+  ncclNet_v5_as_v8.deregMr = ncclNet_v5->deregMr;
+  ncclNet_v5_as_v8.isend = ncclNet_v5->isend;
+  ncclNet_v5_as_v8.irecv = ncclNet_v5->irecv;
+  ncclNet_v5_as_v8.iflush = ncclNet_v5->iflush;
+  ncclNet_v5_as_v8.test = ncclNet_v5->test;
+  ncclNet_v5_as_v8.closeSend = ncclNet_v5->closeSend;
+  ncclNet_v5_as_v8.closeRecv = ncclNet_v5->closeRecv;
+  ncclNet_v5_as_v8.closeListen = ncclNet_v5->closeListen;
+  ncclNet_v5_as_v8.getDeviceMr = NULL;
+  ncclNet_v5_as_v8.irecvConsumed = NULL;
  return ncclSuccess;
 }

-static ncclResult_t ncclCollNet_v5_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) {
+static ncclResult_t ncclCollNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
  ncclNetProperties_v6_t p6;
  ncclResult_t ans = ncclCollNet_v5->getProperties(dev, &p6);
  if (ans != ncclSuccess) return ans;
@@ -133,6 +196,7 @@ static ncclResult_t ncclCollNet_v5_as_v7_getProperties(int dev, ncclNetPropertie
  props->pciPath = p6.pciPath;
  props->guid = p6.guid;
  props->ptrSupport = p6.ptrSupport;
+  props->regIsGlobal = 0;
  props->speed = p6.speed;
  props->port = p6.port;
  props->maxComms = p6.maxComms;
@@ -143,28 +207,35 @@ static ncclResult_t ncclCollNet_v5_as_v7_getProperties(int dev, ncclNetPropertie
  return ncclSuccess;
 }

+static ncclResult_t ncclCollNet_v5_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { // v8 (size_t size) -> v5 collnet (int size) registration shim
+  if (size >= ((size_t)1 << 31)) return ncclInternalError; // shift in size_t: a plain 1<<31 overflows int and, once converted to size_t, never rejects sizes >= 2 GiB
+  return ncclCollNet_v5->regMr(comm, data, (int) size, type, mhandle); // the cast cannot truncate after the range check above
+}
+
 // We use a wrapper around the v5 init to copy over the struct contents
 // post-init since they may not be initialized before hand.
-static ncclResult_t ncclCollNet_v5_as_v7_init(ncclDebugLogger_t logfn) {
+static ncclResult_t ncclCollNet_v5_as_v8_init(ncclDebugLogger_t logfn) {
  NCCLCHECK(ncclCollNet_v5->init(logfn));
-  ncclCollNet_v5_as_v7.name = ncclCollNet_v5->name;
-  ncclCollNet_v5_as_v7.devices = ncclCollNet_v5->devices;
-  ncclCollNet_v5_as_v7.getProperties = ncclCollNet_v5_as_v7_getProperties;
-  ncclCollNet_v5_as_v7.listen = ncclCollNet_v5->listen;
-  ncclCollNet_v5_as_v7.connect = ncclCollNet_v5->connect;
-  ncclCollNet_v5_as_v7.reduceSupport = ncclCollNet_v5->reduceSupport;
-  ncclCollNet_v5_as_v7.regMr = ncclCollNet_v5->regMr;
-  ncclCollNet_v5_as_v7.regMrDmaBuf = NULL;
-  ncclCollNet_v5_as_v7.deregMr = ncclCollNet_v5->deregMr;
-  ncclCollNet_v5_as_v7.iallreduce = ncclCollNet_v5->iallreduce;
-  ncclCollNet_v5_as_v7.iflush = ncclCollNet_v5->iflush;
-  ncclCollNet_v5_as_v7.test = ncclCollNet_v5->test;
-  ncclCollNet_v5_as_v7.closeColl = ncclCollNet_v5->closeColl;
-  ncclCollNet_v5_as_v7.closeListen = ncclCollNet_v5->closeListen;
+  ncclCollNet_v5_as_v8.name = ncclCollNet_v5->name;
+  ncclCollNet_v5_as_v8.devices = ncclCollNet_v5->devices;
+  ncclCollNet_v5_as_v8.getProperties = ncclCollNet_v5_as_v8_getProperties;
+  ncclCollNet_v5_as_v8.listen = ncclCollNet_v5->listen;
+  ncclCollNet_v5_as_v8.connect = ncclCollNet_v5->connect;
+  ncclCollNet_v5_as_v8.reduceSupport = ncclCollNet_v5->reduceSupport;
+  ncclCollNet_v5_as_v8.regMr = ncclCollNet_v5_as_v8_regMr;
+  ncclCollNet_v5_as_v8.regMrDmaBuf = NULL;
+  ncclCollNet_v5_as_v8.deregMr = ncclCollNet_v5->deregMr;
+  ncclCollNet_v5_as_v8.iallreduce = ncclCollNet_v5->iallreduce;
+  ncclCollNet_v5_as_v8.iallgather = nullptr;
+  ncclCollNet_v5_as_v8.ireducescatter = nullptr;
+  ncclCollNet_v5_as_v8.iflush = ncclCollNet_v5->iflush;
+  ncclCollNet_v5_as_v8.test = ncclCollNet_v5->test;
+  ncclCollNet_v5_as_v8.closeColl = ncclCollNet_v5->closeColl;
+  ncclCollNet_v5_as_v8.closeListen = ncclCollNet_v5->closeListen;
  return ncclSuccess;
 }

-static ncclResult_t ncclCollNet_v6_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) {
+static ncclResult_t ncclCollNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
  ncclNetProperties_v6_t p6;
  ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6);
  if (ans != ncclSuccess) return ans;
@@ -172,6 +243,7 @@ static ncclResult_t ncclCollNet_v6_as_v7_getProperties(int dev, ncclNetPropertie
  props->pciPath = p6.pciPath;
  props->guid = p6.guid;
  props->ptrSupport = p6.ptrSupport;
+  props->regIsGlobal = 0;
  props->speed = p6.speed;
  props->port = p6.port;
  props->maxComms = p6.maxComms;
@@ -182,24 +254,78 @@ static ncclResult_t ncclCollNet_v6_as_v7_getProperties(int dev, ncclNetPropertie
  return ncclSuccess;
 }

-// We use a wrapper around the v5 init to copy over the struct contents
+// Adapter exposing the v6 CollNet regMr through the v8 signature.
+// v8 passes the buffer size as size_t; the value is narrowed to int for the v6
+// plugin call, so any size that does not fit in 31 bits is rejected with
+// ncclInternalError instead of being truncated.
+static ncclResult_t ncclCollNet_v6_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+  // The limit must be computed in an unsigned, wide type: `1<<31` overflows a
+  // 32-bit int, and the (typically negative) result converts to an enormous
+  // size_t that real sizes never reach, which disables the check.
+  if (size >= ((size_t)1 << 31)) return ncclInternalError;
+  return ncclCollNet_v6->regMr(comm, data, (int) size, type, mhandle);
+}
+
+// We use a wrapper around the v6 init to copy over the struct contents
 // post-init since they may not be initialized before hand.
-static ncclResult_t ncclCollNet_v6_as_v7_init(ncclDebugLogger_t logfn) {
+static ncclResult_t ncclCollNet_v6_as_v8_init(ncclDebugLogger_t logfn) {
  NCCLCHECK(ncclCollNet_v6->init(logfn));
-  ncclCollNet_v6_as_v7.name = ncclCollNet_v6->name;
-  ncclCollNet_v6_as_v7.devices = ncclCollNet_v6->devices;
-  ncclCollNet_v6_as_v7.getProperties = ncclCollNet_v6_as_v7_getProperties;
-  ncclCollNet_v6_as_v7.listen = ncclCollNet_v6->listen;
-  ncclCollNet_v6_as_v7.connect = ncclCollNet_v6->connect;
-  ncclCollNet_v6_as_v7.reduceSupport = ncclCollNet_v6->reduceSupport;
-  ncclCollNet_v6_as_v7.regMr = ncclCollNet_v6->regMr;
-  ncclCollNet_v6_as_v7.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf;
-  ncclCollNet_v6_as_v7.deregMr = ncclCollNet_v6->deregMr;
-  ncclCollNet_v6_as_v7.iallreduce = ncclCollNet_v6->iallreduce;
-  ncclCollNet_v6_as_v7.iflush = ncclCollNet_v6->iflush;
-  ncclCollNet_v6_as_v7.test = ncclCollNet_v6->test;
-  ncclCollNet_v6_as_v7.closeColl = ncclCollNet_v6->closeColl;
-  ncclCollNet_v6_as_v7.closeListen = ncclCollNet_v6->closeListen;
+  ncclCollNet_v6_as_v8.name = ncclCollNet_v6->name;
+  ncclCollNet_v6_as_v8.devices = ncclCollNet_v6->devices;
+  ncclCollNet_v6_as_v8.getProperties = ncclCollNet_v6_as_v8_getProperties;
+  ncclCollNet_v6_as_v8.listen = ncclCollNet_v6->listen;
+  ncclCollNet_v6_as_v8.connect = ncclCollNet_v6->connect;
+  ncclCollNet_v6_as_v8.reduceSupport = ncclCollNet_v6->reduceSupport;
+  ncclCollNet_v6_as_v8.regMr = ncclCollNet_v6_as_v8_regMr;
+  ncclCollNet_v6_as_v8.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf;
+  ncclCollNet_v6_as_v8.deregMr = ncclCollNet_v6->deregMr;
+  ncclCollNet_v6_as_v8.iallreduce = ncclCollNet_v6->iallreduce;
+  ncclCollNet_v6_as_v8.iallgather = nullptr;
+  ncclCollNet_v6_as_v8.ireducescatter = nullptr;
+  ncclCollNet_v6_as_v8.iflush = ncclCollNet_v6->iflush;
+  ncclCollNet_v6_as_v8.test = ncclCollNet_v6->test;
+  ncclCollNet_v6_as_v8.closeColl = ncclCollNet_v6->closeColl;
+  ncclCollNet_v6_as_v8.closeListen = ncclCollNet_v6->closeListen;
+  return ncclSuccess;
+}
+
+// Fills a v8 properties struct for device `dev` from the v7 CollNet plugin.
+// Copied from the v7 answer: name, pciPath, guid, ptrSupport, speed, port,
+// maxComms, maxRecvs and latency. regIsGlobal is set to 0, and the device
+// offload fields are set to NCCL_NET_DEVICE_HOST / NCCL_NET_DEVICE_INVALID_VERSION.
+// On plugin failure the plugin's result is returned and *props is not written.
+static ncclResult_t ncclCollNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) {
+  ncclNetProperties_v7_t legacy;
+  const ncclResult_t rc = ncclCollNet_v7->getProperties(dev, &legacy);
+  if (rc != ncclSuccess) {
+    return rc;
+  }
+
+  // Fields carried over unchanged from the v7 structure.
+  props->name       = legacy.name;
+  props->pciPath    = legacy.pciPath;
+  props->guid       = legacy.guid;
+  props->ptrSupport = legacy.ptrSupport;
+  props->speed      = legacy.speed;
+  props->port       = legacy.port;
+  props->maxComms   = legacy.maxComms;
+  props->maxRecvs   = legacy.maxRecvs;
+  props->latency    = legacy.latency;
+
+  // Fields that are given fixed values rather than read from the v7 answer.
+  props->regIsGlobal      = 0;
+  props->netDeviceType    = NCCL_NET_DEVICE_HOST;
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  return ncclSuccess;
+}
+
+// Adapter exposing the v7 CollNet regMr through the v8 signature.
+// v8 passes the buffer size as size_t; the value is narrowed to int for the v7
+// plugin call, so any size that does not fit in 31 bits is rejected with
+// ncclInternalError instead of being truncated.
+static ncclResult_t ncclCollNet_v7_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+  // The limit must be computed in an unsigned, wide type: `1<<31` overflows a
+  // 32-bit int, and the (typically negative) result converts to an enormous
+  // size_t that real sizes never reach, which disables the check.
+  if (size >= ((size_t)1 << 31)) return ncclInternalError;
+  return ncclCollNet_v7->regMr(comm, data, (int) size, type, mhandle);
+}
+
+// We use a wrapper around the v7 init to copy over the struct contents
+// post-init since they may not be initialized before hand.
+static ncclResult_t ncclCollNet_v7_as_v8_init(ncclDebugLogger_t logfn) {
+  NCCLCHECK(ncclCollNet_v7->init(logfn));
+  ncclCollNet_v7_as_v8.name = ncclCollNet_v7->name;
+  ncclCollNet_v7_as_v8.devices = ncclCollNet_v7->devices;
+  ncclCollNet_v7_as_v8.getProperties = ncclCollNet_v7_as_v8_getProperties;
+  ncclCollNet_v7_as_v8.listen = ncclCollNet_v7->listen;
+  ncclCollNet_v7_as_v8.connect = ncclCollNet_v7->connect;
+  ncclCollNet_v7_as_v8.reduceSupport = ncclCollNet_v7->reduceSupport;
+  ncclCollNet_v7_as_v8.regMr = ncclCollNet_v7_as_v8_regMr;
+  ncclCollNet_v7_as_v8.regMrDmaBuf = ncclCollNet_v7->regMrDmaBuf;
+  ncclCollNet_v7_as_v8.deregMr = ncclCollNet_v7->deregMr;
+  ncclCollNet_v7_as_v8.iallreduce = ncclCollNet_v7->iallreduce;
+  ncclCollNet_v7_as_v8.iallgather = nullptr;
+  ncclCollNet_v7_as_v8.ireducescatter = nullptr;
+  ncclCollNet_v7_as_v8.iflush = ncclCollNet_v7->iflush;
+  ncclCollNet_v7_as_v8.test = ncclCollNet_v7->test;
+  ncclCollNet_v7_as_v8.closeColl = ncclCollNet_v7->closeColl;
+  ncclCollNet_v7_as_v8.closeListen = ncclCollNet_v7->closeListen;
  return ncclSuccess;
 }

@@ -237,54 +363,72 @@ ncclResult_t ncclNetPluginInit() {
    return ncclSuccess;
  }

-  ncclNets[0] = (ncclNet_v7_t*)dlsym(netPluginLib, "ncclNetPlugin_v7");
+  ncclNets[0] = (ncclNet_v8_t*)dlsym(netPluginLib, "ncclNetPlugin_v8");
  if (ncclNets[0] == nullptr) {
-    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v7 symbol.");
-    // Try v6 plugin
-    ncclNet_v6 = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6");
-    if (ncclNet_v6 == nullptr) {
-      // Try v5 plugin
-      ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
-      if (ncclNet_v5 == nullptr) {
-        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported.");
-        if (netPluginLib != nullptr) dlclose(netPluginLib);
-        return ncclSuccess;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v8 symbol.");
+    // Try v7 plugin
+    ncclNet_v7 = (ncclNet_v7_t*)dlsym(netPluginLib, "ncclNetPlugin_v7");
+    if (ncclNet_v7 == nullptr) {
+      // Try v6 plugin
+      ncclNet_v6 = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6");
+      if (ncclNet_v6 == nullptr) {
+        // Try v5 plugin
+        ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
+        if (ncclNet_v5 == nullptr) {
+          INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported.");
+          if (netPluginLib != nullptr) dlclose(netPluginLib);
+          return ncclSuccess;
+        } else {
+          ncclNets[0] = &ncclNet_v5_as_v8;
+          ncclNet_v5_as_v8.init = ncclNet_v5_as_v8_init;
+          // Set the name right away to allow for NCCL_NET=... to work
+          ncclNet_v5_as_v8.name = ncclNet_v5->name;
+          INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name);
+        }
      } else {
-        ncclNets[0] = &ncclNet_v5_as_v7;
-        ncclNet_v5_as_v7.init = ncclNet_v5_as_v7_init;
+        ncclNets[0] = &ncclNet_v6_as_v8;
+        ncclNet_v6_as_v8.init = ncclNet_v6_as_v8_init;
        // Set the name right away to allow for NCCL_NET=... to work
-        ncclNet_v5_as_v7.name = ncclNet_v5->name;
-        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name);
+        ncclNet_v6_as_v8.name = ncclNet_v6->name;
+        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNets[0]->name);
      }
    } else {
-      ncclNets[0] = &ncclNet_v6_as_v7;
-      ncclNet_v6_as_v7.init = ncclNet_v6_as_v7_init;
+      ncclNets[0] = &ncclNet_v7_as_v8;
+      ncclNet_v7_as_v8.init = ncclNet_v7_as_v8_init;
      // Set the name right away to allow for NCCL_NET=... to work
-      ncclNet_v6_as_v7.name = ncclNet_v6->name;
-      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNets[0]->name);
+      ncclNet_v7_as_v8.name = ncclNet_v7->name;
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v7)", ncclNets[0]->name);
    }
  }

  // Check for CollNet
-  ncclCollNets[0] = (ncclCollNet_v7_t*) dlsym(netPluginLib, "ncclCollNetPlugin_v7");
+  ncclCollNets[0] = (ncclCollNet_v8_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v8");
  if (ncclCollNets[0] == nullptr) {
-    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v7 symbol.");
-    ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6");
-    if (ncclCollNet_v6 == nullptr) {
-      ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
-      if (ncclCollNet_v5 == nullptr) {
-        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). ncclCollNetPlugin symbols v4 and lower are not supported.");
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v8 symbol.");
+    ncclCollNet_v7 = (ncclCollNet_v7_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v7");
+    if (ncclCollNet_v7 == nullptr) {
+      ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6");
+      if (ncclCollNet_v6 == nullptr) {
+        ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
+        if (ncclCollNet_v5 == nullptr) {
+          INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). ncclCollNetPlugin symbols v4 and lower are not supported.");
+        } else {
+          ncclCollNets[0] = &ncclCollNet_v5_as_v8;
+          ncclCollNet_v5_as_v8.init = ncclCollNet_v5_as_v8_init;
+          ncclCollNet_v5_as_v8.name = ncclCollNet_v5->name;
+          INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v5)", ncclCollNets[0]->name);
+        }
      } else {
-        ncclCollNets[0] = &ncclCollNet_v5_as_v7;
-        ncclCollNet_v5_as_v7.init = ncclCollNet_v5_as_v7_init;
-        ncclCollNet_v5_as_v7.name = ncclCollNet_v5->name;
-        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v5)", ncclCollNets[0]->name);
+        ncclCollNets[0] = &ncclCollNet_v6_as_v8;
+        ncclCollNet_v6_as_v8.init = ncclCollNet_v6_as_v8_init;
+        ncclCollNet_v6_as_v8.name = ncclCollNet_v6->name;
+        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v6)", ncclCollNets[0]->name);
      }
    } else {
-      ncclCollNets[0] = &ncclCollNet_v6_as_v7;
-      ncclCollNet_v6_as_v7.init = ncclCollNet_v6_as_v7_init;
-      ncclCollNet_v6_as_v7.name = ncclCollNet_v6->name;
-      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v6)", ncclCollNets[0]->name);
+      ncclCollNets[0] = &ncclCollNet_v7_as_v8;
+      ncclCollNet_v7_as_v8.init = ncclCollNet_v7_as_v8_init;
+      ncclCollNet_v7_as_v8.name = ncclCollNet_v7->name;
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v7)", ncclCollNets[0]->name);
    }
  }
  return ncclSuccess;
@@ -330,6 +474,7 @@ static ncclResult_t netGetState(int i, enum ncclNetState* state) {
 }

 static ncclResult_t collNetGetState(int i, enum ncclNetState* state) {
+  pthread_mutex_lock(&netLock);
  if (ncclCollNetStates[i] == ncclNetStateInit) {
    int ndev;
    if (ncclCollNets[i]->init(ncclDebugLog) != ncclSuccess) ncclCollNetStates[i] = ncclNetStateDisabled;
@@ -337,6 +482,7 @@ static ncclResult_t collNetGetState(int i, enum ncclNetState* state) {
    else ncclCollNetStates[i] = ncclNetStateEnabled;
  }
  *state = ncclCollNetStates[i];
+  pthread_mutex_unlock(&netLock);
  return ncclSuccess;
 }

@@ -421,7 +567,7 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
    while (!connected) {

      // If we're aborting now, skip to cleanup
-      if (*comm->abortFlag) {
+      if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED)) {
        goto cleanup2;
      }

@@ -458,11 +604,9 @@ cleanup1:
 }

 int ncclNetVersion(struct ncclComm* comm) {
-  if (comm->ncclNet == &ncclNet_v5_as_v7) {
-    return 5;
-  } else if (comm->ncclNet == &ncclNet_v6_as_v7) {
-    return 6;
-  } else {
-    return 7;
-  }
+  return
+    (comm->ncclNet == &ncclNet_v5_as_v8) ? 5 :
+    (comm->ncclNet == &ncclNet_v6_as_v8) ? 6 :
+    (comm->ncclNet == &ncclNet_v7_as_v8) ? 7 :
+    8;
 }
@@ -353,20 +353,22 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr
    WARN("Proxy append out of bounds");
    return ncclInternalError;
  }
-
  //memset(sub, 0, sizeof(struct ncclProxySubArgs));
  sub->connection = op->connection;
  sub->channelId = op->channelId;
  sub->nsteps = op->nsteps;
  sub->nbytes = op->nbytes;
  sub->peer = op->root;
+  sub->reg = op->reg;
+  sub->buffer = op->buffer;
  args->nsubs = subIndex+1;
  if (subIndex) {
    if ((args->sliceSteps != op->sliceSteps) ||
        (args->chunkSteps != op->chunkSteps) ||
        (args->protocol != op->protocol) ||
        (args->dtype != op->dtype) ||
-        (args->redOp != op->redOp)) {
+        (args->redOp != op->redOp) ||
+        (args->coll != op->coll)) {
      WARN("Proxy append mismatch");
      return ncclInternalError;
    }
@@ -386,6 +388,8 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr
  args->redOp = op->redOp;
  args->pattern = op->pattern;
  args->protocol = op->protocol;
+  args->coll = op->coll;
+  args->specifics = op->specifics;
  args->state = ncclProxyOpReady;
  args->progress = op->connection->tcomm->proxyProgress;
  args->proxyAppendPtr = op->connection->proxyAppendPtr;
@@ -595,7 +599,7 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool

 NCCL_PARAM(ChunkSize, "CHUNK_SIZE", 0);

-ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op) {
+ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op, int reg) {
  memset(op, 0, sizeof(struct ncclProxyOp));
  int channelId = info->channelId;
  struct ncclChannel* channel = info->comm->channels+channelId;
@@ -616,15 +620,17 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op)
    op->pattern = ncclPatternSend;
    if (op->root != info->comm->rank && peer->send[1].transportComm == &netTransport.send) {
      // Tune chunk size for the network
-      if (info->count < stepSize) info->chunkSize /= 4;
+      if (info->protocol == NCCL_PROTO_SIMPLE && info->count < stepSize) info->chunkSize /= 4;
      else if (info->count < 8*stepSize) info->chunkSize /= 2;
+      if (info->protocol == NCCL_PROTO_SIMPLE && peer->send[1].proxyConn.sameProcess) op->reg = reg;
    }
  } else if (info->coll == ncclFuncRecv) {
    op->pattern = ncclPatternRecv;
    if (op->root != info->comm->rank && peer->recv[1].transportComm == &netTransport.recv) {
      // Tune chunk size for the network
-      if (info->count < stepSize) info->chunkSize /= 4;
+      if (info->protocol == NCCL_PROTO_SIMPLE && info->count < stepSize) info->chunkSize /= 4;
      else if (info->count < 8*stepSize) info->chunkSize /= 2;
+      if (info->protocol == NCCL_PROTO_SIMPLE && peer->recv[1].proxyConn.sameProcess) op->reg = reg;
    }
  } else {
    WARN("P2p operation is neither send or recv");
@@ -633,17 +639,21 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op)
  if (ncclParamChunkSize() != 0) {
    info->chunkSize = ncclParamChunkSize();
  }
+  op->buffer = op->reg ? info->recvbuff : NULL;
  op->chunkSize = info->chunkSize;
+  op->nbytes = info->count;

  // Compute nSteps for proxies
  int chunkEffectiveSize = op->chunkSize;
  if (op->protocol == NCCL_PROTO_LL) {
    chunkEffectiveSize /= 2;
+    op->nbytes *= 2;
+    op->nbytes = DIVUP(op->nbytes, sizeof(union ncclLLFifoLine)) * sizeof(union ncclLLFifoLine);
  }

-  op->nbytes = stepSize;
+  if (!op->reg) op->nbytes = std::min(op->nbytes, (ssize_t)info->chunkSize);
  op->nsteps = DIVUP(info->count, chunkEffectiveSize);
-  if (op->nsteps == 0) op->nsteps = 1;
+  if (op->nsteps == 0 || op->reg) op->nsteps = 1;

  return ncclSuccess;
 }
@@ -1074,35 +1084,60 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
  return ncclSuccess;
 }

-// cuMem API support
-// The response is sent out-of-band using ncclIpcSocket for this specific command
-ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void *handle, int* convertedFd) {
-  ncclResult_t ret = ncclSuccess;
-  ncclResult_t res = ncclInProgress;
+// UDS support
+ncclResult_t ncclProxyCallBlockingUDS(struct ncclComm* comm, int tpRank, int type, void* reqBuff, int reqSize, void* respBuff, int respSize, int *respFd) {
+  ncclResult_t res = ncclSuccess;
  struct ncclIpcSocket ipcSock = { 0 };
  void *opId = (void*)((((uintptr_t)random()) << 32) | random());

-  // Create a UDS socket to receive the converted fd
-  NCCLCHECK(ncclIpcSocketInit(&ipcSock, comm->topParentLocalRanks[comm->localRank], (uint64_t)opId, comm->abortFlag));
+  int rank = comm->topParentLocalRanks[comm->localRank];
+  struct ncclProxyState* sharedProxyState = comm->proxyState;
+  uint64_t pidHash = sharedProxyState->peerAddressesUDS[tpRank];

-  // Request the allocation of a UDS fd for the handle over sockets
-  NCCLCHECKGOTO(ncclProxyCallAsync(comm, proxyConn, ncclProxyMsgGetFd, handle, sizeof(CUmemGenericAllocationHandle), 0, opId), ret, error);
+  INFO(NCCL_PROXY, "ProxyCall UDS comm %p rank %d tpRank %d(%lx) reqSize %d respSize %d respFd %p opId %p",
+       comm, rank, tpRank, pidHash, reqSize, respSize, respFd, opId);

-  // Receive the converted fd over UDS
-  NCCLCHECKGOTO(ncclIpcSocketRecvFd(&ipcSock, convertedFd), ret, error);
-  TRACE(NCCL_PROXY, "UDS: ClientGetFd handle 0x%lx rank %d returned fd %d", *(uint64_t*)handle, proxyConn->tpLocalRank, *convertedFd);
-  NCCLCHECKGOTO(ncclIpcSocketClose(&ipcSock), ret, error);
+  // cuMem: Create a UDS socket to receive the response
+  NCCLCHECK(ncclIpcSocketInit(&ipcSock, rank, (uint64_t)opId, comm->abortFlag));

-  // Wait for proxy response (sockets)
-  while (res == ncclInProgress) {
-    res = ncclPollProxyResponse(comm, proxyConn, NULL, opId);
-  }
+  ncclIpcHdr hdr;
+  hdr.type = type;
+  hdr.rank = rank;
+  hdr.reqSize = reqSize;
+  hdr.respSize = respSize;
+  hdr.opId = opId;
+  assert(reqSize <= sizeof(hdr.data));
+  memcpy(&hdr.data, reqBuff, reqSize);
+  NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &hdr, sizeof(hdr), -1, tpRank, pidHash), res, error);
+  NCCLCHECKGOTO(ncclIpcSocketRecvMsg(&ipcSock, respBuff, respSize, respFd), res, error);
+  NCCLCHECKGOTO(ncclIpcSocketClose(&ipcSock), res, error);
+
+  INFO(NCCL_PROXY, "ProxyCall UDS comm %p rank %d tpRank %d(%lx) reqSize %d respSize %d respFd %d opId %p - DONE",
+       comm, rank, tpRank, pidHash, reqSize, respSize, (respFd ? *respFd : -1), opId);
+
+  return res;
+
+error:
+  NCCLCHECK(ncclIpcSocketClose(&ipcSock));
+  WARN("ncclProxyCallBlockingUDS call to tpRank %d(%lx) failed : %d", tpRank, pidHash, res);
+  return res;
+}
+
+// cuMem API support
+// The request/response is sent out-of-band using ncclIpcSocket for this specific command
+ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int tpRank, void *handle, int* convertedFd) {
+  ncclResult_t ret = ncclSuccess;
+
+  // Request the allocation of a UDS fd for the handle
+  NCCLCHECKGOTO(ncclProxyCallBlockingUDS(comm, tpRank, ncclProxyMsgGetFd, handle, sizeof(CUmemGenericAllocationHandle), NULL, 0, convertedFd), ret, error);
+
+  // We have now received the converted fd over UDS
+  INFO(NCCL_PROXY, "UDS: ClientGetFd handle 0x%lx tpRank %d returned fd %d", *(uint64_t*)handle, tpRank, *convertedFd);

  return ret;

 error:
-  NCCLCHECK(ncclIpcSocketClose(&ipcSock));
-  WARN("ncclProxyClientGetFd call to rank %d handle 0x%lx failed : %d", proxyConn->tpRank, *(uint64_t*)handle, ret);
+  WARN("ncclProxyClientGetFd call to tpRank %d handle 0x%lx failed : %d", tpRank, *(uint64_t*)handle, ret);
  return ret;
 }

@@ -1137,7 +1172,7 @@ error:
 ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId) {
  struct ncclProxyState* sharedProxyState = comm->proxyState;
  // Receive the connection pointer from the Proxy
-  if (*comm->abortFlag) {
+  if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED)) {
    WARN("Comm %p is in abort state", comm);
    return ncclInternalError;
  }
@@ -1292,13 +1327,13 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr
 }

 // cuMem API support
-static ncclResult_t proxyGetFd(struct ncclProxyLocalPeer* peer, void *opId, struct ncclProxyState* proxyState, uint64_t handle) {
+static ncclResult_t proxyGetFd(struct ncclProxyState* proxyState, int rank, void *opId, uint64_t handle) {
 #if CUDART_VERSION >= 11030
  // cuMem API support
  ncclResult_t ret = ncclSuccess;
  struct ncclIpcSocket ipcSock = { 0 };
  uint64_t hash = (uint64_t) opId;
-  INFO(NCCL_PROXY, "UDS proxyGetFd received handle 0x%lx peer %d opId %lx", handle, peer->tpLocalRank, hash);
+  INFO(NCCL_PROXY, "UDS proxyGetFd received handle 0x%lx peer %d opId %lx", handle, rank, hash);

  CUmemAllocationHandleType type = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
  int fd = -1;
@@ -1306,7 +1341,7 @@ static ncclResult_t proxyGetFd(struct ncclProxyLocalPeer* peer, void *opId, stru
  CUCHECK(cuMemExportToShareableHandle(&fd, handle, type, 0));
  // Send back the converted fd using UDS
  NCCLCHECKGOTO(ncclIpcSocketInit(&ipcSock, proxyState->tpRank, hash^1, proxyState->abortFlag), ret, error);
-  NCCLCHECKGOTO(ncclIpcSocketSendFd(&ipcSock, fd, peer->tpLocalRank, hash), ret, error);
+  NCCLCHECKGOTO(ncclIpcSocketSendFd(&ipcSock, fd, rank, hash), ret, error);
 error:
  NCCLCHECK(ncclIpcSocketClose(&ipcSock));
  // We can now safely close the exported fd
@@ -1331,11 +1366,8 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclP
    TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgSharedInit opId=%p op.reqBuff=%p nChannels=%d", op->opId, op->reqBuff, nChannels);
    if (op->connection->tcomm->proxySharedInit) res = op->connection->tcomm->proxySharedInit(op->connection, proxyState, nChannels);
    __atomic_store_n(&op->connection->state, connSharedInitialized, __ATOMIC_RELEASE);
-  } else if (op->type == ncclProxyMsgGetFd) {
-    uint64_t handle = *(uint64_t*)op->reqBuff;
-    TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgGetFd opId=%p op.reqBuff=%p handle=0x%lx", op->opId, op->reqBuff, handle);
-    res = proxyGetFd(peer, op->opId, proxyState, handle); // cuMem API support
-  } else if (op->type == ncclProxyMsgInit) {
+  }
+  else if (op->type == ncclProxyMsgInit) {
    TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgInit opId=%p op.reqBuff=%p", op->opId, op->reqBuff);
    res = proxyConnInit(peer, connectionPool, proxyState, (ncclProxyInitReq*) op->reqBuff, (ncclProxyInitResp*) op->respBuff, &op->connection);
  } else return ncclInternalError;
@@ -1365,7 +1397,7 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclP
    (*asyncOpCount)--;
    return ncclSuccess;

-  } else if (*proxyState->abortFlag != 0) {
+  } else if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_RELAXED) != 0) {
    return ncclInternalError;
  }

@@ -1451,7 +1483,7 @@ void* ncclProxyService(void* _args) {
    /* Even if local comm aborts, we cannot let proxy thread exit if we still have peer
     * connections. Need to wait until all other related comms call abort and safely exit
     * together, or we could face segmentation fault. */
-    if (*proxyState->abortFlag != 0) stop = 1;
+    if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_RELAXED) != 0) stop = 1;
    /* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */
    int ret;
    do {
@@ -1568,13 +1600,74 @@ void* ncclProxyService(void* _args) {
  return NULL;
 }

-ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses) {
+
+// Handle a single request received on the proxy's UDS socket.
+// One ncclIpcHdr is read from proxyState->ipcSock and dispatched on its type.
+// Only ncclProxyMsgGetFd is handled; any other type returns ncclInternalError.
+// reqFd is not used by this function.
+static ncclResult_t proxyUDSRecvReq(struct ncclProxyState* proxyState, int reqFd) {
+  ncclIpcHdr request;
+  NCCLCHECK(ncclIpcSocketRecvMsg(&proxyState->ipcSock, &request, sizeof(request), NULL));
+
+  if (request.type != ncclProxyMsgGetFd) {
+    return ncclInternalError;
+  }
+
+  // cuMem API support: the handle is read from the start of the header payload.
+  uint64_t handle = *(uint64_t*)request.data;
+  INFO(NCCL_PROXY, "proxyUDSRecvReq::ncclProxyMsgGetFd rank %d opId %p handle=0x%lx", request.rank, request.opId, handle);
+  return proxyGetFd(proxyState, request.rank, request.opId, handle);
+}
+
+// UDS fd handle support
+// Thread entry point that serves requests arriving on proxyState->ipcSock.
+// _args is the struct ncclProxyState* to serve. The loop polls the socket with
+// a 500 ms timeout and exits once proxyState->stop or the abort flag is set;
+// the socket is closed on that exit path. Always returns NULL.
+void* ncclProxyServiceUDS(void* _args) {
+  struct ncclProxyState* proxyState =  (struct ncclProxyState*) _args;
+  struct pollfd pollfds[1];
+
+  if (setProxyThreadContext(proxyState)) {
+    INFO(NCCL_INIT, "[Proxy Service UDS] Created CUDA context on device %d", proxyState->cudaDev);
+  } else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) {
+    WARN("[Proxy Service UDS] Failed to set CUDA device %d", proxyState->cudaDev);
+  }
+
+  if (ncclIpcSocketGetFd(&proxyState->ipcSock, &pollfds[0].fd) != ncclSuccess) {
+    WARN("[Proxy Service UDS] Get listenSock fd fails");
+    return NULL;
+  }
+  pollfds[0].events = POLLIN|POLLHUP;
+
+  while (1) {
+    /* Never let the proxy service thread block in poll, or it cannot notice abortFlag. */
+    int ret;
+    do {
+      ret = poll(pollfds, 1, 500);
+    } while (ret < 0 && errno == EINTR);
+    if (ret < 0) {
+      WARN("[Proxy Service UDS] Poll failed: %s", strerror(errno));
+      return NULL;
+    }
+
+    // Check for stop/abort. The abort flag is read with an atomic load, matching
+    // the other abortFlag checks in the proxy code.
+    if (proxyState->stop || __atomic_load_n(proxyState->abortFlag, __ATOMIC_RELAXED)) break;
+
+    if (pollfds[0].revents) {
+      // A request was seen on the UDS fd. Its result is not checked here, so a
+      // failed request does not terminate the service loop.
+      proxyUDSRecvReq(proxyState, pollfds[0].fd);
+    }
+  }
+
+  ncclIpcSocketClose(&proxyState->ipcSock);
+  INFO(NCCL_PROXY, "[Proxy Service UDS] exit: stop %d abortFlag %d", proxyState->stop, __atomic_load_n(proxyState->abortFlag, __ATOMIC_RELAXED));
+  return NULL;
+}
+
+ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses, uint64_t *peerAddressesUDS) {
  assert(comm->sharedRes->proxyState == NULL);
  NCCLCHECK(ncclCalloc(&comm->sharedRes->proxyState, 1));
  comm->proxyState = comm->sharedRes->proxyState;
  comm->proxyState->refCount = 1;
  comm->proxyState->listenSock = sock;
  comm->proxyState->peerAddresses = peerAddresses;
+  comm->proxyState->peerAddressesUDS = peerAddressesUDS;
+
+  // UDS support
+  NCCLCHECK(ncclIpcSocketInit(&comm->proxyState->ipcSock, comm->rank, peerAddressesUDS[comm->rank], comm->abortFlag));
  // Seed the random number generator for UDS filename generation
  struct timeval time;
  gettimeofday(&time,NULL);
@@ -1606,6 +1699,11 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) {

    pthread_create(&comm->proxyState->thread, NULL, ncclProxyService, comm->proxyState);
    ncclSetThreadName(comm->proxyState->thread, "NCCL Service %2d", comm->cudaDev);
+
+    // UDS support
+    INFO(NCCL_PROXY, "UDS: Creating service thread comm %p rank %d", comm, comm->rank);
+    pthread_create(&comm->proxyState->threadUDS, NULL, ncclProxyServiceUDS, comm->proxyState);
+    ncclSetThreadName(comm->proxyState->threadUDS, "NCCL UDS Service %2d", comm->cudaDev);
  }
  return ncclSuccess;
 }
@@ -1615,8 +1713,13 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
    struct ncclProxyState* sharedProxyState = comm->proxyState;

    if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) {
+      if (comm->proxyState->threadUDS) {
+        // UDS support
+        comm->proxyState->stop = 1;
+      }
+
      if (sharedProxyState->peerAddresses) {
-        if (*comm->abortFlag == 0) {
+        if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED) == 0) {
          struct ncclSocket sock;
          int type = ncclProxyMsgStop;
          NCCLCHECK(ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag));
@@ -1641,7 +1744,7 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
              }
            }
            int type = ncclProxyMsgClose;
-            if (*comm->abortFlag == 0) NCCLCHECK(ncclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int)));
+            if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED) == 0) NCCLCHECK(ncclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int)));
            NCCLCHECK(ncclSocketClose(sharedProxyState->peerSocks + i));
          }
        }
@@ -1657,6 +1760,7 @@ ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {

  assert(sharedProxyState->refCount == 0);
  free(sharedProxyState->peerAddresses);
+  free(sharedProxyState->peerAddressesUDS);
  free(sharedProxyState->peerSocks);
  free(sharedProxyState->proxyOps);
  free(sharedProxyState->sharedDevMems);
@@ -0,0 +1,182 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "argcheck.h" // Need some checks here since we access comm
+#include "nccl.h"
+#include "comm.h"
+#include "net.h"
+#include "register.h"
+
+// Releases every network memory registration recorded in `reg` and resets it
+// to the unregistered state (nDevs == 0, handles == NULL).
+// Safe to call when the handle array was never allocated. All handles are
+// released even if one deregMr call fails; the first failure is returned.
+ncclResult_t ncclNetDeregister(struct ncclComm* comm, struct ncclReg* reg) {
+  struct ncclRegCache* cache = &comm->regCache;
+  ncclResult_t ret = ncclSuccess;
+  ncclDebugNoWarn = NCCL_NET;
+  // reg->nDevs can be non-zero before reg->handles has been allocated, so the
+  // array must be checked before it is indexed.
+  if (reg->handles != NULL) {
+    for (int d=0; d<reg->nDevs; d++) {
+      if (reg->handles[d] == NULL) continue;
+      ncclResult_t res = comm->ncclNet->deregMr(cache->sComms[reg->devs[d]], reg->handles[d]);
+      // Keep going on failure so the array is always freed and ncclDebugNoWarn
+      // is always restored below; remember only the first error.
+      if (res != ncclSuccess && ret == ncclSuccess) ret = res;
+    }
+  }
+  reg->nDevs = 0;
+  free(reg->handles);
+  reg->handles = NULL;
+  ncclDebugNoWarn = 0;
+  return ret;
+}
+
+// Registers the buffer [addr, addr+size) with the local network devices used
+// by the p2p channels and records the result in reg->devs / reg->handles.
+// - Nothing is registered (nDevs == 0) when there is no network, or when any
+//   of the devices reports regIsGlobal == 0.
+// - A failing regMr is not an error: registrations made so far are dropped and
+//   the function still returns success with nDevs == 0.
+// - Any other failing topology/plugin call is returned as an error after the
+//   partial registrations have been released.
+// A loopback send/recv comm pair is created per device on first use and kept
+// in the registration cache (cache->sComms / cache->rComms).
+ncclResult_t ncclNetRegister(struct ncclComm* comm, void* addr, size_t size, struct ncclReg* reg) {
+  struct ncclRegCache* cache = &comm->regCache;
+  int netCount;
+  NCCLCHECK(ncclTopoGetNetCount(comm->topo, &netCount));
+  if (netCount == 0) return ncclSuccess;
+
+  ncclResult_t ret = ncclSuccess;
+  // Listen comm of the loopback connection currently being set up, if any.
+  // Kept at function scope so every exit path can close it.
+  void *lComm = NULL;
+
+  // Find local devices for p2p operations
+  for (int c=0; c<comm->p2pnChannels; c++) {
+    int dev;
+    if (ncclTopoGetLocalNet(comm->topo, comm->rank, c, &dev) != ncclSuccess) goto end; // No local net
+    ncclNetProperties_t props;
+    NCCLCHECKGOTO(comm->ncclNet->getProperties(dev, &props), ret, end);
+    if (props.regIsGlobal == 0) { // We need to be sure all NICs support global registration.
+      reg->nDevs = 0;
+      break;
+    }
+    int found = 0;
+    for (int d=0; d<reg->nDevs; d++) if (reg->devs[d] == dev) found = 1;
+    if (!found) reg->devs[reg->nDevs++] = dev;
+  }
+
+  NCCLCHECKGOTO(ncclCalloc(&reg->handles, reg->nDevs), ret, end);
+
+  ncclDebugNoWarn = NCCL_NET;
+  for (int d=0; d<reg->nDevs; d++) {
+    int dev = reg->devs[d];
+    reg->handles[d] = NULL;
+
+    if (cache->sComms[dev] == NULL) {
+      // Create a loopback network comm object for that device to register the buffers.
+      ncclNetHandle_t netHandle;
+      bool connected = false;
+      NCCLCHECKGOTO(comm->ncclNet->listen(dev, &netHandle, &lComm), ret, end);
+      while (!connected) {
+        // Atomic read, matching the other abortFlag checks in the network code.
+        if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED)) {
+          goto end;
+        }
+        if (cache->sComms[dev] == NULL)
+          NCCLCHECKGOTO(comm->ncclNet->connect(dev, &netHandle, cache->sComms+dev, NULL), ret, end);
+        if (cache->rComms[dev] == NULL)
+          NCCLCHECKGOTO(comm->ncclNet->accept(lComm, cache->rComms+dev, NULL), ret, end);
+        connected = (cache->rComms[dev] != NULL) && (cache->sComms[dev] != NULL);
+      }
+      // Clear lComm before closing so the common exit path does not close it twice.
+      void *listenComm = lComm;
+      lComm = NULL;
+      NCCLCHECKGOTO(comm->ncclNet->closeListen(listenComm), ret, end);
+    }
+    if (comm->ncclNet->regMr(cache->sComms[dev], addr, size, NCCL_PTR_CUDA, reg->handles+d) != ncclSuccess) {
+      // Registration is optional: drop what was registered so far and report
+      // the outcome of that cleanup (success in the normal case).
+      reg->handles[d] = NULL;
+      ret = ncclNetDeregister(comm, reg);
+      goto done;
+    }
+  }
+end:
+  // Undo partial registrations on error. Without a handle array there is
+  // nothing to release, and ncclNetDeregister is not asked to walk a NULL array.
+  if (ret != ncclSuccess && reg->handles != NULL) (void)ncclNetDeregister(comm, reg);
+done:
+  // No handle array means no device is registered, whatever was collected above.
+  if (reg->handles == NULL) reg->nDevs = 0;
+  // Do not leak the listen comm when the connect loop was left early.
+  if (lComm != NULL) (void)comm->ncclNet->closeListen(lComm);
+  ncclDebugNoWarn = 0;
+  return ret;
+}
+
+// Looks up the cached registration that fully covers [data, data+size).
+// The range is widened to page granularity (cache->pageSize) before the search.
+// *reg receives the matching entry, or NULL when no entry covers the range.
+// Always returns ncclSuccess.
+ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg) {
+  struct ncclRegCache* cache = &comm->regCache;
+  const uintptr_t pageSize = cache->pageSize;
+  const uintptr_t firstPage = (uintptr_t)data & -pageSize;   // page-aligned start
+  const size_t nPages = ((uintptr_t)data + size - firstPage + pageSize-1)/pageSize;
+
+  *reg = NULL;
+  for (int idx = 0; idx != cache->population; idx++) {
+    struct ncclReg* entry = cache->slots[idx];
+    // Slots are inserted in ascending address order by ncclRegister, so the
+    // search stops at the first entry that starts after the requested range.
+    if (firstPage < entry->addr) break;
+    // Here firstPage >= entry->addr: check that the range ends inside the entry.
+    if ((firstPage - entry->addr)/pageSize + nPages <= entry->pages) {
+      *reg = entry;
+      break;
+    }
+  }
+  return ncclSuccess;
+}
+NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1);
+
+// Register [data, data+size) in the communicator's registration cache.
+//   - If an existing entry already covers the page-aligned range, its reference count is
+//     bumped and it is returned through *handle.
+//   - Otherwise a new entry is created, registered with the network and inserted at the
+//     position found by the scan (before the first entry with a higher start address).
+// When the LOCAL_REGISTER parameter is 0 nothing is registered and *handle is set to NULL.
+// Returns ncclSuccess, or the error of the failing allocation / network registration; on
+// error the cache is left exactly as it was before the call.
+ncclResult_t ncclRegister(struct ncclComm* comm, void* data, size_t size, void** handle) {
+  if (!ncclParamLocalRegister()) {
+    // Give the caller a well-defined handle instead of leaving it uninitialized.
+    if (handle != NULL) *handle = NULL;
+    return ncclSuccess;
+  }
+  struct ncclRegCache* cache = &comm->regCache;
+  uintptr_t pageSize = cache->pageSize;
+  uintptr_t addr = (uintptr_t)data & -pageSize;
+  size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
+  for (int slot=0; /*true*/; slot++) {
+    if ((slot == cache->population) || (addr < cache->slots[slot]->addr)) {
+      if (cache->population == cache->capacity) { // must grow cache
+        // Only commit the new capacity once the reallocation has succeeded.
+        const auto newCapacity = cache->capacity < 32 ? 32 : 2*cache->capacity;
+        NCCLCHECK(ncclRealloc(&cache->slots, cache->population, newCapacity));
+        cache->capacity = newCapacity;
+      }
+      // Build and register the entry *before* touching the slot array, so that a failure
+      // cannot leave a hole or a half-initialized entry in the searched range.
+      struct ncclReg* regSlot = NULL;
+      NCCLCHECK(ncclCalloc(&regSlot, 1));
+      regSlot->addr = addr;
+      regSlot->pages = pages;
+      regSlot->refs = 1;
+      ncclResult_t res = ncclNetRegister(comm, (void*)addr, pages*pageSize, regSlot);
+      if (res != ncclSuccess) {
+        // ncclNetRegister calls ncclNetDeregister on its own error path, so only the
+        // entry itself remains to be released here.
+        free(regSlot);
+        return res;
+      }
+      regSlot->state |= NET_REG_COMPLETE;
+      // Nothing can fail from here on: open the gap and publish the entry.
+      memmove(cache->slots+slot+1, cache->slots+slot, (cache->population-slot)*sizeof(struct ncclReg*));
+      cache->slots[slot] = regSlot;
+      cache->population += 1;
+      *handle = regSlot;
+      return ncclSuccess;
+    } else if ((addr >= cache->slots[slot]->addr) &&
+        ((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) {
+      // Range already covered by an existing registration: share it.
+      cache->slots[slot]->refs++;
+      *handle = cache->slots[slot];
+      return ncclSuccess;
+    }
+  }
+}
+
+// Release every cached registration of `comm` and close the send/recv comms stored in the
+// cache. Errors from the network / NVLS layers are returned immediately (as before).
+// On success the cache is reset to an empty state so that no pointer to freed memory
+// remains reachable through it.
+ncclResult_t ncclRegCleanup(struct ncclComm* comm) {
+  struct ncclRegCache* cache = &comm->regCache;
+  for (int i=0; i<cache->population; i++) {
+    struct ncclReg* reg = cache->slots[i];
+    INFO(NCCL_INIT, "Cleanup buffer %p pages %lx", (void*)reg->addr, reg->pages);
+    NCCLCHECK(ncclNetDeregister(comm, reg));
+    if (reg->state & NVLS_REG_COMPLETE) NCCLCHECK(ncclNvlsDeregBuffer(&reg->mcHandle, reg->regAddr, reg->dev, reg->regSize));
+    free(reg);
+  }
+  free(cache->slots);
+  // Do not leave dangling pointers / stale counts behind: a later lookup or a second
+  // cleanup must see an empty cache rather than freed entries.
+  cache->slots = NULL;
+  cache->population = 0;
+  cache->capacity = 0;
+  for (int d=0; d<MAXCHANNELS; d++) {
+    if (cache->sComms[d]) {
+      NCCLCHECK(comm->ncclNet->closeSend(cache->sComms[d]));
+      cache->sComms[d] = NULL;
+    }
+    if (cache->rComms[d]) {
+      NCCLCHECK(comm->ncclNet->closeRecv(cache->rComms[d]));
+      cache->rComms[d] = NULL;
+    }
+  }
+  return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle);
+// Public API: register the user buffer [buff, buff+size) with `comm`. The resulting
+// registration is returned through `handle` and is later released with ncclCommDeregister.
+ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) {
+  NCCLCHECK(PtrCheck(comm, "ncclCommRegister", "comm"));
+  // `handle` is written through by ncclRegister; reject NULL here rather than crash there.
+  NCCLCHECK(PtrCheck(handle, "ncclCommRegister", "handle"));
+  if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(buff, comm, "buff", "ncclCommRegister"));
+  NCCLCHECK(ncclRegister(comm, buff, size, handle));
+  return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle);
+// Public API: drop one reference on the registration identified by `handle`.
+// When the last reference goes away the buffer is deregistered from the network (and from
+// NVLS if it had been registered there), the entry is freed and removed from the cache.
+// Returns ncclInvalidUsage if `handle` is not a registration of this communicator.
+ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) {
+  // Report the API that was actually called in the diagnostic.
+  NCCLCHECK(PtrCheck(comm, "ncclCommDeregister", "comm"));
+  // A NULL handle refers to no registration; treat it as a no-op, like free(NULL).
+  if (handle == NULL) return ncclSuccess;
+  struct ncclReg* reg = (struct ncclReg*)handle;
+  struct ncclRegCache* cache = &comm->regCache;
+  int slot;
+  // Locate the entry by identity; the handle is only dereferenced once it has been found.
+  for (slot=0; slot<cache->population && cache->slots[slot] != reg; slot++);
+  if (slot == cache->population) {
+    WARN("Deregister: Could not find handle");
+    return ncclInvalidUsage;
+  }
+  // Still referenced by another registration of the same range.
+  if (--reg->refs) return ncclSuccess;
+  NCCLCHECK(ncclNetDeregister(comm, reg));
+  if (reg->state & NVLS_REG_COMPLETE) {
+    NCCLCHECK(ncclNvlsDeregBuffer(&reg->mcHandle, reg->regAddr, reg->dev, reg->regSize));
+    reg->regAddr = (CUdeviceptr)NULL;
+  }
+  free(reg);
+  // Close the gap left by the removed entry.
+  memmove(cache->slots+slot, cache->slots+slot+1, (cache->population-slot-1)*sizeof(struct ncclReg*));
+  cache->population -= 1;
+  return ncclSuccess;
+}
@@ -341,10 +341,10 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
    for (int r = 0; r < nranks; r++) {
      if (allConnects[r].isMaster) {
        memcpy(masterConnects+c, &(allConnects[r].connect), sizeof(struct ncclConnect));
-        if (r == rank) rankInCollNet = c;
        c++;
      }
    }
+    if (isMaster) rankInCollNet = comm->node;
  } else { // send side : copy in connect info received from peer recv master
    if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect));
  }
@@ -87,8 +87,8 @@ struct connectMap {
 };

 struct reqSlot {
-  volatile void* recvBuff;
-  volatile int size;
+  bool turnIsSendNotRecv; // set to true by recvProxyProgress when it posts a slot; cleared by sendProxyProgress once the collective on that slot is done
+  int size; // written by sendProxyProgress (recvEnd - recvBeg) before issuing the collective; read by recvProxyProgress to decide whether a flush is needed
 };

 struct sendResources {
@@ -246,9 +246,11 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne

  struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
  send->conn.tail = &recvMem->tail;
-  send->conn.sizesFifo = recvMem->sizesFifo;
-  for (int i=0; i<NCCL_STEPS; i++) send->conn.sizesFifo[i] = -1;
-  send->conn.offsFifo = recvMem->offsFifo;
+  send->conn.connFifo = recvMem->connFifo;
+  for (int i=0; i<NCCL_STEPS; i++) {
+    send->conn.connFifo[i].size = -1;
+    send->conn.connFifo[i].mode = NCCL_MODE_OFFSET;
+  }

  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
    send->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
@@ -277,7 +279,10 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
  struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
  void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr;
  recv->conn.tail = gdcMem ? (uint64_t*)gdcMem : &recvMem->tail;
-  recv->conn.offsFifo = recvMem->offsFifo;
+  recv->conn.connFifo = recvMem->connFifo;
+  for (int i=0; i<NCCL_STEPS; i++) {
+    recv->conn.connFifo[i].mode = NCCL_MODE_OFFSET;
+  }

  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
    recv->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
@@ -478,7 +483,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
  resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem);
  resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem);
  // Don't give credits yet in shared mode.
-  resources->sendMem->head = -NCCL_STEPS;
+  (resources->gdcSync ? *resources->gdcSync : resources->sendMem->head) = -NCCL_STEPS;

  // Allocate & Register shared buffers for the Simple protocol
  int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
@@ -624,9 +629,49 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
  return ncclSuccess;
 }

+// Byte offset, in the collective's logical data, at which sub-channel `sub` starts for
+// `step`, clamped to a window:
+//   isAllNotOne != 0 : window is [0, nNodes*sizePerRank]
+//   isAllNotOne == 0 : window is [node*sizePerRank, (node+1)*sizePerRank]
+static size_t calcAlgoOffset(struct ncclProxyArgs* args, int isAllNotOne, int sub, uint64_t step) {
+  int chunkSize = args->chunkSize;
+  int nNodes = args->specifics.collnetDirect.nNodes;
+  int node = args->specifics.collnetDirect.node;
+  size_t sizePerRank = args->specifics.collnetDirect.sizePerRank;
+
+  // Unclamped position: chunks are laid out step-major, then by sub-channel.
+  size_t linear = (step*(args->nsubs) + sub)*chunkSize;
+
+  size_t lowerBound = 0;
+  size_t upperBound = nNodes*sizePerRank;
+  if (!isAllNotOne) {
+    lowerBound = (node+0)*sizePerRank;
+    upperBound = (node+1)*sizePerRank;
+  }
+  return std::min<size_t>(std::max<size_t>(linear, lowerBound), upperBound);
+}

-#define LAST_OF_GROUP(s) \
-  (s % COLLNET_GROUP_NSUBS == COLLNET_GROUP_NSUBS-1 || s == args->nsubs-1)
+// Offset of a sub-channel's data inside the shared collnet buffer region.
+// The region is indexed by (send/recv half, step slot); within that slot:
+//   - AllReduce: sub-channels are packed back to back, chunkSize bytes each.
+//   - otherwise: each group of COLLNET_GROUP_NSUBS subs starts at a slotSize-aligned
+//     position, and the sub is placed at its calcAlgoOffset distance from the group start.
+// `side` selects the beginning (0) or the end (1) of the sub's range.
+static int calcRegionOffset(
+    struct ncclProxyArgs* args, int isRecvNotSend, int sub, uint64_t step,
+    int side // 0=begin, 1=end
+  ) {
+  struct ncclCollNetSharedRes* shared = args->subs[0].connection->collNet;
+  int slotSize = shared->buffSize/NCCL_STEPS;
+  int regionBase = isRecvNotSend*NCCL_STEPS + (step%NCCL_STEPS);
+  regionBase *= shared->nChannels*slotSize;
+
+  if (args->coll == ncclFuncAllReduce) {
+    int chunkSize = args->chunkSize;
+    return regionBase + (sub+side)*chunkSize;
+  }
+
+  int isAllNotOne = isRecvNotSend ^ (args->coll == ncclFuncReduceScatter);
+  int groupFirstSub = sub - (sub%COLLNET_GROUP_NSUBS);
+  size_t groupOffset = groupFirstSub*slotSize;
+  groupOffset += calcAlgoOffset(args, isAllNotOne, sub+side, step)
+               - calcAlgoOffset(args, isAllNotOne, groupFirstSub, step);
+  return regionBase + groupOffset;
+}
+
+#define LAST_OF_GROUP(args, s) \
+  ((s)%COLLNET_GROUP_NSUBS == COLLNET_GROUP_NSUBS-1 || (s) == (args)->nsubs-1)
+
+static constexpr int calcStepsPerGroup(int nGroups) { // nGroups is currently unused
+  //return NCCL_STEPS/nGroups;  <- per-group split of the step window, disabled
+  return NCCL_STEPS; // every group is allowed the full NCCL_STEPS window past its `done` counter
+}

 static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
  if (args->state == ncclProxyOpReady) {
@@ -645,88 +690,117 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
  if (args->state == ncclProxyOpProgress) {
    int p = NCCL_PROTO_SIMPLE;
    int nGroups = DIVUP(args->nsubs, COLLNET_GROUP_NSUBS);
-    int perGroupSteps = NCCL_STEPS / nGroups;
    for (int s=0; s<args->nsubs; s++) {
      struct ncclProxySubArgs* sub = args->subs+s;
      struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources);
      void* sendMhandle = resources->sendMhandles[p];
      void* recvMhandle = resources->recvMhandles[p];
+      char* region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[p]);
      auto reqFifo = resources->reqFifo;
+      int group = s/COLLNET_GROUP_NSUBS;
+      int groupStart = s - (s%COLLNET_GROUP_NSUBS);
+
      if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) {
        int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
-        int sharedBuffSlot = sub->posted%NCCL_STEPS;
-        int offset;
-        NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 0, sharedBuffSlot, 0, &offset));
-        resources->recvMem->offsFifo[buffSlot] = offset + s*args->chunkSize;
+        resources->recvMem->connFifo[buffSlot].offset = calcRegionOffset(args, 0, s, sub->posted, 0);
        __sync_synchronize();
        volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
+        TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] posted offset %d @ %p signal %ld->%ld", long(sub->posted), group, buffSlot, resources->recvMem->connFifo[buffSlot].offset, &resources->recvMem->connFifo[buffSlot].offset, long(*sendHead), long(sub->base + sub->posted + args->sliceSteps - NCCL_STEPS));
        sub->posted += args->sliceSteps;
        *sendHead = sub->base + sub->posted - NCCL_STEPS;
        if (resources->gdcSync) wc_store_fence(); // Flush out WC write
      }
-      // Enforce sync between operations of the same group.
-      bool groupSync = (((s == 0) && ((sub+args->nsubs-1)->received == sub->received)) || (s && (sub-1)->received > sub->received));
-      if (groupSync && sub->received < sub->posted && sub->received < sub->done + perGroupSteps) {
+      if (sub->received < sub->posted && sub->received < sub->done + calcStepsPerGroup(nGroups)) {
        int buffSlot = (sub->base+sub->received)%NCCL_STEPS;
-        int sharedBuffSlot = sub->received%NCCL_STEPS;
-        volatile int* sizesFifo = resources->recvMem->sizesFifo;
+        volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
        volatile uint64_t* recvTail = &resources->recvMem->tail;
-        char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[p]);
-        if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->received)))) {
-          // We have something to receive, let's check whether data is ready.
-          int ready = 1;
-          if (s == 0) {
-            int offset;
-            NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 0, sharedBuffSlot, 0, &offset));
-            args->sharedBuff[sharedBuffSlot] = localBuff + offset;
-            args->sharedSize[sharedBuffSlot] = args->chunkSize;
+        if (connFifo[buffSlot].size != -1 && ((*recvTail > (sub->base+sub->received)))) {
+          if (args->coll != ncclFuncAllReduce) {
+            int sendBeg = calcRegionOffset(args, 0, s, sub->received, 0);
+            int sendEnd = calcRegionOffset(args, 0, s, sub->received, 1);
+            if (sendEnd-sendBeg != connFifo[buffSlot].size) {
+              WARN("CollNet sizes: want=%d got=%ld", sendEnd-sendBeg, connFifo[buffSlot].size);
+              return ncclInternalError;
+            }
          }
-          if (ready) {
-            sizesFifo[buffSlot] = -1;
-            sub->received += args->sliceSteps;
-            args->idle = 0;
-            //continue;
-            // flush HDP if not done
-            if (resources->curr_hdp_reg && args->hdp_flushed < *recvTail) {
-              args->hdp_flushed = *recvTail;
-              *resources->curr_hdp_reg = 1;
+          connFifo[buffSlot].size = -1;
+          sub->received += args->sliceSteps;
+          args->idle = 0;
+        }
+      }
+      // Enforce collective ordering of collnet ops.
+      bool ordered = s==0 ? args->subs[args->nsubs-1].transmitted == sub->transmitted
+                          : sub->transmitted < (sub-1)->transmitted;
+      if (ordered && (sub->transmitted < sub->received)) {
+        if (LAST_OF_GROUP(args, s)) {
+          int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
+          if (!reqFifo[group][buffSlot].turnIsSendNotRecv) continue;
+
+          ssize_t sizePerRank = 0;
+          size_t allBeg = calcAlgoOffset(args, 1, groupStart, sub->transmitted);
+          size_t allEnd = calcAlgoOffset(args, 1, s+1, sub->transmitted);
+          int sendBeg = calcRegionOffset(args, 0, groupStart, sub->transmitted, 0);
+          int sendEnd = calcRegionOffset(args, 0, s, sub->transmitted, 1);
+          int recvBeg = calcRegionOffset(args, 1, groupStart, sub->transmitted, 0);
+          int recvEnd = calcRegionOffset(args, 1, s, sub->transmitted, 1);
+          reqFifo[group][buffSlot].size = recvEnd - recvBeg;
+          size_t eltSize = ncclTypeSize((ncclDataType_t)args->dtype);
+
+          if (sendBeg==sendEnd && recvBeg==recvEnd) {
+            sub->requests[buffSlot] = nullptr; // trivally finished request
+          } else {
+            if (args->coll == ncclFuncAllReduce) {
+              int count = (sendEnd-sendBeg)/eltSize;
+              NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, region+sendBeg, region+recvBeg, count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot));
+            } else {
+              sizePerRank = args->specifics.collnetDirect.sizePerRank;
+              if (args->coll == ncclFuncAllGather) {
+                ncclNetSGE_v8_t recvParts;
+                recvParts.mhandle = recvMhandle;
+                recvParts.address = region + recvBeg;
+                recvParts.size = allEnd - allBeg;
+                NCCLCHECK(proxyState->ncclCollNet->iallgather(
+                  resources->collNetComm, region+sendBeg, 1, &recvParts,
+                  sizePerRank, allBeg, allEnd-allBeg,
+                  sendMhandle, sub->requests+buffSlot));
+              } else {
+                ncclNetSGE_v8_t sendParts;
+                sendParts.mhandle = sendMhandle;
+                sendParts.address = region + sendBeg;
+                sendParts.size = allEnd - allBeg;
+                NCCLCHECK(proxyState->ncclCollNet->ireducescatter(
+                  resources->collNetComm, 1, &sendParts, region+recvBeg,
+                  sizePerRank, allBeg, allEnd-allBeg,
+                  (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp,
+                  recvMhandle, sub->requests+buffSlot));
+              }
+            }
+            if (sub->requests[buffSlot] == nullptr) continue;
+
+            if (args->coll == ncclFuncAllReduce) {
+              TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Iallreduce posted, size %d req %p", (long)sub->transmitted, group, buffSlot, int(sendEnd-sendBeg), sub->requests[buffSlot]);
+            } else if (args->coll == ncclFuncAllGather) {
+              TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Iallgather posted sendSize=%ld recvOffset=%ld recvSize=%ld request=%p", (long)sub->transmitted, group, buffSlot, long(sizePerRank), long(allBeg), long(allEnd-allBeg), sub->requests[buffSlot]);
+            } else {
+              TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Ireducescatter posted sendOffset=%ld sendSize=%ld recvSize=%ld request=%p", (long)sub->transmitted, group, buffSlot, long(allBeg), long(allEnd-allBeg), long(sizePerRank), sub->requests[buffSlot]);
            }
          }
        }
-      }
-      if (LAST_OF_GROUP(s) && (sub->transmitted < sub->received)) {
-        int group = s / COLLNET_GROUP_NSUBS;
-        int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
-        int sharedBuffSlot = sub->transmitted%NCCL_STEPS;
-        if (reqFifo[group][buffSlot].recvBuff != NULL) {
-          int totalSize = (s-group*COLLNET_GROUP_NSUBS+1) * args->sharedSize[sharedBuffSlot];
-          int count = totalSize / ncclTypeSize((ncclDataType_t)args->dtype);
-          reqFifo[group][buffSlot].size = args->sharedSize[sharedBuffSlot];
-          char* sendAddress = (char*)args->sharedBuff[sharedBuffSlot] + group*COLLNET_GROUP_NSUBS*args->sharedSize[sharedBuffSlot];
-          NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot));
-          if (sub->requests[buffSlot] == NULL) continue;
-
-          TRACE(NCCL_NET, "sendProxy [%lu/%d/%d] Iallreduce posted, size %d req %p", sub->transmitted, group, buffSlot, totalSize, sub->requests[buffSlot]);
-          // Make sure size is reset to zero before we update the head.
-          __sync_synchronize();
-          sub->transmitted += args->sliceSteps;
-          args->idle = 0;
-          continue;
-        }
+        sub->transmitted += args->sliceSteps;
+        args->idle = 0;
+        continue;
      }
      // Check whether the network has completed some send operations.
-      if (LAST_OF_GROUP(s) && sub->done < sub->transmitted) {
+      if (LAST_OF_GROUP(args, s) && sub->done < sub->transmitted) {
        int done, size;
-        int group = s / COLLNET_GROUP_NSUBS;
        int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
-        NCCLCHECK(proxyState->ncclCollNet->test((void*)(sub->requests[buffSlot]), &done, &size));
+        done = 1;
+        if (sub->requests[buffSlot]) NCCLCHECK(proxyState->ncclCollNet->test((void*)(sub->requests[buffSlot]), &done, &size));
        if (done) {
-          TRACE(NCCL_NET, "sendProxy [%lu/%d/%d] request %p done, size %d", sub->done, group, buffSlot, sub->requests[buffSlot], size);
-          // Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush)
-          // (reordered store after store is possible on POWER, though not on x86)
-          __sync_synchronize();
-          reqFifo[group][buffSlot].recvBuff = NULL; // Notify recvProxy
-          for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].done += args->sliceSteps;
+          TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] request %p done, size %d", (long)sub->done, group, buffSlot, sub->requests[buffSlot], size);
+          sub->requests[buffSlot] = nullptr;
+          reqFifo[group][buffSlot].turnIsSendNotRecv = false; // Notify recvProxy
+          for (int i=groupStart; i<=s; i++) args->subs[i].done += args->sliceSteps;
          args->idle = 0;
          int allDone = 1;
          for (int i=0; i<args->nsubs; i++) {
@@ -734,7 +808,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
          }
          if (allDone) {
            args->state = ncclProxyOpNone;
-            TRACE(NCCL_NET, "sendProxy [%lu/%d] stopped", sub->done, s);
+            TRACE(NCCL_NET, "sendProxy [%ld/%d] stopped", (long)sub->done, s);
          }
        }
      }
@@ -752,6 +826,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
      sub->base = ROUNDUP(resources->step, args->chunkSteps);
      sub->posted = sub->received = sub->flushed = sub->transmitted = sub->done = 0;
      resources->step = sub->base + sub->nsteps;
+      memset(sub->requests, 0, sizeof(sub->requests));
    }
    args->state = ncclProxyOpProgress;
  }
@@ -759,38 +834,32 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
  if (args->state == ncclProxyOpProgress) {
    int p = NCCL_PROTO_SIMPLE;
    int nGroups = DIVUP(args->nsubs, COLLNET_GROUP_NSUBS);
-    int perGroupSteps = NCCL_STEPS / nGroups;
    for (int s=0; s<args->nsubs; s++) {
+      int group = s/COLLNET_GROUP_NSUBS;
+      int groupStart = s - (s%COLLNET_GROUP_NSUBS);
      struct ncclProxySubArgs* sub = args->subs+s;
      struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
      void* mhandle = resources->mhandles[p];
      auto reqFifo = resources->reqFifo;
-      char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
+      char* region = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);

      // Enforce sync between operations of the same group.
-      if (LAST_OF_GROUP(s) && (sub->posted < sub->done + perGroupSteps) && (sub->posted < sub->nsteps)) {
-        int group = s / COLLNET_GROUP_NSUBS;
+      if (LAST_OF_GROUP(args, s) && (sub->posted < sub->done + calcStepsPerGroup(nGroups)) && (sub->posted < sub->nsteps)) {
        int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
-        int sharedBuffSlot = sub->posted%NCCL_STEPS;
-        int startChannel = group*COLLNET_GROUP_NSUBS;
-        int offset;
-        NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 1, sharedBuffSlot, startChannel, &offset));
-        reqFifo[group][buffSlot].recvBuff = localBuff + offset;
-        TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] posted buffer %p", sub->posted, group, buffSlot, reqFifo[group][buffSlot].recvBuff);
+        reqFifo[group][buffSlot].turnIsSendNotRecv = true;
+        TRACE(NCCL_NET, "recvProxy [%ld/%d/%d] posted buffer", (long)sub->posted, group, buffSlot);
        sub->posted += args->sliceSteps;
        args->idle = 0;
        continue;
      }
-      if (LAST_OF_GROUP(s) && (sub->posted > sub->received)) {
-        int group = s / COLLNET_GROUP_NSUBS;
+      if (LAST_OF_GROUP(args, s) && (sub->received < sub->posted)) {
        int buffSlot = (sub->base+sub->received)%NCCL_STEPS;
-        int sharedBuffSlot = sub->received%NCCL_STEPS;
-        if (reqFifo[group][buffSlot].recvBuff == NULL) { // Buffer is cleared : coll is complete
-          args->sharedSize[sharedBuffSlot] = reqFifo[group][buffSlot].size;
-          int totalSize = args->sharedSize[sharedBuffSlot]*(s-group*COLLNET_GROUP_NSUBS+1);
-          TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] received, size %d", sub->received, group, buffSlot, totalSize);
+        if (!reqFifo[group][buffSlot].turnIsSendNotRecv) { // Buffer is cleared : coll is complete
+          int recvBeg = calcRegionOffset(args, 1, groupStart, sub->received, 0);
+          int recvEnd = calcRegionOffset(args, 1, s, sub->received, 1);
+          int totalSize = recvEnd - recvBeg;
+          TRACE(NCCL_NET, "recvProxy [%ld/%d/%d] received, size %d chunkSize=%d", (long)sub->received, group, buffSlot, totalSize, args->chunkSize);
          sub->received += args->sliceSteps;
-          sub->requests[buffSlot] = NULL;
          if (reqFifo[group][buffSlot].size > 0 && resources->useGdr && resources->needFlush) {
            // GDRCOPY support
            if (resources->gdcFlush) {
@@ -801,42 +870,31 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
              WARN("NET: GDR Flush only supported on x86_64");
              return ncclInternalError;
 #endif
-              sub->requests[buffSlot] = NULL;
            } else {
-              int startChannel = group*COLLNET_GROUP_NSUBS;
-              int offset;
-              NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 1, sharedBuffSlot, startChannel, &offset));
-              NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot));
+              NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, region+recvBeg, totalSize, mhandle, sub->requests+buffSlot));
            }
-          } else {
-            for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps;
          }
          args->idle = 0;
          continue;
        }
      }
-      if (LAST_OF_GROUP(s) && (sub->received > sub->flushed)) {
+      if (LAST_OF_GROUP(args, s) && (sub->flushed < sub->received)) {
        // Progress flush operations
-        int group = s / COLLNET_GROUP_NSUBS;
        int buffSlot = (sub->base + sub->flushed)%NCCL_STEPS;
        int done = 1;
        if (sub->requests[buffSlot]) NCCLCHECK(proxyState->ncclCollNet->test(sub->requests[buffSlot], &done, NULL));
        if (done) {
-          TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] flushed", sub->flushed, group, buffSlot);
+          sub->requests[buffSlot] = nullptr;
+          TRACE(NCCL_NET, "recvProxy [%ld/%d/%d] flushed", (long)sub->flushed, group, buffSlot);
          for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps;
          args->idle = 0;
          //continue;
        }
      }
-      if (sub->flushed > sub->transmitted) {
-        int group = s / COLLNET_GROUP_NSUBS;
+      if (sub->transmitted < sub->flushed) {
        int buffSlot = (sub->base + sub->transmitted)%NCCL_STEPS;
-        int sharedBuffSlot = sub->transmitted%NCCL_STEPS;
-        int startChannel = group*COLLNET_GROUP_NSUBS;
-        int offset;
-        NCCLCHECK(sharedBuffersGet(sub->connection->collNet, 1, sharedBuffSlot, startChannel, &offset));
-        volatile int* offsFifo = (volatile int*)resources->recvMem->offsFifo;
-        offsFifo[buffSlot] = offset + (s%COLLNET_GROUP_NSUBS)*args->chunkSize;
+        volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
+        connFifo[buffSlot].offset = calcRegionOffset(args, 1, s, sub->transmitted, 0);
        __sync_synchronize();
        volatile uint64_t* recvTail = resources->gdcSync ? resources->gdcSync : &resources->recvMem->tail;
        *recvTail = sub->base + sub->flushed;
@@ -848,14 +906,15 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
      // Enforce sync here to make sure the last sub doesn't increase "done" before all others in the group have
      // reached the same point, otherwise we would start posting buffers to the send proxy before we're done
      // processing all the shared buffer.
-      bool groupSync = (((s == 0) && ((sub+args->nsubs-1)->done == sub->done)) || (s && (sub-1)->done > sub->done));
+      bool groupSync = s==0 ? args->subs[args->nsubs-1].done == sub->done
+                            : (sub-1)->done > sub->done;
      volatile uint64_t* sendHead = &resources->sendMem->head;
      if (groupSync && sub->done < sub->transmitted && (sub->base+sub->done) < *sendHead) {
        sub->done += args->sliceSteps;
        args->idle = 0;
        if (sub->done == sub->nsteps && s == args->nsubs-1) {
          args->state = ncclProxyOpNone;
-          TRACE(NCCL_NET, "recvProxy [%lu/%d] stopped", sub->done, s);
+          TRACE(NCCL_NET, "recvProxy [%ld/%d] stopped", (long)sub->done, s);
        }
      }
    }
@@ -868,4 +927,4 @@ struct ncclTransport collNetTransport = {
  canConnect,
  { sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress },
  { recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress }
-};
+};
@@ -374,9 +374,12 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne

  struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
  send->conn.tail = &recvMem->tail;
-  send->conn.sizesFifo = recvMem->sizesFifo;
+  send->conn.connFifo = recvMem->connFifo;
  // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
-  send->conn.offsFifo = map->shared ? recvMem->offsFifo : NULL;
+  for (int i=0; i<NCCL_STEPS; i++) {
+    send->conn.connFifo[i].offset = -1;
+    recvMem->connFifo[i].mode = map->shared ? NCCL_MODE_OFFSET : NCCL_MODE_NORMAL;
+  }

  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
    send->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
@@ -436,9 +439,11 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
  struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
  void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr;
  recv->conn.tail = gdcMem ? (uint64_t*)gdcMem : &recvMem->tail;
-  recv->conn.sizesFifo = recvMem->sizesFifo;
+  recv->conn.connFifo = recvMem->connFifo;
  // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
-  recv->conn.offsFifo = map->shared ? recvMem->offsFifo : NULL;
+  for (int i=0; i<NCCL_STEPS; i++) {
+    recvMem->connFifo[i].mode = map->shared ? NCCL_MODE_OFFSET : NCCL_MODE_NORMAL;
+  }

  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
    recv->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
@@ -548,10 +553,11 @@ static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int
  return ncclSuccess;
 }

-static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int channel, int slot, int* offset) {
+static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int channel, int slot, int* offset, int* size) {
  // Use different pools for different channels and also separate send/recv.
  int globalSlot = (channel*NCCL_SHARED_STEPS)+slot;
  *offset = proxyState->p2pChunkSize * globalSlot;
+  if (size) *size = proxyState->p2pChunkSize; // size is optional: pass NULL when only the offset is needed
  return ncclSuccess;
 }

@@ -802,8 +808,9 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
  resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem);

  // Don't give credits yet in shared mode.
-  resources->sendMem->head = map->shared ? -NCCL_STEPS : 0;
-  for (int i=0; i<NCCL_STEPS; i++) resources->recvMem->sizesFifo[i] = -1;
+  (resources->gdcSync ? *resources->gdcSync : resources->sendMem->head) =
+    (map->shared ? -NCCL_STEPS : 0);
+  for (int i=0; i<NCCL_STEPS; i++) resources->recvMem->connFifo[i].size = -1;

  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
    resources->buffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]);
@@ -1099,6 +1106,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
 }

 static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps");
+#define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two.

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
 static int g_npkit_net_poll_cnt = 0;
@@ -1114,8 +1122,15 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
      struct sendNetResources* resources = (struct sendNetResources*) (sub->connection->transportResources);
      // Round to next multiple of sliceSteps
      sub->base = ROUNDUP(resources->step, args->chunkSteps);
+      // Set step base for next op
+      resources->step = sub->base + sub->nsteps;
      sub->posted = sub->transmitted = sub->done = 0;
      for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin);
+      if (sub->reg && sub->nbytes > 0) {
+        NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, sub->buffer, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
+      } else {
+        sub->mhandle = resources->mhandles[args->protocol];
+      }
    }
    args->state = ncclProxyOpProgress;
    args->hdp_flushed = 0;
@@ -1128,23 +1143,24 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
      struct ncclProxySubArgs* sub = args->subs+s;
      if (sub->done == sub->nsteps) continue;
      struct sendNetResources* resources = (struct sendNetResources*) (sub->connection->transportResources);
-      void* mhandle = resources->mhandles[p];
+      volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
      int stepSize = resources->buffSizes[p] / NCCL_STEPS;
      char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
-      int buffSize = stepSize*args->sliceSteps;
-      if (sub->nbytes < buffSize) buffSize = sub->nbytes;
      // Post buffers to the GPU
      if (sub->posted < sub->nsteps && sub->posted < sub->done + maxDepth) {
        int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
        if (resources->shared) {
-          int sharedBuffSlot = sub->posted%maxDepth;
-          int offset;
-          NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s, &offset));
-          resources->recvMem->offsFifo[buffSlot] = offset;
-          __sync_synchronize();
+          if (!sub->reg) {
+            int sharedBuffSlot = sub->posted%maxDepth;
+            int offset;
+            NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s, &offset, NULL));
+            resources->recvMem->connFifo[buffSlot].offset = offset;
+            __sync_synchronize();
+          }
          volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
          sub->posted += args->sliceSteps;
-          *sendHead = sub->base + sub->posted - NCCL_STEPS;
+          // Only post one credit for registered buffer
+          if (sub->reg == 0 || sub->posted == args->sliceSteps) *sendHead = sub->base + sub->posted - NCCL_STEPS;
          if (resources->gdcSync) wc_store_fence(); // Flush out WC write
        } else sub->posted += args->sliceSteps;
        for (uint64_t step=sub->posted-args->sliceSteps; step<sub->posted; step++) {
@@ -1158,14 +1174,15 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
        int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
        volatile int* sizesFifo = resources->recvMem->sizesFifo;
        volatile uint64_t* recvTail = &resources->recvMem->tail;
-        if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->transmitted)) || p == NCCL_PROTO_LL)) {
+        uint64_t tail = sub->base + (sub->reg ? 0 : sub->transmitted);
+        if ((sub->reg || connFifo[buffSlot].size != -1) && ((*recvTail > tail) || p == NCCL_PROTO_LL)) {
          // We have something to receive, let's check if it's completely ready.
-          int size = sizesFifo[buffSlot];
+          int size = sub->reg ? std::min(MAX_NET_SIZE, sub->nbytes) : connFifo[buffSlot].size;
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
          sub->npKitSizesFifo[buffSlot] = size;
 #endif
          bool shared = (p == NCCL_PROTO_SIMPLE) && resources->shared;
-          char* buff = shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize;
+          char* buff = shared ? localBuff+connFifo[buffSlot].offset : localBuff+buffSlot*stepSize;
          int ready = 1;
          if (p == NCCL_PROTO_LL128) {
            ready = resources->useGdr;
@@ -1173,7 +1190,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
              // When data is in sysmem, we need to wait until all flags are correct since the GPU only
              // called threadfence()
              uint64_t flag = sub->base+sub->transmitted+1;
-              int nFifoLines = DIVUP(sizesFifo[buffSlot], sizeof(uint64_t)*NCCL_LL128_LINEELEMS);
+              int nFifoLines = DIVUP(connFifo[buffSlot].size, sizeof(uint64_t)*NCCL_LL128_LINEELEMS);
              volatile uint64_t* lines = (volatile uint64_t*)buff;
              ready = 1;
              for (int i=0; i<nFifoLines; i++) {
@@ -1189,6 +1206,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
              volatile uint32_t *f2 = &lines[i].flag2;
              if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
            }
+          } else if (p == NCCL_PROTO_SIMPLE && resources->shared) {
+            buff = sub->reg ? (char*)sub->buffer : localBuff+resources->recvMem->connFifo[buffSlot].offset;
          }
          if (ready) {
            // flush HDP if not done
@@ -1197,7 +1216,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
              *resources->curr_hdp_reg = 1;
            }
            // Data is ready, try to send.
-            NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, mhandle, sub->requests+buffSlot));
+            NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->mhandle, sub->requests+buffSlot));
            if (sub->requests[buffSlot] != NULL) {

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
@@ -1231,12 +1250,13 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
      // Check whether the network has completed some send operations.
      if (sub->done < sub->transmitted) {
        int done;
+        int size;
        int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
        if (sub->timestamp[buffSlot] == 0)
          sub->timestamp[buffSlot] = *(volatile uint64_t*)NpKit::GetCpuTimestamp();
 #endif
-        NCCLCHECK(proxyState->ncclNet->test(sub->requests[buffSlot], &done, NULL));
+        NCCLCHECK(proxyState->ncclNet->test(sub->requests[buffSlot], &done, &size));
        if (done) {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
          NpKit::CollectCpuEvent(
@@ -1280,28 +1300,48 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
          g_npkit_net_poll_cnt = 0;
 #endif
 #endif
+            if (sub->reg) {
+              if (size < sub->nbytes) {
+                sub->buffer = ((char*)sub->buffer)+size;
+                sub->nbytes -= size;
+                // Do one more step (at least)
+                sub->nsteps++;
+              } else {
+                // Signal the GPU the send is complete and it can return.
+                connFifo[sub->base%NCCL_STEPS].size = -1;
+              }
+            }
+            // Make sure size is reset to -1 before we update the head.
+            if (sub->reg == 0) connFifo[buffSlot].size = -1;
+            __sync_synchronize();
+            TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
+            sub->done += args->sliceSteps;
+            for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileEnd);

-          TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
-          sub->done += args->sliceSteps;
-          for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileEnd);
-
-          if (resources->shared == 0) {
-            volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
-            *sendHead = sub->base + sub->done;
-            if (resources->gdcSync) wc_store_fence(); // Flush out WC write
-          }
-          args->idle = 0;
-          if (sub->done == sub->nsteps) {
-            resources->step = sub->base + sub->nsteps;
-            args->done++;
+            if (resources->shared == 0) {
+              volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
+              if (sub->reg) {
+                // We may have added more net steps, but reg operations only have a single step w.r.t. the GPU.
+                if (sub->done == sub->nsteps) *sendHead = sub->base + args->sliceSteps;
+              } else {
+                *sendHead = sub->base + sub->done;
+              }
+              if (resources->gdcSync) wc_store_fence(); // Flush out WC write
+            }
+            args->idle = 0;
+            if (sub->done == sub->nsteps) {
+              if (sub->reg && sub->nbytes > 0) {
+                NCCLCHECK(proxyState->ncclNet->deregMr(resources->netSendComm, sub->mhandle));
+              }
+              args->done++;
+            }
          }
        }
      }
+      if (args->done == args->nsubs) {
+        args->state = ncclProxyOpNone;
+      }
    }
-    if (args->done == args->nsubs) {
-      args->state = ncclProxyOpNone;
-    }
-  }
  return ncclSuccess;
 }

@@ -1339,9 +1379,17 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
      recvComm = resources->netRecvComm;
      // Round to next multiple of sliceSteps
      sub->base = ROUNDUP(resources->step, args->chunkSteps);
+      // Set step base for next op
+      resources->step = sub->base + sub->nsteps;
      sub->posted = sub->received = sub->transmitted = sub->done = 0;
      for (int i=0; i<groupSize; i++) sub[-i].groupSize = groupSize;
      for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin);
+      if (sub->reg && sub->nbytes > 0) {
+        // Register buffer
+        NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, sub->buffer, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
+      } else {
+        sub->mhandle = resources->mhandles[args->protocol];
+      }
    }
    args->state = ncclProxyOpProgress;
  }
@@ -1356,29 +1404,37 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
      int sizes[NCCL_PROXY_MAX_SUBS];
      int tags[NCCL_PROXY_MAX_SUBS];
      void* mhandles[NCCL_PROXY_MAX_SUBS];
-
      for (int i=0; i<subGroup->groupSize; i++) {
        struct ncclProxySubArgs* sub = subGroup + i;
        if (sub->posted < sub->nsteps) {
          if (sub->posted >= sub->done + maxDepth) { subCount = 0; break; }
          struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
+          if (sub->reg) maxDepth = 1;
          int stepSize = resources->buffSizes[p] / NCCL_STEPS;
          char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
          int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
+          volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
          if (p == NCCL_PROTO_SIMPLE && resources->shared) {
-            int sharedBuffSlot = sub->posted%maxDepth;
-            int offset;
-            NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset));
-            volatile int* offsFifo = (volatile int*)resources->recvMem->offsFifo;
-            offsFifo[buffSlot] = offset;
-            ptrs[subCount] = localBuff+offset;
+            if (sub->reg) {
+              // Wait until CUDA kernel has started before we access the user buffer directly.
+              if (connFifo[sub->base%NCCL_STEPS].size == -1) continue;
+              ptrs[subCount] = sub->buffer;
+              sizes[subCount] = std::min(MAX_NET_SIZE, sub->nbytes);
+            } else {
+              int sharedBuffSlot = sub->posted%maxDepth;
+              int offset;
+              NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset, sizes+subCount));
+              connFifo[buffSlot].offset = offset;
+              ptrs[subCount] = localBuff+offset;
+            }
          } else {
            ptrs[subCount] = localBuff+buffSlot*stepSize;
+            sizes[subCount] = stepSize*args->sliceSteps;
          }
          sizes[subCount] = stepSize*args->sliceSteps;
          if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes;
          tags[subCount] = resources->tpRemoteRank;
-          mhandles[subCount] = resources->mhandles[p];
+          mhandles[subCount] = sub->mhandle;
          subCount++;
        }
      }
@@ -1430,6 +1486,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
        if (done) {
          int needFlush = 0;
          int totalSize = 0;
+          int subIndex = 0;
          for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) totalSize += sizes[i];
          for (int i=0; i<subGroup->groupSize; i++) {
            struct ncclProxySubArgs* sub = subGroup + i;
@@ -1449,6 +1506,23 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
 #endif
 #endif

+            if (sub->received < sub->nsteps) {
+              int size = sizes[subIndex++];
+              if (sub->reg) {
+                if (size < sub->nbytes) {
+                  sub->buffer = ((char*)sub->buffer) + size;
+                  sub->nbytes -= size;
+                  // Do one more step (at least)
+                  sub->nsteps++;
+                } else {
+                  // Reset connFifo size indicating the GPU was ready to receive.
+                  // There is a __sync_synchronize() later to ensure it is reset before it is set again by the GPU.
+                  struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
+                  volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
+                  connFifo[sub->base%NCCL_STEPS].size = -1;
+                }
+              }
+            }
            sub->received += args->sliceSteps;
            for (uint64_t step=sub->received-args->sliceSteps; step<sub->received; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvFlushWait);
            if (step < sub->nsteps) {
@@ -1476,9 +1550,11 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
                  struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
                  int stepSize = resources->buffSizes[p] / NCCL_STEPS;
                  char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
-                  int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
-                  ptrs[subCount] = resources->shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize;
-                  mhandles[subCount] = resources->mhandles[p];
+                  int buffSlot = (sub->base+sub->received-args->sliceSteps)%NCCL_STEPS;
+                  ptrs[subCount] = resources->shared ?
+                    (sub->reg ? sub->buffer : localBuff+resources->recvMem->connFifo[buffSlot].offset) :
+                    localBuff+buffSlot*stepSize;
+                  mhandles[subCount] = sub->mhandle;
                  subCount++;
                }
              }
@@ -1502,13 +1578,18 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
        if (done) {
          for (int i=0; i<subGroup->groupSize; i++) {
            struct ncclProxySubArgs* sub = subGroup + i;
+
            sub->transmitted += args->sliceSteps;
            for (uint64_t step=sub->transmitted-args->sliceSteps; step<sub->transmitted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvGPUWait);
            if (step < sub->nsteps) {
              __sync_synchronize();
              struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
              volatile uint64_t* recvTail = resources->gdcSync ? resources->gdcSync : &resources->recvMem->tail;
-              *recvTail = sub->base + sub->transmitted;
+              if (sub->reg) {
+                // We may have added more net steps, but reg operations only have a single step w.r.t. the GPU.
+                if (sub->transmitted == sub->nsteps) *recvTail = sub->base + args->sliceSteps;
+              } else
+                *recvTail = sub->base + sub->transmitted;
              if (resources->gdcSync) wc_store_fence(); // Flush out WC write
            }
          }
@@ -1526,7 +1607,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
        if (sub->transmitted > sub->done) {
          struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
          volatile uint64_t* sendHead = &resources->sendMem->head;
-          uint64_t done = *sendHead;
+          uint64_t done = sub->reg ? sub->base + sub->nsteps : *sendHead;
          while (done > sub->base + sub->done &&
              // LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted.
              sub->transmitted > sub->done) {
@@ -1541,7 +1622,9 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
            args->idle = 0;
            if (sub->done == sub->nsteps) {
              struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
-              resources->step = sub->base + sub->nsteps;
+              if (sub->reg && sub->nbytes > 0) {
+                NCCLCHECK(proxyState->ncclNet->deregMr(resources->netRecvComm, sub->mhandle));
+              }
              args->done++;
              break;
            }
@@ -1561,4 +1644,4 @@ struct ncclTransport netTransport = {
  canConnect,
  { sendSetup, sendConnect, sendFree, proxySharedInit, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress },
  { recvSetup, recvConnect, recvFree, proxySharedInit, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress }
-};
+};
@@ -96,6 +96,7 @@ ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) {
  props->pciPath = ncclNetSocketDevs[dev].pciPath;
  props->guid = dev;
  props->ptrSupport = NCCL_PTR_HOST;
+  props->regIsGlobal = 0;
  NCCLCHECK(ncclNetSocketGetSpeed(props->name, &props->speed));
  props->latency = 0; // Not set
  props->port = 0;
@@ -534,7 +535,7 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) {
  return ncclSuccess;
 }

-ncclResult_t ncclNetSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) {
+ncclResult_t ncclNetSocketRegMr(void* comm, void* data, size_t size, int type, void** mhandle) {
  return (type != NCCL_PTR_HOST) ? ncclInternalError : ncclSuccess;
 }
 ncclResult_t ncclNetSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; }
@@ -11,6 +11,7 @@
 #include "utils.h"
 #include "proxy.h"
 #include "enqueue.h"
+#include "register.h"

 #if CUDART_VERSION >= 12010

@@ -20,19 +21,8 @@ struct graphRegData {
 };

 struct localRegData {
-  /* Registration record data */
-  uintptr_t recSendbuff, recRecvbuff;
-  intptr_t recSendOffset, recRecvOffset;
-  /* Registration request data */
-  uintptr_t reqSendbuff, reqRecvbuff;
-  size_t reqSendSize, reqRecvSize;
-  intptr_t reqSendOffset, reqRecvOffset;
-};
-
-struct localRequestData {
-  uintptr_t reqBuff;
-  size_t reqSize;
-  intptr_t reqOffset;
+  struct ncclReg reg;
+  intptr_t offset;
 };

 ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
@@ -116,11 +106,9 @@ ncclResult_t nvlsGroupConnect(struct ncclComm *comm, char *shareableHandle, int
    // cuMem UDS support
    int fd = -1;
    TRACE(NCCL_NVLS, "NVLS rank %d Importing shareable handle %p from rank %d", comm->localRank, shareableHandle, rank);
-    struct ncclProxyConnector proxyConn;
    int tpProxyRank = comm->topParentRanks[rank];
-    NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpProxyRank, &proxyConn));
    TRACE(NCCL_NVLS, "NVLS rank %d request conversion of handle 0x%lx from rank %d", comm->localRank, *(uint64_t*)shareableHandle, rank);
-    NCCLCHECK(ncclProxyClientGetFdBlocking(comm, &proxyConn, shareableHandle, &fd));
+    NCCLCHECK(ncclProxyClientGetFdBlocking(comm, tpProxyRank, shareableHandle, &fd));
    TRACE(NCCL_NVLS, "NVLS rank %d received converted fd %d from rank %d", comm->localRank, fd, rank);
    CUCHECK(cuMemImportFromShareableHandle(mcHandle, (void *)(uintptr_t)fd, type));
    (void) close(fd);
@@ -248,7 +236,8 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) {

  int gpuCount;
  NCCLCHECK(ncclTopoGetGpuCount(comm->topo, &gpuCount));
-  if (!ncclParamNvlsEnable() || gpuCount <= 2) return ncclSuccess;
+  // NVLS is not supported on MNNVL yet
+  if (!ncclParamNvlsEnable() || gpuCount <= 2 || comm->nNodes > 1 || comm->MNNVL) return ncclSuccess;

  CUdevice dev;
  int driverVersion;
@@ -292,14 +281,14 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
  if (nvlsShare) {
    /* reuse NVLS resources */
    comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels);
-    for (int c = 0; c < comm->nvlsChannels; c++) {
+    for (int c = 0; c < comm->nChannels; c++) {
      NCCLCHECKGOTO(initNvlsChannel(comm, c, parent, true), res, cleanup);
    }

    comm->nvlsResources = parent->nvlsResources;
    ncclAtomicRefCountIncrement(&parent->nvlsResources->refCount);
  } else {
-    int nChannels;
+    int nChannels = comm->nChannels;
    struct ncclNvlsSharedRes* resources;

    NCCLCHECK(ncclCalloc(&resources, 1));
@@ -312,7 +301,7 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
      comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels);
    }

-    nChannels = resources->nChannels = comm->nvlsChannels;
+    resources->nChannels = comm->nvlsChannels;
    for (int c = 0; c < nChannels; c++) {
      NCCLCHECK(initNvlsChannel(comm, c, parent, false));
    }
@@ -390,7 +379,8 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
  }

  /* create shared memory for fast NVLS buffer registration */
-  typeSize = sizeof(struct localRegData);
+  typeSize = sizeof(struct localRegData) << 1;
+
  if (comm->localRank == 0) {
    shmPath[0] = '\0';
    NCCLCHECKGOTO(ncclShmOpen(shmPath, (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, comm->localRanks - 1, &comm->nvlsResources->nvlsShmemHandle), res, cleanup);
@@ -405,6 +395,7 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
  comm->nvlsResources->nvlsShmem.cnt[1] = (size_t*)((char*)comm->nvlsResources->nvlsShmem.ptr[0] + typeSize * comm->localRanks);
  comm->nvlsResources->nvlsShmem.ptr[1] = (void*)((char*)comm->nvlsResources->nvlsShmem.cnt[1] + sizeof(size_t));
  comm->nvlsResources->nvlsShmem.round = 0;
+  comm->nvlsResources->nvlsShmem.maxTypeSize = typeSize;

  return res;

@@ -427,23 +418,59 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
  return ncclSuccess;
 }

-ncclResult_t tryRegisterBuffer(struct ncclComm *comm, struct localRequestData *reqData, uintptr_t userBuff, size_t buffSize, CUdeviceptr *regAddr, bool *regUsed) {
+ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t buffSize, CUdeviceptr *regAddr, bool *regUsed) {
  ncclResult_t ret = ncclSuccess;
-  struct ncclRegRecord *regRecord = NULL;
-  struct localRequestData *myReqData = &reqData[comm->localRank];
+  struct ncclReg *regRecord = NULL;
  CUdeviceptr regPtr = 0;
  CUmulticastObjectProp prop;
  char shareableHandle[NVLS_HANDLE_SIZE];
  CUmemGenericAllocationHandle mcHandle;
  size_t granularity;
-  size_t minSize;
+  size_t minSize = SIZE_MAX;
  bool localRegBufUsed = false;
+  struct localRegData* regData = NULL;
+  cudaPointerAttributes attr;

-  /* get minimal size of nvls buffers */
-  minSize = reqData[0].reqSize;
-  for (int i = 1; i < comm->localRanks; ++i) {
-    if (minSize > reqData[i].reqSize)
-      minSize = reqData[i].reqSize;
+  NCCLCHECKGOTO(ncclCalloc(&regData, comm->localRanks), ret, fail);
+
+  if (userBuff) {
+    NCCLCHECKGOTO(ncclRegFind(comm, (void*)userBuff, buffSize, &regRecord), ret, fail);
+    if (regRecord) {
+      CUDACHECK(cudaPointerGetAttributes(&attr, (void*)regRecord->addr));
+      if (attr.type == cudaMemoryTypeDevice) {
+        size_t regSize = regRecord->pages * comm->regCache.pageSize;
+        prop = comm->nvlsResources->properties;
+        prop.size = regSize;
+        CUCHECK(cuMulticastGetGranularity(&granularity, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
+        CUCHECK(cuMemGetAddressRange((CUdeviceptr*)&regRecord->baseAddr, &regRecord->baseSize, (CUdeviceptr)regRecord->addr));
+        if (regSize % granularity == 0) {
+          regRecord->regSize = regSize;
+        } else {
+          regRecord->regSize = regRecord->baseSize - (regRecord->addr - regRecord->baseAddr);
+        }
+
+        if (regRecord->addr % comm->nvlsResources->ucGran == 0 && regRecord->regSize % granularity == 0) {
+          regRecord->state |= NVLS_REG_POSSIBLE;
+          memcpy(&regData[comm->localRank].reg, regRecord, sizeof(struct ncclReg));
+          regData[comm->localRank].offset = userBuff - regRecord->addr;
+        }
+      }
+
+      if ((regRecord->state & NVLS_REG_POSSIBLE) == 0) {
+        regRecord->state |= NVLS_REG_NO_SUPPORT;
+      }
+    }
+  }
+
+  NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regData + comm->localRank, regData, sizeof(struct localRegData)), ret, fail);
+
+  for (int i = 0; i < comm->localRanks; ++i) {
+    if ((regData[i].reg.state & NVLS_REG_POSSIBLE) == 0) {
+      goto fail;
+    }
+    /* get minimal reg size of nvls buffers */
+    if (minSize > regData[i].reg.regSize)
+      minSize = regData[i].reg.regSize;
  }

  /* start registration */
@@ -459,7 +486,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, struct localRequestData *r
  }

  CUCHECKGOTO(cuMulticastAddDevice(mcHandle, comm->nvlsResources->dev), ret, fail);
-  CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)myReqData->reqBuff, minSize, 0), ret, fail);
+  CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->addr, minSize, 0), ret, fail);

  // Create a VA for the NVLS
  CUCHECKGOTO(cuMemAddressReserve(&regPtr, minSize, granularity, 0U, 0), ret, fail);
@@ -467,26 +494,28 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, struct localRequestData *r
  CUCHECKGOTO(cuMemMap(regPtr, minSize, 0, mcHandle, 0), ret, fail);
  CUCHECKGOTO(cuMemSetAccess(regPtr, minSize, &comm->nvlsResources->accessDesc, 1), ret, fail);

-  NCCLCHECKGOTO(ncclCalloc(&regRecord, 1), ret, fail);
-  regRecord->buff = myReqData->reqBuff;
-  regRecord->size = myReqData->reqSize;
  regRecord->regAddr = regPtr;
  regRecord->regSize = minSize;
  regRecord->dev = comm->nvlsResources->dev;
  regRecord->mcHandle = mcHandle;
+  regRecord->state |= NVLS_REG_COMPLETE;
  /* get all buffer addresses */
-  NCCLCHECKGOTO(ncclCalloc(&regRecord->addrs, comm->localRanks), ret, fail);
-  regRecord->addrs[comm->localRank] = regRecord->buff;
-  NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regRecord->addrs + comm->localRank, regRecord->addrs, sizeof(uintptr_t)), ret, fail);
-  /* enqueue record */
-  ncclIntruQueueEnqueue(&comm->regRecordQueue, regRecord);
+  regRecord->caddrs[comm->localRank] = regRecord->addr;
+  NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regRecord->caddrs + comm->localRank, regRecord->caddrs, sizeof(uintptr_t)), ret, fail);
+
+  /* Although registration is done, we still need to check whether the offsets are same among ranks. */
+  for (int i = 0; i < comm->localRanks - 1; ++i) {
+    if (regData[i].offset != regData[i + 1].offset) {
+      goto fail;
+    }
+  }

  localRegBufUsed = true;

 exit:
-  if (localRegBufUsed)
-    *regAddr = (uintptr_t)regPtr + userBuff - myReqData->reqBuff;
+  if (localRegBufUsed) *regAddr = (uintptr_t)regPtr + regData[comm->localRank].offset;
  *regUsed = localRegBufUsed;
+  free(regData);
  return ret;
 fail:
  localRegBufUsed = false;
@@ -497,77 +526,52 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send
  ncclResult_t ret = ncclSuccess;
  bool localRegBufUsed = false;
  struct localRegData *regData = NULL;
-  struct localRequestData *reqData = NULL;
-  struct ncclRegRecord *regRecordHead = NULL, *sendRegRecord = NULL, *recvRegRecord = NULL;
-  struct ncclRegRequest *regRequestHead = NULL, *sendRegRequest = NULL, *recvRegRequest = NULL;
  bool sendNeedReg = false, recvNeedReg = false;
  CUdeviceptr regSendPtr = 0;
  CUdeviceptr regRecvPtr = 0;
+  struct ncclReg *sendRegRecord = NULL;
+  struct ncclReg *recvRegRecord = NULL;

  *outRegBufUsed = false;

-  NCCLCHECKGOTO(ncclCalloc(&regData, comm->localRanks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&regData, comm->localRanks * 2), ret, fail);

-  /* first check whether the buffer has been registered and matches each other globally */
-  regRecordHead = ncclIntruQueueHead(&comm->regRecordQueue);
-  while (regRecordHead && ((sendRegRecord == NULL && sendbuff != NULL) || (recvRegRecord == NULL && recvbuff != NULL))) {
-    /* check send reg record */
-    if (sendRegRecord == NULL && regRecordHead->buff <= (uintptr_t)sendbuff &&
-      regRecordHead->buff + regRecordHead->size >= (uintptr_t)sendbuff + sendbuffSize) {
-      regData[comm->localRank].recSendbuff = regRecordHead->buff;
-      regData[comm->localRank].recSendOffset = (uintptr_t)sendbuff - regRecordHead->buff;
-      sendRegRecord = regRecordHead;
+  if (sendbuff) {
+    NCCLCHECKGOTO(ncclRegFind(comm, sendbuff, sendbuffSize, &sendRegRecord), ret, fail);
+    if (sendRegRecord) {
+      memcpy(&regData[comm->localRank * 2].reg, sendRegRecord, sizeof(struct ncclReg));
+      regData[comm->localRank * 2].offset = (uintptr_t)sendbuff - sendRegRecord->addr;
    }
-
-    /* check recv reg record */
-    if (recvRegRecord == NULL && regRecordHead->buff <= (uintptr_t)recvbuff &&
-      regRecordHead->buff + regRecordHead->size >= (uintptr_t)recvbuff + recvbuffSize) {
-      regData[comm->localRank].recRecvbuff = regRecordHead->buff;
-      regData[comm->localRank].recRecvOffset = (uintptr_t)recvbuff - regRecordHead->buff;
-      recvRegRecord = regRecordHead;
-    }
-    regRecordHead = regRecordHead->next;
  }

-  /* prepare registration request for later reference */
-  regRequestHead = ncclIntruQueueHead(&comm->regRequestQueue);
-  while (regRequestHead && ((sendRegRequest == NULL && sendbuff != NULL) || (recvRegRequest == NULL && recvbuff != NULL))) {
-    /* check send reg request */
-    if (regRequestHead->buff <= (uintptr_t)sendbuff &&
-      regRequestHead->buff + regRequestHead->size >= (uintptr_t)sendbuff + sendbuffSize) {
-      regData[comm->localRank].reqSendbuff = regRequestHead->buff;
-      regData[comm->localRank].reqSendSize = regRequestHead->size;
-      regData[comm->localRank].reqSendOffset = (uintptr_t)sendbuff - regRequestHead->buff;
-      sendRegRequest = regRequestHead;
+  if (recvbuff) {
+    NCCLCHECKGOTO(ncclRegFind(comm, recvbuff, recvbuffSize, &recvRegRecord), ret, fail);
+    if (recvRegRecord) {
+      memcpy(&regData[comm->localRank * 2 + 1].reg, recvRegRecord, sizeof(struct ncclReg));
+      regData[comm->localRank * 2 + 1].offset = (uintptr_t)recvbuff - recvRegRecord->addr;
    }
-
-    /* check recv reg request */
-    if (regRequestHead->buff <= (uintptr_t)recvbuff &&
-      regRequestHead->buff + regRequestHead->size >= (uintptr_t)recvbuff + recvbuffSize) {
-      regData[comm->localRank].reqRecvbuff = regRequestHead->buff;
-      regData[comm->localRank].reqRecvSize = regRequestHead->size;
-      regData[comm->localRank].reqRecvOffset = (uintptr_t)recvbuff - regRequestHead->buff;
-      recvRegRequest = regRequestHead;
-    }
-    regRequestHead = regRequestHead->next;
  }

-  NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regData + comm->localRank, regData, sizeof(struct localRegData)), ret, fail);
+  NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regData + comm->localRank * 2, regData, sizeof(struct localRegData) * 2), ret, fail);

  /* first check whether all local ranks find their registered buffer */
  for (int i = 0; i < comm->localRanks; ++i) {
-    if (regData[i].recSendbuff == 0 || sendRegRecord->addrs[i] != regData[i].recSendbuff) {
+    if ((regData[i * 2].reg.state & NVLS_REG_COMPLETE) == 0 || regData[comm->localRank * 2].reg.caddrs[i] != regData[i * 2].reg.addr) {
      sendNeedReg = true;
    }

-    if (regData[i].recRecvbuff == 0 || recvRegRecord->addrs[i] != regData[i].recRecvbuff) {
+    if ((regData[i * 2 + 1].reg.state & NVLS_REG_COMPLETE) == 0 || regData[comm->localRank * 2 + 1].reg.caddrs[i] != regData[i * 2 + 1].reg.addr) {
      recvNeedReg = true;
    }
+
+    if ((regData[i * 2].reg.state & NVLS_REG_NO_SUPPORT) || (regData[i * 2 + 1].reg.state & NVLS_REG_NO_SUPPORT)) {
+      goto fail;
+    }
  }

  if (sendNeedReg == false) {
    for (int i = 0; i < comm->localRanks - 1; ++i) {
-      if (regData[i].recSendOffset != regData[i + 1].recSendOffset) {
+      if (regData[i * 2].offset != regData[(i + 1) * 2].offset) {
        /* offset are different, we cannot apply user buffer registration */
        goto fail;
      }
@@ -575,18 +579,18 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send

    /* reuse previous registered buffer if possible */
    if (!sendNeedReg)
-      regSendPtr = (CUdeviceptr)((uintptr_t)sendRegRecord->regAddr + regData[comm->localRank].recSendOffset);
+      regSendPtr = (CUdeviceptr)((uintptr_t)sendRegRecord->regAddr + regData[comm->localRank * 2].offset);
  }

  if (recvNeedReg == false) {
    for (int i = 0; i < comm->localRanks - 1; ++i) {
-      if (regData[i].recRecvOffset != regData[i + 1].recRecvOffset) {
+      if (regData[i * 2 + 1].offset != regData[(i + 1) * 2 + 1].offset) {
        goto fail;
      }
    }

    if (!recvNeedReg)
-      regRecvPtr = (CUdeviceptr)((uintptr_t)recvRegRecord->regAddr + regData[comm->localRank].recRecvOffset);
+      regRecvPtr = (CUdeviceptr)((uintptr_t)recvRegRecord->regAddr + regData[comm->localRank * 2 + 1].offset);
  }

  if ((!sendNeedReg || sendbuff == NULL) && (!recvNeedReg || recvbuff == NULL)) {
@@ -597,29 +601,13 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send

  /* Start registration. No registered buffers were found, so check whether both the send and recv buffers
   * are located in the register request cache. */
-  NCCLCHECKGOTO(ncclCalloc(&reqData, comm->localRanks), ret, fail);
-  if (sendNeedReg && sendbuff != NULL) {
-    /* copy request data got from previous shmem AG */
-    intptr_t offset = regData[0].reqSendOffset;
-    for (int i = 0; i < comm->localRanks; ++i) {
-      if (regData[i].reqSendbuff == 0 || offset != regData[i].reqSendOffset) goto fail;
-      reqData[i].reqBuff = regData[i].reqSendbuff;
-      reqData[i].reqSize = regData[i].reqSendSize;
-      reqData[i].reqOffset = regData[i].reqSendOffset;
-    }
-    tryRegisterBuffer(comm, reqData, (uintptr_t)sendbuff, sendbuffSize, &regSendPtr, &localRegBufUsed);
+  if (sendNeedReg && sendbuff) {
+    tryRegisterBuffer(comm, (uintptr_t)sendbuff, sendbuffSize, &regSendPtr, &localRegBufUsed);
    if (localRegBufUsed == false) goto fail;
  }

-  if (recvNeedReg && recvbuff != NULL) {
-    intptr_t offset = regData[0].reqRecvOffset;
-    for (int i = 0; i < comm->localRanks; ++i) {
-      if (regData[i].reqRecvbuff == 0 || offset != regData[i].reqRecvOffset) goto fail;
-      reqData[i].reqBuff = regData[i].reqRecvbuff;
-      reqData[i].reqSize = regData[i].reqRecvSize;
-      reqData[i].reqOffset = regData[i].reqRecvOffset;
-    }
-    tryRegisterBuffer(comm, reqData, (uintptr_t)recvbuff, recvbuffSize, &regRecvPtr, &localRegBufUsed);
+  if (recvNeedReg && recvbuff) {
+    tryRegisterBuffer(comm, (uintptr_t)recvbuff, recvbuffSize, &regRecvPtr, &localRegBufUsed);
    if (localRegBufUsed == false) goto fail;
  }

@@ -630,7 +618,6 @@ exit:
  *outRegBufRecv = (void*)regRecvPtr;
  *outRegBufUsed = localRegBufUsed;
  free(regData);
-  free(reqData);
  return ncclSuccess;
 fail:
  localRegBufUsed = false;
@@ -647,7 +634,7 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKerne
  CUmulticastObjectProp prop;
  char shareableHandle[NVLS_HANDLE_SIZE];
  CUmemGenericAllocationHandle sendMcHandle, recvMcHandle;
-  size_t sendGran, recvGran;
+  size_t sendGran = 0, recvGran = 0;
  bool *regBufFlags = NULL;
  struct graphRegData *rdata = NULL;
  const void *baseSend = NULL;
@@ -667,19 +654,17 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKerne
    if (recvbuff != NULL)
      CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &baseRecvSize, (CUdeviceptr)recvbuff), ret, fail);

-    memcpy(&prop, &comm->nvlsResources->properties, sizeof(CUmulticastObjectProp));
-    prop.size = baseSendSize;
-    CUCHECKGOTO(cuMulticastGetGranularity(&sendGran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
-    prop.size = baseRecvSize;
-    CUCHECKGOTO(cuMulticastGetGranularity(&recvGran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
-
-    localRegBufUsed = ((uint64_t)baseSend % sendGran != 0 || (uint64_t)baseRecv % recvGran != 0) ? false : true;
+    localRegBufUsed = ((uint64_t)baseSend % comm->nvlsResources->ucGran != 0 || (uint64_t)baseRecv % comm->nvlsResources->ucGran != 0) ? false : true;
    regBufFlags[comm->localRank] = localRegBufUsed;
    NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, regBufFlags, sizeof(bool)), ret, fail);
    for (int i = 0; i < comm->localRanks; ++i)
      if (regBufFlags[i] == false) goto fail;

+    memcpy(&prop, &comm->nvlsResources->properties, sizeof(CUmulticastObjectProp));
    if (sendbuff != NULL) {
+      prop.size = baseSendSize;
+      CUCHECKGOTO(cuMulticastGetGranularity(&sendGran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
+
      /* check send buffer offset and size */
      rdata[comm->localRank].offset = (uintptr_t)sendbuff - (uintptr_t)baseSend;
      rdata[comm->localRank].size = baseSendSize;
@@ -719,6 +704,9 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKerne
    }

    if (recvbuff != NULL) {
+      prop.size = baseRecvSize;
+      CUCHECKGOTO(cuMulticastGetGranularity(&recvGran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
+
      rdata[comm->localRank].offset = (uintptr_t)recvbuff - (uintptr_t)baseRecv;
      rdata[comm->localRank].size = baseRecvSize;
      NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, rdata, sizeof(struct graphRegData)), ret, fail);
@@ -113,6 +113,12 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
  }
 #endif

+  // MNNVL support
+  if (info1->hostHash != info2->hostHash) {
+    NCCLCHECK(ncclTopoCheckMNNVL(topo, info1, info2, ret));
+    if (*ret) return ncclSuccess;
+  }
+
  // Rule out different nodes / isolated containers
  if (info1->hostHash != info2->hostHash || info1->shmDev != info2->shmDev) {
    *ret = 0;
@@ -203,8 +209,9 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
 ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr) {
  if (ncclCuMemEnable()) {
 #if CUDART_VERSION >= 11030
+    CUmemAllocationHandleType type = ncclCuMemHandleType;
+
    // cuMem API support
-    CUmemAllocationHandleType type = NCCL_P2P_HANDLE_TYPE;
    CUmemGenericAllocationHandle handle;
    NCCLCHECK(ncclCuMemAlloc(ptr, &handle, size));
    if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
@@ -244,18 +251,16 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz
 #if CUDART_VERSION >= 11030
    // cuMem API support
    CUdeviceptr dptr = 0;
-    CUmemAllocationHandleType type = NCCL_P2P_HANDLE_TYPE;
+    CUmemAllocationHandleType type = ncclCuMemHandleType;
    CUmemGenericAllocationHandle handle;
    ncclCuDesc *cuDesc = &ipcDesc->cuDesc;

    // Import and map the remote memory descriptor to the local GPU
    if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
      // UDS fd support
-      struct ncclProxyConnector proxyConn;
      int fd = -1;
      // Send cuMem handle to remote for conversion to an fd
-      NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpPeer, &proxyConn));
-      NCCLCHECK(ncclProxyClientGetFdBlocking(comm, &proxyConn, &cuDesc->data, &fd));
+      NCCLCHECK(ncclProxyClientGetFdBlocking(comm, tpPeer, &cuDesc->data, &fd));
      INFO(NCCL_P2P, "UDS converted handle 0x%lx to fd %d on remote peer %d", *(uint64_t*)&cuDesc->data, fd, tpPeer);
      CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)fd, type));
      (void) close(fd);
@@ -293,6 +298,8 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz
 NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2);
 NCCL_PARAM(P2pDirectDisable, "P2P_DIRECT_DISABLE", 0);

+#define P2P_SAME_PID(MYINFO, PEERINFO) (((MYINFO)->hostHash == (PEERINFO)->hostHash) && ((MYINFO)->pidHash == (PEERINFO)->pidHash)) /* true when both peers report the same hostHash and the same pidHash; arguments are parenthesized so expression arguments expand safely */
+
 static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* read, int* intermediateRank) {
  int p2p;
  // Queries the topology to see if the GPUs are Ampere and
@@ -305,7 +312,7 @@ static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo*
 }

 static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* proxyConn, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) {
-  if (myInfo->pidHash == peerInfo->pidHash) {
+  if (P2P_SAME_PID(myInfo, peerInfo)) {
    if (peerInfo->cudaDev != myInfo->cudaDev) {
      // Same PID different GPUs, enable P2P access
      // Legacy CUDA IPC
@@ -333,15 +340,9 @@ static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* pro
    *devMem = p2pBuff->directPtr;
    *ipcPtr = NULL;
  } else {
-    if ((myInfo->pidHash == peerInfo->pidHash) && (peerInfo->cudaDev == myInfo->cudaDev)) {
-      // Same PID and GPU
-      *devMem = p2pBuff->directPtr;
-      *ipcPtr = NULL;
-    } else {
-      // Different PID or different GPU
-      NCCLCHECK(ncclP2pImportShareableBuffer(comm, comm->topParentRanks[peerInfo->rank], p2pBuff->size, &p2pBuff->ipcDesc, devMem));
-      *ipcPtr = *devMem;
-    }
+    // Different PID
+    NCCLCHECK(ncclP2pImportShareableBuffer(comm, comm->topParentRanks[peerInfo->rank], p2pBuff->size, &p2pBuff->ipcDesc, devMem));
+    *ipcPtr = *devMem;
  }
  return ncclSuccess;
 }
@@ -382,7 +383,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st

  if (intermediateRank == -1) {
    info->rank = myInfo->rank;
-    if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
+    if (P2P_SAME_PID(myInfo, peerInfo) && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
      resources->type = P2P_DIRECT;
      send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
      INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s comm %p nRanks %02d",
@@ -391,8 +392,9 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
      // cuMem API support
      if (ncclCuMemEnable()) {
        resources->type = P2P_CUMEM;
-        INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%x] -> %d[%x] via P2P/CUMEM%s%s comm %p nRanks %02d",
-             channelId, connIndex, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev, useReadStr, useMemcpy ? "/CE" : "", comm, comm->nRanks);;
+        const char *MNNVL = comm->MNNVL ? "MNNVL" : "CUMEM"; // label printed after "P2P/"; the format string must not repeat "CUMEM"
+        INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%x] -> %d[%x] via P2P/%s%s%s comm %p nRanks %02d",
+             channelId, connIndex, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev, MNNVL, useReadStr, useMemcpy ? "/CE" : "", comm, comm->nRanks);
      } else {
        // Legacy CUDA IPC
        resources->type = P2P_IPC;
@@ -446,7 +448,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st

  if (intermediateRank == -1) {
    info->rank = myInfo->rank;
-    if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
+    if (P2P_SAME_PID(myInfo, peerInfo) && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
      resources->type = P2P_DIRECT;
      recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
    } else {
@@ -496,7 +498,7 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co

  if (useMemcpy) {
    send->conn.tail = &resources->proxyInfo.ceRecvMem->tail;
-    send->conn.sizesFifo = resources->proxyInfo.ceRecvMem->sizesFifo;
+    send->conn.connFifo = resources->proxyInfo.ceRecvMem->connFifo;
    send->conn.head = &resources->proxyInfo.devShm->sendMem.head;
    // Send SIMPLE buff to proxy, and replace it by local buffer
    NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0));
@@ -744,11 +746,11 @@ static ncclResult_t p2pSendProxyProgress(struct ncclProxyState* proxyState, stru
      }
      if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) {
        int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
-        volatile int* sizesFifo = resources->ceRecvMem->sizesFifo;
+        volatile struct ncclConnFifo* connFifo = resources->ceRecvMem->connFifo;
        volatile uint64_t* recvTail = &resources->ceRecvMem->tail;
        // Check GPU has sent everything
        if ((*recvTail > sub->base+sub->transmitted)) {
-          int size = sizesFifo[buffSlot];
+          int size = connFifo[buffSlot].size;
          CUDACHECK(cudaMemcpyAsync(resources->recvFifo+buffSlot*stepSize, resources->ceDevBuff+buffSlot*stepSize, size, cudaMemcpyDeviceToDevice, resources->stream));
          CUDACHECK(cudaEventRecord(resources->events[buffSlot], resources->stream));
          sub->transmitted += args->sliceSteps;
@@ -793,4 +795,4 @@ static void initCeOperation() {
    }
    init = 1;
  }
-}
+}
@@ -152,7 +152,7 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
  send->conn.head = &resources->devHostMem->head;

  if (useMemcpyRecv) {
-    send->conn.sizesFifo = resources->devRemHostMem->sizesFifo;
+    send->conn.connFifo = resources->devRemHostMem->connFifo;
  }
  if (useMemcpySend) {
    int tpProxyRank;
@@ -162,7 +162,7 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
    NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
    send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
    send->conn.tail = &proxyInfo.ceRecvMem->tail;
-    send->conn.sizesFifo = proxyInfo.ceRecvMem->sizesFifo;
+    send->conn.connFifo = proxyInfo.ceRecvMem->connFifo;
  }

  // We must assign the proxyConn's proxyProgress property for proper checking at enqueue-time
@@ -315,15 +315,15 @@ static ncclResult_t shmSendProxyProgress(struct ncclProxyState* proxyState, stru
      }
      if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) {
        int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
-        volatile int* sizesFifo = resources->ceRecvMem->sizesFifo;
+        volatile struct ncclConnFifo* connFifo = resources->ceRecvMem->connFifo;
        volatile uint64_t* recvTail = &resources->ceRecvMem->tail;
        // Check GPU has sent everything
        if ((*recvTail > sub->base+sub->transmitted)) {
-          int size = sizesFifo[buffSlot];
+          int size = connFifo[buffSlot].size;
          CUDACHECK(cudaMemcpyAsync(resources->shmFifo+buffSlot*stepSize, resources->devFifo+buffSlot*stepSize, size, cudaMemcpyDeviceToHost, resources->stream));
          CUDACHECK(cudaEventRecord(resources->events[buffSlot], resources->stream));
-          resources->recvMem->sizesFifo[buffSlot] = size;
-          __sync_synchronize(); // make sure sizesFifo is visible
+          resources->recvMem->connFifo[buffSlot].size = size;
+          __sync_synchronize(); // make sure connFifo[].size is visible
          sub->transmitted += args->sliceSteps;
        }
      }
@@ -374,11 +374,11 @@ static ncclResult_t shmRecvProxyProgress(struct ncclProxyState* proxyState, stru
      }
      if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) {
        int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
-        volatile int* sizesFifo = resources->recvMem->sizesFifo;
+        volatile struct ncclConnFifo* connFifo = resources->recvMem->connFifo;
        volatile uint64_t* recvTail = &resources->recvMem->tail;
        // Check data is ready in SHM
        if ((*recvTail > sub->base+sub->transmitted)) {
-          int size = sizesFifo[buffSlot];
+          int size = connFifo[buffSlot].size;
          CUDACHECK(cudaMemcpyAsync(resources->devFifo+buffSlot*stepSize, resources->shmFifo+buffSlot*stepSize, size, cudaMemcpyHostToDevice, resources->stream));
          CUDACHECK(cudaEventRecord(resources->events[buffSlot], resources->stream));
          sub->transmitted += args->sliceSteps;
@@ -26,7 +26,20 @@ if(BUILD_TESTS)
  include_directories(${GTEST_INCLUDE_DIRS} ./common)

  # Collect testing framework source files
-  set (COMMON_SOURCE_FILES
+  set(TEST_SOURCE_FILES
+    AllGatherTests.cpp
+    AllReduceTests.cpp
+    AllToAllTests.cpp
+    AllToAllVTests.cpp
+    BroadcastTests.cpp
+    GatherTests.cpp
+    GroupCallTests.cpp
+    NonBlockingTests.cpp
+    ReduceScatterTests.cpp
+    ReduceTests.cpp
+    ScatterTests.cpp
+    SendRecvTests.cpp
+    StandaloneTests.cpp
    common/main.cpp
    common/CollectiveArgs.cpp
    common/EnvVars.cpp
@@ -36,38 +49,7 @@ if(BUILD_TESTS)
    common/TestBedChild.cpp
    )

-  # Collect source files for tests
-  if(ONLY_FUNCS)
-    # Convert input string to a list
-    string(REPLACE "|" ";" CONFIG_LIST ${ONLY_FUNCS})
-
-    # For each config in config list
-    foreach(item ${CONFIG_LIST})
-      string(REPLACE " " ";" CONFIG_PARAMS ${item})
-      list(GET CONFIG_PARAMS 0 COLL)
-
-      set(TEST_FILE "${COLL}Tests.cpp")
-      list(APPEND TEST_SOURCE_FILES ${TEST_FILE})
-    endforeach()
-  else()
-    set(TEST_SOURCE_FILES
-      AllGatherTests.cpp
-      AllReduceTests.cpp
-      AllToAllTests.cpp
-      AllToAllVTests.cpp
-      BroadcastTests.cpp
-      GatherTests.cpp
-      GroupCallTests.cpp
-      NonBlockingTests.cpp
-      ReduceScatterTests.cpp
-      ReduceTests.cpp
-      ScatterTests.cpp
-      SendRecvTests.cpp
-      StandaloneTests.cpp
-      )
-  endif()
-
-  add_executable(rccl-UnitTests ${COMMON_SOURCE_FILES} ${TEST_SOURCE_FILES})
+  add_executable(rccl-UnitTests ${TEST_SOURCE_FILES})

  ## Set rccl-UnitTests include directories
  target_include_directories(rccl-UnitTests PRIVATE ${ROCM_PATH} ${GTEST_INCLUDE_DIRS})