diff --git a/ext-net/example/Makefile b/ext-net/example/Makefile
index efa841c53c..e0a6aa6193 100644
--- a/ext-net/example/Makefile
+++ b/ext-net/example/Makefile
@@ -5,7 +5,7 @@
 #
 NCCL_HOME:=../../build/
 CUDA_HOME:=/usr/local/cuda
-INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include
+INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
 PLUGIN_SO:=libnccl-net.so
 
 default: $(PLUGIN_SO)
diff --git a/ext-net/example/nccl/net.h b/ext-net/example/nccl/net.h
index 6b5b62c729..f5101aec8b 100644
--- a/ext-net/example/nccl/net.h
+++ b/ext-net/example/nccl/net.h
@@ -24,6 +24,7 @@ typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCC
 
 typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
 
+#include "net_v7.h"
 #include "net_v6.h"
 #include "net_v5.h"
 #include "net_v4.h"
diff --git a/ext-net/example/nccl/net_device.h b/ext-net/example/nccl/net_device.h
new file mode 100644
index 0000000000..32cc519ded
--- /dev/null
+++ b/ext-net/example/nccl/net_device.h
@@ -0,0 +1,31 @@
+/*************************************************************************
+ * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NET_DEVICE_H_
+#define NET_DEVICE_H_
+
+#include <stddef.h> // size_t, used by ncclNetDeviceHandle_v7_t below
+
+#define NCCL_NET_DEVICE_INVALID_VERSION      0x0
+#define NCCL_NET_MTU_SIZE                    4096
+
+// Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin
+// version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
+#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7
+
+typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType;
+
+typedef struct {
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+  void* handle;                    // Opaque pointer; presumably plugin-defined device state - TODO confirm
+  size_t size;                     // Presumably the size in bytes of what handle points to - TODO confirm
+  int needsProxyProgress;          // Presumably non-zero when the proxy must still drive progress - TODO confirm
+} ncclNetDeviceHandle_v7_t;
+
+typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t;
+
+#endif
diff --git a/ext-net/example/nccl/net_v6.h b/ext-net/example/nccl/net_v6.h
index 8bc16787b5..21379d3d11 100644
--- a/ext-net/example/nccl/net_v6.h
+++ b/ext-net/example/nccl/net_v6.h
@@ -18,8 +18,6 @@ typedef struct {
   int maxRecvs;   // Maximum number of grouped receives.
 }ncclNetProperties_v6_t;
 
-typedef ncclNetProperties_v6_t ncclNetProperties_t;
-
 typedef struct {
   // Name of the network (mainly for logs)
   const char* name;
diff --git a/ext-net/example/nccl/net_v7.h b/ext-net/example/nccl/net_v7.h
new file mode 100644
index 0000000000..77d6cb73ee
--- /dev/null
+++ b/ext-net/example/nccl/net_v7.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NCCL_NET_V7_H_
+#define NCCL_NET_V7_H_
+
+#include "net_device.h"
+
+typedef struct {
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+} ncclNetProperties_v7_t;
+
+typedef ncclNetProperties_v7_t ncclNetProperties_t;
+
+typedef struct { // Function table a network plugin exports for the v7 API (see ncclNetPlugin_v7).
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL. (sendDevComm is new in v7.)
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL. (recvDevComm is new in v7.)
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+} ncclNet_v7_t;
+
+#endif // end include guard
diff --git a/ext-net/example/plugin.c b/ext-net/example/plugin.c
index a44ce9e51d..cc860b0067 100644
--- a/ext-net/example/plugin.c
+++ b/ext-net/example/plugin.c
@@ -4,7 +4,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include <nccl/net.h>
+#include "net.h"
 
 #define __hidden __attribute__ ((visibility("hidden")))
 
@@ -15,14 +15,14 @@ __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess;
 
 __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; }
 __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; }
-__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v6_t* props) {
+__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v7_t* props) {
   //pluginPciPath(dev, &props.pciPath);
   //pluginPtrSupport(dev, &props.ptrSupport);
   return ncclInternalError;
 }
 __hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; }
-__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm) { return ncclInternalError; }
-__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm) { return ncclInternalError; }
 __hidden ncclResult_t pluginRegMr(void* collComm, void* data, int size, int type, void** mhandle) { return ncclInternalError; }
 __hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; }
 __hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;}
@@ -33,10 +33,12 @@ __hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return n
 __hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; }
 __hidden ncclResult_t pluginCloseRecv(void* recvComm) { return ncclInternalError; }
 __hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginIrecvConsumed(void* recvComm, int n, void* request) { return ncclInternalError; }
+__hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_mhandle) { return ncclInternalError; }
 
 #define PLUGIN_NAME "Plugin"
 
-const ncclNet_v6_t ncclNetPlugin_v6 = {
+const ncclNet_v7_t ncclNetPlugin_v7 = {
   .name = PLUGIN_NAME,
   .init = pluginInit,
   .devices = pluginDevices,
@@ -54,6 +56,37 @@ const ncclNet_v6_t ncclNetPlugin_v6 = {
   .closeSend = pluginCloseSend,
   .closeRecv = pluginCloseRecv,
   .closeListen = pluginCloseListen,
+  .getDeviceMr = pluginGetDeviceMr,
+  .irecvConsumed = pluginIrecvConsumed,
+};
+
+__hidden ncclResult_t pluginGetProperties_v6(int dev, ncclNetProperties_v6_t* props) {
+  //pluginPciPath(dev, &props.pciPath);
+  //pluginPtrSupport(dev, &props.ptrSupport);
+  return ncclInternalError;
+}
+
+__hidden ncclResult_t pluginConnect_v6(int dev, void* handle, void** sendComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginAccept_v6(void* listenComm, void** recvComm) { return ncclInternalError; }
+
+const ncclNet_v6_t ncclNetPlugin_v6 = {
+  .name = PLUGIN_NAME,
+  .init = pluginInit,
+  .devices = pluginDevices,
+  .getProperties = pluginGetProperties_v6,
+  .listen = pluginListen,
+  .connect = pluginConnect_v6,
+  .accept = pluginAccept_v6,
+  .regMr = pluginRegMr,
+  .regMrDmaBuf = pluginRegMrDmaBuf,
+  .deregMr = pluginDeregMr,
+  .isend = pluginIsend,
+  .irecv = pluginIrecv,
+  .iflush = pluginIflush,
+  .test = pluginTest,
+  .closeSend = pluginCloseSend,
+  .closeRecv = pluginCloseRecv,
+  .closeListen = pluginCloseListen
 };
 
 /* v5 Compat */
@@ -61,10 +94,10 @@ const ncclNet_v5_t ncclNetPlugin_v5 = {
   .name = PLUGIN_NAME,
   .init = pluginInit,
   .devices = pluginDevices,
-  .getProperties = pluginGetProperties,
+  .getProperties = pluginGetProperties_v6,
   .listen = pluginListen,
-  .connect = pluginConnect,
-  .accept = pluginAccept,
+  .connect = pluginConnect_v6,
+  .accept = pluginAccept_v6,
   .regMr = pluginRegMr,
   .deregMr = pluginDeregMr,
   .isend = pluginIsend,
@@ -79,7 +112,7 @@ const ncclNet_v5_t ncclNetPlugin_v5 = {
 /* v4 Compat */
 static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* props) {
   ncclNetProperties_v6_t props_v6;
-  ncclResult_t ret = pluginGetProperties(dev, &props_v6);
+  ncclResult_t ret = pluginGetProperties_v6(dev, &props_v6);
   if (ret != ncclSuccess) return ret;
   props->name = props_v6.name;
   props->pciPath = props_v6.pciPath;
@@ -103,14 +136,16 @@ static ncclResult_t pluginIflush_v4(void* recvComm, void* data, int size, void*
 static ncclResult_t pluginConnect_v4(int dev, void* handle, void** sendComm) {
   ncclResult_t ret;
   do {
-    ret = pluginConnect(dev, handle, sendComm);
+    ncclNetDeviceHandle_v7_t* devHandle = NULL; // v4 has no device handle; must not shadow the `handle` parameter
+    ret = pluginConnect(dev, handle, sendComm, &devHandle); // pass the caller's connect handle, not the local
   } while (ret == ncclSuccess && *sendComm == NULL);
   return ret;
 }
 static ncclResult_t pluginAccept_v4(void* listenComm, void** recvComm) {
   ncclResult_t ret;
   do {
-    ret = pluginAccept(listenComm, recvComm);
+    ncclNetDeviceHandle_v7_t* devHandle = NULL; // v4 has no device handle; the value written here is discarded
+    ret = pluginAccept(listenComm, recvComm, &devHandle);
   } while (ret == ncclSuccess && *recvComm == NULL);
   return ret;
 }
@@ -151,12 +186,12 @@ static ncclResult_t pluginInit_v3(ncclDebugLogger_t logFunction) {
 static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) {
   char pluginHandle[NCCL_NET_HANDLE_MAXSIZE];
   ncclResult_t ret = pluginListen(dev, &pluginHandle, listenComm);
-  memcpy(handle, &pluginHandle, NCCL_NET_HANDLE_MAXSIZE_V3);
+  memcpy(handle, &pluginHandle, NCCL_NET_HANDLE_MAXSIZE_V4);
   return ret;
 }
 static ncclResult_t pluginConnect_v3(int dev, void* handle, void** sendComm) {
   char pluginHandle[NCCL_NET_HANDLE_MAXSIZE];
-  memcpy(&pluginHandle, handle, NCCL_NET_HANDLE_MAXSIZE_V3);
+  memcpy(&pluginHandle, handle, NCCL_NET_HANDLE_MAXSIZE_V4);
   return pluginConnect_v4(dev, &pluginHandle, sendComm);
 }
 const ncclNet_v3_t ncclNetPlugin_v3 = {
diff --git a/ext-tuner/example/Makefile b/ext-tuner/example/Makefile
new file mode 100644
index 0000000000..9d9ace4842
--- /dev/null
+++ b/ext-tuner/example/Makefile
@@ -0,0 +1,17 @@
+#
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+NCCL_HOME:=../../build/
+CUDA_HOME:=/usr/local/cuda
+INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
+PLUGIN_SO:=libnccl-tuner.so
+
+default: $(PLUGIN_SO)
+
+$(PLUGIN_SO): plugin.c
+	$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
+
+clean:
+	rm -f $(PLUGIN_SO)
diff --git a/ext-tuner/example/nccl/tuner.h b/ext-tuner/example/nccl/tuner.h
new file mode 100644
index 0000000000..8076aa872a
--- /dev/null
+++ b/ext-tuner/example/nccl/tuner.h
@@ -0,0 +1,77 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_TUNER_H_
+#define NCCL_TUNER_H_
+
+#include "nccl.h"
+
+typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
+typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;
+
+typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
+
+#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
+typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t;
+
+#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
+#define NCCL_ALGO_UNDEF -1
+#define NCCL_ALGO_TREE 0
+#define NCCL_ALGO_RING 1
+#define NCCL_ALGO_COLLNET_DIRECT 2
+#define NCCL_ALGO_COLLNET_CHAIN 3
+#define NCCL_ALGO_NVLS 4
+#define NCCL_ALGO_NVLS_TREE 5
+
+#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
+#define NCCL_PROTO_UNDEF -1
+#define NCCL_PROTO_LL 0
+#define NCCL_PROTO_LL128 1
+#define NCCL_PROTO_SIMPLE 2
+
+// API to be implemented by external tuner
+typedef struct {
+  // Name of the tuner
+  const char* name;
+
+  // Initializes tuner states.
+  // nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
+  // nNodes: number of nodes in current communicator.
+  // logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction);
+
+  // Gets info (algorithm, protocol, number of channels) for a given collective.
+  // Inputs:
+  //   - collType: collective type, e.g., allreduce, allgather...
+  //   - nBytes: collective size in bytes
+  //   - collNetSupport: whether collnet supports this type
+  //   - nvlsSupport: whether nvlink sharp supports this type
+  //   - numPipeOps: number of operations in the group
+  //
+  // Outputs:
+  //   - algorithm: selected algorithm to be used for the given collective
+  //   - protocol: selected protocol to be used for the given collective
+  //   - nChannels: number of channels (hence SMs) to be used.
+  //
+  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+  // default tuning for the given collective.
+  // Also, the plugin is allowed to not set any output, or set only the
+  // algorithm and protocol, but not only the algorithm or only the protocol.
+  // Unset fields will be set automatically by NCCL.
+  ncclResult_t (*getCollInfo)(ncclFunc_t collType, size_t nBytes,
+                              int collNetSupport, int nvlsSupport, int numPipeOps,
+                              int *algorithm, int *protocol, int* nChannels);
+
+  // Terminates the plugin and cleans up any resources that the plugin allocated.
+  ncclResult_t (*destroy)();
+} ncclTuner_v1_t;
+
+typedef ncclTuner_v1_t ncclTuner_t;
+
+#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v1"
+
+#endif
diff --git a/ext-tuner/example/plugin.c b/ext-tuner/example/plugin.c
new file mode 100644
index 0000000000..d972de3d3a
--- /dev/null
+++ b/ext-tuner/example/plugin.c
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "tuner.h"
+
+#define __hidden __attribute__ ((visibility("hidden")))
+
+__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction) { return ncclSuccess; }
+
+__hidden ncclResult_t pluginGetCollInfo(ncclFunc_t collType, size_t nBytes,
+                              int collNetSupport, int nvlsSupport, int numPipeOps,
+                              int *algorithm, int *protocol, int* nChannels) { *algorithm = NCCL_ALGO_RING; *protocol = NCCL_PROTO_SIMPLE; return ncclSuccess; }
+
+__hidden ncclResult_t pluginDestroy() { return ncclSuccess; }
+
+#define PLUGIN_NAME "Example"
+
+const ncclTuner_v1_t ncclTunerPlugin_v1 = {
+  .name = PLUGIN_NAME,
+  .init = pluginInit,
+  .getCollInfo = pluginGetCollInfo,
+  .destroy = pluginDestroy
+};
diff --git a/makefiles/common.mk b/makefiles/common.mk
index 60a019c0b2..a037cf348b 100644
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@@ -9,6 +9,7 @@ PREFIX ?= /usr/local
 VERBOSE ?= 0
 KEEP ?= 0
 DEBUG ?= 0
+ASAN ?= 0
 TRACE ?= 0
 PROFAPI ?= 1
 NVTX ?= 1
@@ -85,6 +86,13 @@ NVCUFLAGS += -O0 -G -g
 CXXFLAGS  += -O0 -g -ggdb3
 endif
 
+# Make sure to run with ASAN_OPTIONS=protect_shadow_gap=0 otherwise CUDA will fail with OOM
+ifneq ($(ASAN), 0)
+CXXFLAGS += -fsanitize=address
+LDFLAGS += -fsanitize=address -static-libasan
+NVLDFLAGS += -Xcompiler -fsanitize=address,-static-libasan
+endif
+
 ifneq ($(VERBOSE), 0)
 NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
 CXXFLAGS  += -Wall -Wextra
diff --git a/makefiles/version.mk b/makefiles/version.mk
index fde92c08a0..5e32150b1c 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
-NCCL_MINOR   := 18
-NCCL_PATCH   := 6
+NCCL_MINOR   := 19
+NCCL_PATCH   := 3
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
diff --git a/src/Makefile b/src/Makefile
index dd5754989e..7a1881d9d6 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -3,19 +3,17 @@
 #
 # See LICENSE.txt for license information
 #
-
 include ../makefiles/common.mk
 include ../makefiles/version.mk
 
 ##### src files
 INCEXPORTS  := nccl.h nccl_net.h
-LIBSRCFILES := init.cc init_nvtx.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc \
-		misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvsymbols.cc misc/ibvwrap.cc misc/gdrwrap.cc \
-		misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \
-		misc/ipcsocket.cc \
-		transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc transport/nvls.cc \
-                collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
-                graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc
+LIBSRCFILES := \
+	bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
+	init.cc init_nvtx.cc net.cc proxy.cc transport.cc \
+	$(wildcard graph/*.cc) \
+	$(wildcard misc/*.cc) \
+	$(wildcard transport/*.cc)
 
 ##### lib files
 LIBNAME     := libnccl.so
@@ -45,7 +43,7 @@ LIBOBJ     := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o)
 DEPFILES   := $(LIBOBJ:%.o=%.d)
 LDFLAGS    += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl
 
-DEVICELIB  := $(BUILDDIR)/obj/collectives/device/colldevice.a
+DEVMANIFEST := $(BUILDDIR)/obj/device/manifest
 
 ##### rules
 build : lib staticlib
@@ -54,8 +52,8 @@ lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) $(PKGDIR)/$(PKGTARGET)
 
 staticlib : $(LIBDIR)/$(STATICLIBTARGET)
 
-$(DEVICELIB): ALWAYS_REBUILD $(INCTARGETS)
-	$(MAKE) -C collectives/device
+$(DEVMANIFEST): ALWAYS_REBUILD $(INCTARGETS)
+	$(MAKE) -C ./device
 
 # Empty target to force rebuild
 ALWAYS_REBUILD:
@@ -75,21 +73,17 @@ $(INCDIR)/nccl.h : nccl.h.in ../makefiles/version.mk
 	    -e "s/\$${nccl:Version}/$(NCCL_VERSION)/g" \
 	    $< > $@
 
-$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVICELIB)
+$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVMANIFEST)
 	@printf "Linking    %-35s > %s\n" $(LIBTARGET) $@
 	mkdir -p $(LIBDIR)
-	$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $(DEVICELIB) $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $$(cat $(DEVMANIFEST)) $(LDFLAGS)
 	ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
 	ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
 
-null :=
-space := $(null) #
-comma := ,
-
-$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB)
+$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVMANIFEST)
 	@printf "Archiving  %-35s > %s\n" $(STATICLIBTARGET) $@
 	mkdir -p $(LIBDIR)
-	printf "create $@\naddlib $(DEVICELIB)\naddmod $(subst $(space),$(comma),$(strip $(LIBOBJ)))\nsave\nend" | ar -M
+	ar cr $@ $(LIBOBJ) $$(cat $(DEVMANIFEST))
 
 $(PKGDIR)/nccl.pc : nccl.pc.in
 	mkdir -p $(PKGDIR)
@@ -126,7 +120,7 @@ $(OBJDIR)/%.o : %.cc $(INCTARGETS)
 	@rm -f $(@:%.o=%.d.tmp)
 
 clean :
-	$(MAKE) -C collectives/device clean
+	$(MAKE) -C device clean
 	rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR}
 
 install : build
diff --git a/src/bootstrap.cc b/src/bootstrap.cc
index 764cb6c391..0c8a338d6e 100644
--- a/src/bootstrap.cc
+++ b/src/bootstrap.cc
@@ -12,6 +12,7 @@
 #include <unistd.h>
 #include <sys/types.h>
 #include "proxy.h"
+#include "param.h"
 
 struct bootstrapRootArgs {
   struct ncclSocket* listenSock;
@@ -28,21 +29,24 @@ ncclResult_t bootstrapNetInit() {
   if (bootstrapNetInitDone == 0) {
     pthread_mutex_lock(&bootstrapNetLock);
     if (bootstrapNetInitDone == 0) {
-      char* env = getenv("NCCL_COMM_ID");
+      const char* env = ncclGetEnv("NCCL_COMM_ID");
       if (env) {
         union ncclSocketAddress remoteAddr;
         if (ncclSocketGetAddrFromString(&remoteAddr, env) != ncclSuccess) {
           WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
+          pthread_mutex_unlock(&bootstrapNetLock);
           return ncclInvalidArgument;
         }
         if (ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
           WARN("NET/Socket : No usable listening interface found");
+          pthread_mutex_unlock(&bootstrapNetLock);
           return ncclSystemError;
         }
       } else {
         int nIfs = ncclFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1);
         if (nIfs <= 0) {
           WARN("Bootstrap : no socket interface found");
+          pthread_mutex_unlock(&bootstrapNetLock);
           return ncclInternalError;
         }
       }
@@ -189,7 +193,7 @@ ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
   memset(handle, 0, sizeof(ncclBootstrapHandle));
   NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic)));
 
-  char* env = getenv("NCCL_COMM_ID");
+  const char* env = ncclGetEnv("NCCL_COMM_ID");
   if (env) {
     INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
     if (ncclSocketGetAddrFromString(&handle->addr, env) != ncclSuccess) {
diff --git a/src/collectives.cc b/src/collectives.cc
new file mode 100644
index 0000000000..89d8932db8
--- /dev/null
+++ b/src/collectives.cc
@@ -0,0 +1,167 @@
+/*************************************************************************
+ * Copyright (c) 2015-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "argcheck.h" // Need some checks here since we access comm
+#include "collectives.h"
+#include "enqueue.h"
+#include "nccl.h"
+
+NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
+  // Just pass the size of one message and not the total bytes sent/received.
+  constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = {
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}
+  };
+  size_t msgsize = sendcount * ncclTypeSize(datatype);
+  NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, msgsize)
+
+  struct ncclInfo info = { ncclFuncAllGather, "AllGather",
+    sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
+    ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
+  return ncclEnqueueCheck(&info);
+}
+
+NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
+ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
+  struct NvtxParamsAllReduce {
+    size_t bytes;
+    ncclRedOp_t op;
+  };
+  // Just pass the size of one message and not the total bytes sent/received.
+  static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = {
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
+    {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
+      offsetof(NvtxParamsAllReduce, op)}
+  };
+  NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op};
+  NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload)
+
+  struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
+    sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
+    ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
+  return ncclEnqueueCheck(&info);
+}
+
+NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream) {
+  struct NvtxParamsBroadcast {
+    size_t bytes;
+    int root;
+  };
+  constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = {
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)}
+  };
+  NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root};
+  NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload)
+
+  struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
+    sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
+    BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
+  return ncclEnqueueCheck(&info);
+}
+/* Deprecated original "in place" function, similar to MPI */
+NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream) {
+  return ncclBroadcast(buff, buff, count, datatype, root, comm, stream);
+}
+
+NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+  struct NvtxParamsReduce {
+    size_t bytes;
+    int root;
+    ncclRedOp_t op;
+  };
+  constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = {
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)},
+    {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
+      offsetof(NvtxParamsReduce, op)}
+  };
+  NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op};
+  NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload)
+
+  struct ncclInfo info = { ncclFuncReduce, "Reduce",
+    sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
+    REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
+  return ncclEnqueueCheck(&info);
+}
+
+NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
+ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
+  struct NvtxParamsReduceScatter {
+    size_t bytes;
+    ncclRedOp_t op;
+  };
+  constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = {
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
+    {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
+      offsetof(NvtxParamsReduceScatter, op)}
+  };
+  NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op};
+  NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload)
+
+  struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
+    sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
+    REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
+  return ncclEnqueueCheck(&info);
+}
+
+struct NvtxParamsSendRecv {
+    size_t bytes;
+    int peer;
+};
+constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = {
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)}
+};
+
+NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream) { // Enqueues a "Send" (ncclFuncSend) of count elements to peer
+  NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer}; // NVTX payload: size in bytes and peer rank
+  NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload)
+
+  struct ncclInfo info = { ncclFuncSend, "Send",
+    NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args; const is cast away from sendbuff */
+    1, 1 };
+  ncclResult_t ret;
+  NCCLCHECK(ncclGroupStart()); // The single enqueue is bracketed by its own group start/end
+  ret = ncclEnqueueCheck(&info);
+  NCCLCHECK(ncclGroupEnd());
+  return ret; // Result of the enqueue itself
+}
+
+NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream) { // Enqueues a "Recv" (ncclFuncRecv) of count elements from peer
+  NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer}; // NVTX payload: size in bytes and peer rank
+  NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload)
+
+  struct ncclInfo info = { ncclFuncRecv, "Recv",
+    NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
+    1, 1 };
+  ncclResult_t ret;
+  NCCLCHECK(ncclGroupStart()); // The single enqueue is bracketed by its own group start/end
+  ret = ncclEnqueueCheck(&info);
+  NCCLCHECK(ncclGroupEnd());
+  return ret; // Result of the enqueue itself
+}
diff --git a/src/collectives/all_gather.cc b/src/collectives/all_gather.cc
deleted file mode 100644
index 97ec981ed4..0000000000
--- a/src/collectives/all_gather.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "enqueue.h"
-#include "collectives.h"
-
-NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
-    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
-    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
-  // Just pass the size of one message and not the total bytes sent/received.
-  constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = {
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}
-  };
-  size_t msgsize = sendcount * ncclTypeSize(datatype);
-  NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, msgsize)
-
-  struct ncclInfo info = { ncclFuncAllGather, "AllGather",
-    sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
-    ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
-  return ncclEnqueueCheck(&info);
-}
diff --git a/src/collectives/all_reduce.cc b/src/collectives/all_reduce.cc
deleted file mode 100644
index 8ac61a2a78..0000000000
--- a/src/collectives/all_reduce.cc
+++ /dev/null
@@ -1,31 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "enqueue.h"
-#include "nccl.h"
-
-NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
-ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
-  struct NvtxParamsAllReduce {
-    size_t bytes;
-    ncclRedOp_t op;
-  };
-  // Just pass the size of one message and not the total bytes sent/received.
-  static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = {
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
-    {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
-      offsetof(NvtxParamsAllReduce, op)}
-  };
-  NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op};
-  NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload)
-
-  struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
-    sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
-    ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
-  return ncclEnqueueCheck(&info);
-}
diff --git a/src/collectives/broadcast.cc b/src/collectives/broadcast.cc
deleted file mode 100644
index c73502eedb..0000000000
--- a/src/collectives/broadcast.cc
+++ /dev/null
@@ -1,37 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "enqueue.h"
-#include "collectives.h"
-
-NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream) {
-  struct NvtxParamsBroadcast {
-    size_t bytes;
-    int root;
-  };
-  constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = {
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)}
-  };
-  NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root};
-  NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload)
-
-  struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
-    sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
-    BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
-  return ncclEnqueueCheck(&info);
-}
-/* Deprecated original "in place" function, similar to MPI */
-NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream) {
-  return ncclBroadcast(buff, buff, count, datatype, root, comm, stream);
-}
-
diff --git a/src/collectives/device/Makefile b/src/collectives/device/Makefile
deleted file mode 100644
index a2498a00c7..0000000000
--- a/src/collectives/device/Makefile
+++ /dev/null
@@ -1,76 +0,0 @@
-#
-# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
-#
-# See LICENSE.txt for license information
-#
-
-include ../../../makefiles/common.mk
-include ../../../makefiles/version.mk
-
-BUILDDIR ?= $(abspath ../../../build)
-OBJDIR := $(BUILDDIR)/obj/collectives/device
-
-LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu sendrecv.cu onerank_reduce.cu
-
-LIBSRCFILES += functions.cu
-
-DEPFILES   := $(patsubst %.cu, $(OBJDIR)/%.d, $(LIBSRCFILES))
-DEPENDFILES:= $(DEPFILES:%.d=%.dep)
-STATICLIB  := $(OBJDIR)/colldevice.a
-DEVOBJ     := $(OBJDIR)/devlink.o
-RULESFILE  := $(OBJDIR)/Makefile.rules
-
-NVCUFLAGS  += -I. -I.. -I$(BUILDDIR)/include -I../../include --compiler-options "-fPIC -fvisibility=hidden"
-
-
-all: $(STATICLIB)
-
-# Dummy rule so that the extra dependency (%.dep) files are preserved by make
-all_deps: $(DEPENDFILES)
-
-# Auto-generating the rules per op/reduction/datatype/algorithm
-$(RULESFILE) : gen_rules.sh
-	@printf "Generating %-35s > %s\n" rules $@
-	@mkdir -p $(OBJDIR)
-	@CUDA_MAJOR=${CUDA_MAJOR} CUDA_MINOR=${CUDA_MINOR} ./gen_rules.sh $(OBJDIR) > $@
-
--include $(RULESFILE)
-
-LIBOBJ     := $(GENOBJS) $(OBJDIR)/functions.o $(OBJDIR)/onerank_reduce.o
-
--include $(DEPFILES)
-
-$(STATICLIB): $(LIBOBJ) $(DEVOBJ)
-	@printf "Archiving  %-35s > %s\n" objects $@
-	ar cr $@ $^
-
-# We do not want make to build *.d when running make clean.
-# So we only provide targets for .dep which will produce .dep and .d,
-# with only .d being included, and .dep keeping track of what needs to
-# be regenerated.
-$(OBJDIR)/%.dep : %.cu
-	@mkdir -p $(OBJDIR)
-	@$(NVCC) $(NVCUFLAGS) -M $< -o $@.tmp
-	@sed "0,/^.*:/s//$(subst /,\/,$@):/" $@.tmp > $@
-	@sed -e 's/.*://' -e 's/\\$$//' < $@.tmp | fmt -1 | \
-                sed -e 's/^ *//' -e 's/$$/:/' >> $@
-	@rm -f $@.tmp
-	@cp $@ $(@:.dep=.d)
-
-# Compiled kernels and collectives with relocatable device code ...
-$(OBJDIR)/functions.o : functions.cu $(OBJDIR)/functions.dep
-	@printf "Compiling  %-35s > %s\n" $< $@
-	mkdir -p `dirname $@`
-	$(NVCC) $(NVCUFLAGS) -dc $< -o $@
-
-$(OBJDIR)/onerank_reduce.o : onerank_reduce.cu $(OBJDIR)/onerank_reduce.dep
-	@printf "Compiling  %-35s > %s\n" $< $@
-	mkdir -p `dirname $@`
-	$(NVCC) $(NVCUFLAGS) -dc $< -o $@
-
-# ... and create the device-side linked object with all those.
-$(DEVOBJ) : $(LIBOBJ)
-	$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
-
-clean:
-	rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(RULESFILE) $(STATICLIB)
diff --git a/src/collectives/device/all_gather.cu b/src/collectives/device/all_gather.cu
deleted file mode 100644
index 4022e2e9f5..0000000000
--- a/src/collectives/device/all_gather.cu
+++ /dev/null
@@ -1,11 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "all_gather.h"
-#include "common.h"
-#include "collectives.h"
-
-IMPL_COLL_C(AllGather);
diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu
deleted file mode 100644
index e7c3c28cfb..0000000000
--- a/src/collectives/device/all_reduce.cu
+++ /dev/null
@@ -1,11 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "all_reduce.h"
-#include "common.h"
-#include "collectives.h"
-
-IMPL_COLL_R(AllReduce);
diff --git a/src/collectives/device/broadcast.cu b/src/collectives/device/broadcast.cu
deleted file mode 100644
index 77595858bf..0000000000
--- a/src/collectives/device/broadcast.cu
+++ /dev/null
@@ -1,11 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "broadcast.h"
-#include "common.h"
-#include "collectives.h"
-
-IMPL_COLL_C(Broadcast);
diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu
deleted file mode 100644
index 52e9efe842..0000000000
--- a/src/collectives/device/functions.cu
+++ /dev/null
@@ -1,122 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "devcomm.h"
-#include "collectives.h"
-#include "common.h"
-
-__shared__ ncclShmemData ncclShmem;
-#if __CUDA_ARCH__ < 700
-  __shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)];
-#endif
-
-#define NCCL_FUNC5(func, algo, devredop, type, nullify) \
-  MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL,     devredop, type)), \
-  MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL128,  devredop, type)), \
-  MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, SIMPLE, devredop, type))
-
-#define NCCL_FUNC4(func, devredop, type, nullify) \
-  NCCL_FUNC5(func, TREE,    devredop, type, nullify), \
-  NCCL_FUNC5(func, RING,    devredop, type, nullify), \
-  NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, nullify), \
-  NCCL_FUNC5(func, COLLNET_CHAIN,  devredop, type, nullify), \
-  NCCL_FUNC5(func, NVLS,           devredop, type, nullify), \
-  NCCL_FUNC5(func, NVLS_TREE,      devredop, type, nullify)
-
-#if defined(__CUDA_BF16_TYPES_EXIST__)
-// Must be consistent with ncclDataType_t
-#define NCCL_FUNCS3A(func, devredop, nullForFloat) \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, uint8_t, 0), \
-  NCCL_FUNC4(func, devredop, int32_t, 0), \
-  NCCL_FUNC4(func, devredop, uint32_t, 0), \
-  NCCL_FUNC4(func, devredop, int64_t, 0), \
-  NCCL_FUNC4(func, devredop, uint64_t, 0), \
-  NCCL_FUNC4(func, devredop, half, nullForFloat), \
-  NCCL_FUNC4(func, devredop, float, nullForFloat), \
-  NCCL_FUNC4(func, devredop, double, nullForFloat), \
-  NCCL_FUNC4(func, devredop, __nv_bfloat16, nullForFloat)
-#define NCCL_FUNCS3B(func, devredop) \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0)
-#else
-// Must be consistent with ncclDataType_t
-#define NCCL_FUNCS3A(func, devredop, nullForFloat) \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, uint8_t, 0), \
-  NCCL_FUNC4(func, devredop, int32_t, 0), \
-  NCCL_FUNC4(func, devredop, uint32_t, 0), \
-  NCCL_FUNC4(func, devredop, int64_t, 0), \
-  NCCL_FUNC4(func, devredop, uint64_t, 0), \
-  NCCL_FUNC4(func, devredop, half, nullForFloat), \
-  NCCL_FUNC4(func, devredop, float, nullForFloat), \
-  NCCL_FUNC4(func, devredop, double, nullForFloat)
-#define NCCL_FUNCS3B(func, devredop) \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0)
-#endif
-
-// Must be consistent with ncclRedOp_t
-#define NCCL_FUNCS2A(func) \
-  NCCL_FUNCS3A(func, Sum,        /*nullForFloat=*/0), \
-  NCCL_FUNCS3A(func, Prod,       /*nullForFloat=*/0), \
-  NCCL_FUNCS3A(func, Max,        /*nullForFloat=*/0), \
-  NCCL_FUNCS3A(func, Min,        /*nullForFloat=*/0), \
-  NCCL_FUNCS3A(func, PreMulSum,  /*nullForFloat=*/0), \
-  NCCL_FUNCS3A(func, SumPostDiv, /*nullForFloat=*/1)
-
-#define NCCL_FUNCS2B(func) \
-  NCCL_FUNCS3B(func, Sum), \
-  NCCL_FUNCS3B(func, Sum), \
-  NCCL_FUNCS3B(func, Sum), \
-  NCCL_FUNCS3B(func, Sum), \
-  NCCL_FUNCS3B(func, Sum), \
-  NCCL_FUNCS3B(func, Sum)
-
-// Must be consistent with the ncclFuncSet enum
-__device__ ncclKern_t ncclFuncs[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
-// Don't try to initialize the host shadow copy of this device-side global
-// variable. There is no host pointer to a device-side function, which
-// confuses clang. This will be fixed in the next clang release.
-#if __CUDA_ARCH__
-  NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
-  NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t),
-  NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t),
-  NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t),
-  NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint32_t),
-  NCCL_ONERANK_REDUCE_NAME(PreMulSum, int64_t),
-  NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint64_t),
-  NCCL_ONERANK_REDUCE_NAME(PreMulSum, half),
-  NCCL_ONERANK_REDUCE_NAME(PreMulSum, float),
-  NCCL_ONERANK_REDUCE_NAME(PreMulSum, double),
-  #if defined(__CUDA_BF16_TYPES_EXIST__)
-    NCCL_ONERANK_REDUCE_NAME(PreMulSum, __nv_bfloat16),
-  #endif
-  NCCL_FUNCS2B(Broadcast),
-  NCCL_FUNCS2A(Reduce),
-  NCCL_FUNCS2B(AllGather),
-  NCCL_FUNCS2A(ReduceScatter),
-  NCCL_FUNCS2A(AllReduce)
-#endif
-};
-
-// Workaround for https://reviews.llvm.org/D55580
-__device__ void ncclWorkaroundClangD55580() {}
diff --git a/src/collectives/device/gen_rules.sh b/src/collectives/device/gen_rules.sh
deleted file mode 100755
index 8c7387c701..0000000000
--- a/src/collectives/device/gen_rules.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-#
-# Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
-#
-# See LICENSE.txt for license information
-#
-
-dir=$1
-
-datatypes="i8 u8 i32 u32 i64 u64 f16 f32 f64"
-if [ "$CUDA_MAJOR" -ge 11 ]
-then
-    datatypes+=" bf16"
-fi
-
-targets="GENOBJS := \\\\\n"
-
-for base in sendrecv all_reduce all_gather broadcast reduce reduce_scatter; do
-  opn=0
-  for op in sum prod min max premulsum sumpostdiv; do
-    dtn=0
-    # Order must match that of the ncclDataType_t enum
-    for dt in ${datatypes}; do
-      # Generate a unique filename for each compilation unit,
-      # otherwise the __nv_module_id may conflict at link time
-      echo "${dir}/${base}_${op}_${dt}.cu : ${base}.cu"
-      echo "	@printf \"Copying    %-35s > %s\\\\n\" \$< \$@"
-      echo "	cp \$< \$@"
-      echo ""
-      # Compile the file
-      echo "${dir}/${base}_${op}_${dt}.o : ${dir}/${base}_${op}_${dt}.cu ${base}.cu ${dir}/${base}.dep"
-
-      echo "	@printf \"Compiling  %-35s > %s\\\\n\" ${base}.cu ${dir}/${base}_${op}_${dt}.o"
-      echo "	mkdir -p ${dir}"
-      echo "	\${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc \$< -o \$@"
-      echo ""
-      targets="$targets\t${dir}/${base}_${op}_${dt}.o \\\\\n"
-      dtn=$(($dtn + 1))
-    done
-    opn=$(($opn + 1))
-  done
-done
-echo -e "$targets"
diff --git a/src/collectives/device/onerank_reduce.cu b/src/collectives/device/onerank_reduce.cu
deleted file mode 100644
index 552d1f2050..0000000000
--- a/src/collectives/device/onerank_reduce.cu
+++ /dev/null
@@ -1,62 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "devcomm.h"
-#include "collectives.h"
-#include "common_kernel.h"
-#include "common.h"
-
-namespace {
-  template<typename T, typename RedOp>
-  __device__ __forceinline__ void oneRankReduce() {
-    ncclWork *w = &ncclShmem.work;
-    int tid = threadIdx.x;
-    int tn = blockDim.x;
-    #pragma unroll 1
-    for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].isUsed; e++) {
-      ncclWorkElem *we = &w->elems[e];
-      intptr_t eltN = we->count;
-      int bid = we->bid;
-      int bn = we->nChannels;
-      T const *src = (T const*)we->sendbuff;
-      T *dst = (T*)we->recvbuff;
-
-      // each block/channel gets a roughly equal segment of 16 byte packs
-      constexpr int EltPerPack = 16/sizeof(T);
-      intptr_t packN = (eltN + EltPerPack-1) - (eltN + EltPerPack-1)%EltPerPack;
-      intptr_t i0 = (bid+0)*(packN/bn) + (bid+0 < packN%bn ? bid+0 : packN%bn);
-      intptr_t i1 = (bid+1)*(packN/bn) + (bid+1 < packN%bn ? bid+1 : packN%bn);
-      i0 *= EltPerPack;
-      i0 = i0 < eltN ? i0 : eltN;
-      i1 *= EltPerPack;
-      i1 = i1 < eltN ? i1 : eltN;
-      src += i0;
-      dst += i0;
-      void *vsrc = (void*)src;
-      void *vdst = (void*)dst;
-      reduceCopy<COLL_UNROLL, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/1>
-        (tid, tn, we->redOpArg, &(we->redOpArg), true, 1, &vsrc, 1, &vdst, i1-i0);
-    }
-  }
-}
-
-#define INSTANTIATE(devredop, type) \
-  __device__ void NCCL_ONERANK_REDUCE_NAME(devredop, type)() { \
-    oneRankReduce<type, Func##devredop<type>>(); \
-  }
-
-INSTANTIATE(PreMulSum, int8_t)
-INSTANTIATE(PreMulSum, uint8_t)
-INSTANTIATE(PreMulSum, int32_t)
-INSTANTIATE(PreMulSum, uint32_t)
-INSTANTIATE(PreMulSum, int64_t)
-INSTANTIATE(PreMulSum, uint64_t)
-INSTANTIATE(PreMulSum, half)
-#if defined(__CUDA_BF16_TYPES_EXIST__)
-INSTANTIATE(PreMulSum, __nv_bfloat16)
-#endif
-INSTANTIATE(PreMulSum, float)
-INSTANTIATE(PreMulSum, double)
diff --git a/src/collectives/device/reduce.cu b/src/collectives/device/reduce.cu
deleted file mode 100644
index 66f1bb2ec2..0000000000
--- a/src/collectives/device/reduce.cu
+++ /dev/null
@@ -1,11 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "reduce.h"
-#include "common.h"
-#include "collectives.h"
-
-IMPL_COLL_R(Reduce);
diff --git a/src/collectives/device/reduce_scatter.cu b/src/collectives/device/reduce_scatter.cu
deleted file mode 100644
index c2c6d42806..0000000000
--- a/src/collectives/device/reduce_scatter.cu
+++ /dev/null
@@ -1,11 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "reduce_scatter.h"
-#include "common.h"
-#include "collectives.h"
-
-IMPL_COLL_R(ReduceScatter);
diff --git a/src/collectives/device/sendrecv.cu b/src/collectives/device/sendrecv.cu
deleted file mode 100644
index 59e38b528e..0000000000
--- a/src/collectives/device/sendrecv.cu
+++ /dev/null
@@ -1,11 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "sendrecv.h"
-#include "common.h"
-#include "collectives.h"
-
-IMPL_COLL_P(SendRecv);
diff --git a/src/collectives/reduce.cc b/src/collectives/reduce.cc
deleted file mode 100644
index 63355162f7..0000000000
--- a/src/collectives/reduce.cc
+++ /dev/null
@@ -1,33 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "enqueue.h"
-#include "collectives.h"
-#include "nccl.h"
-
-NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
-  struct NvtxParamsReduce {
-    size_t bytes;
-    int root;
-    ncclRedOp_t op;
-  };
-  constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = {
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)},
-    {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
-      offsetof(NvtxParamsReduce, op)}
-  };
-  NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op};
-  NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload)
-
-  struct ncclInfo info = { ncclFuncReduce, "Reduce",
-    sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
-    REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
-  return ncclEnqueueCheck(&info);
-}
diff --git a/src/collectives/reduce_scatter.cc b/src/collectives/reduce_scatter.cc
deleted file mode 100644
index 5242545490..0000000000
--- a/src/collectives/reduce_scatter.cc
+++ /dev/null
@@ -1,31 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "enqueue.h"
-#include "collectives.h"
-#include "nccl.h"
-
-NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
-ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
-  struct NvtxParamsReduceScatter {
-    size_t bytes;
-    ncclRedOp_t op;
-  };
-  constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = {
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
-    {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
-      offsetof(NvtxParamsReduceScatter, op)}
-  };
-  NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op};
-  NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload)
-
-  struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
-    sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
-    REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
-  return ncclEnqueueCheck(&info);
-}
diff --git a/src/collectives/sendrecv.cc b/src/collectives/sendrecv.cc
deleted file mode 100644
index 9a81b0a935..0000000000
--- a/src/collectives/sendrecv.cc
+++ /dev/null
@@ -1,52 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "enqueue.h"
-#include "collectives.h"
-#include "argcheck.h" // Need some checks here since we access comm
-
-struct NvtxParamsSendRecv {
-    size_t bytes;
-    int peer;
-};
-constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = {
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)}
-};
-
-NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
-    ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
-    ncclComm_t comm, cudaStream_t stream) {
-  NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
-  NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload)
-
-  struct ncclInfo info = { ncclFuncSend, "Send",
-    NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
-    1, 1 };
-  ncclResult_t ret;
-  NCCLCHECK(ncclGroupStart());
-  ret = ncclEnqueueCheck(&info);
-  NCCLCHECK(ncclGroupEnd());
-  return ret;
-}
-
-NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
-    ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
-    ncclComm_t comm, cudaStream_t stream) {
-  NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
-  NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload)
-
-  struct ncclInfo info = { ncclFuncRecv, "Recv",
-    NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
-    1, 1 };
-  ncclResult_t ret;
-  NCCLCHECK(ncclGroupStart());
-  ret = ncclEnqueueCheck(&info);
-  NCCLCHECK(ncclGroupEnd());
-  return ret;
-}
diff --git a/src/debug.cc b/src/debug.cc
index b88fa5982a..21cec22faa 100644
--- a/src/debug.cc
+++ b/src/debug.cc
@@ -9,6 +9,7 @@
 #include <stdlib.h>
 #include <stdarg.h>
 #include <sys/syscall.h>
+#include "param.h"
 
 int ncclDebugLevel = -1;
 static int pid = -1;
@@ -25,7 +26,7 @@ static __thread int tid = -1;
 void ncclDebugInit() {
   pthread_mutex_lock(&ncclDebugLock);
   if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; }
-  const char* nccl_debug = getenv("NCCL_DEBUG");
+  const char* nccl_debug = ncclGetEnv("NCCL_DEBUG");
   int tempNcclDebugLevel = -1;
   if (nccl_debug == NULL) {
     tempNcclDebugLevel = NCCL_LOG_NONE;
@@ -45,7 +46,7 @@ void ncclDebugInit() {
    * This can be a comma separated list such as INIT,COLL
    * or ^INIT,COLL etc
    */
-  char* ncclDebugSubsysEnv = getenv("NCCL_DEBUG_SUBSYS");
+  const char* ncclDebugSubsysEnv = ncclGetEnv("NCCL_DEBUG_SUBSYS");
   if (ncclDebugSubsysEnv != NULL) {
     int invert = 0;
     if (ncclDebugSubsysEnv[0] == '^') { invert = 1; ncclDebugSubsysEnv++; }
@@ -97,7 +98,7 @@ void ncclDebugInit() {
    * then create the debug file. But don't bother unless the
    * NCCL_DEBUG level is > VERSION
    */
-  const char* ncclDebugFileEnv = getenv("NCCL_DEBUG_FILE");
+  const char* ncclDebugFileEnv = ncclGetEnv("NCCL_DEBUG_FILE");
   if (tempNcclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) {
     int c = 0;
     char debugFn[PATH_MAX+1] = "";
diff --git a/src/device/Makefile b/src/device/Makefile
new file mode 100644
index 0000000000..1e9311f1f0
--- /dev/null
+++ b/src/device/Makefile
@@ -0,0 +1,100 @@
+#
+# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+SHELL := /usr/bin/env bash
+MAKEFLAGS += -r
+.SUFFIXES:  # empty suffix list: disables the built-in suffix rules
+.SECONDARY: # no prerequisites: intermediate files are not auto-deleted
+
+NCCLDIR := ../..
+include $(NCCLDIR)/makefiles/common.mk
+include $(NCCLDIR)/makefiles/version.mk
+
+BUILDDIR ?= $(abspath ../../build)
+OBJDIR := $(BUILDDIR)/obj/device
+
+MANIFEST := $(OBJDIR)/manifest
+DEVGLUE_OBJ  := $(OBJDIR)/device_glue.o
+
+INCFLAGS  = -I. -I.. -I$(BUILDDIR)/include -I../include
+NVCUFLAGS += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden"
+CXXFLAGS  += $(INCFLAGS)
+# $(SAY) <label> <path>: print the label padded to 15 columns, then the path (shown relative to $(NCCLDIR) when it lies inside it).
+SAY = @bash -c 'path="$$2"; [[ "$$(realpath "$$2")" =~ ^$(subst .,\.,$(abspath $(NCCLDIR)))/(.*)$$ ]] && path="$${BASH_REMATCH[1]}"; printf "%-15s %s\n" "$$1" "$$path"' SAY
+# $(call COMPILE,<out>,<src>): announce the source, create the output directory, then run COMPILE.cu or COMPILE.cc picked by the source suffix.
+COMPILE.cu = $(NVCC) $(NVCUFLAGS) -dc $2 -o $1
+COMPILE.cc = $(CXX) $(CXXFLAGS) -c $2 -o $1
+define COMPILE
+@$(SAY) "Compiling" $2;\
+ mkdir -p $(dir $1);\
+ $(call COMPILE$(suffix $2),$1,$2)
+endef
+# $(call DEPENDS,<out.d>,<src>): run the compiler with -M, keep only prerequisites that resolve under $(NCCLDIR), and write "<out.o> <out.d>: <files>" to <out.d>.
+DEPENDS.cu = $(NVCC) $(NVCUFLAGS) -M -dc $1
+DEPENDS.cc = $(CXX) $(CXXFLAGS) -M -c $1
+define DEPENDS
+@$(SAY) "Dependencies" $2;\
+ mkdir -p $(dir $1);\
+ mk=$$($(call DEPENDS$(suffix $2),$2));\
+ [[ $$mk =~ ^[^:]*:(.*)$$ ]];\
+ files=$${BASH_REMATCH[1]};\
+ files=$$(for x in $$files; do case "$$x" in '\'|$$'\t') ;; *) echo "$$x"; esac; done);\
+ files=$$(for x in $$files; do [[ "$$(realpath "$$x")" == "$$(realpath "$(NCCLDIR)")"* ]] && echo "$$x"; done);\
+ echo "$(patsubst %.d,%.o,$1) $1: " $$files > $1
+endef
+
+all: $(MANIFEST)
+
+ifeq (1,1)
+# Case if the <gensrc> directory is generated on-demand:
+$(OBJDIR)/gensrc: generate.py
+	@mkdir -p $@
+	(which python3 >/dev/null || \
+	  (bar='!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'; \
+	   printf "\n$${bar}\nERROR: Building NCCL requires a Python 3 installation invokable as 'python3'.\n$${bar}\n\n" 1>&2; \
+	   exit 1)) \
+	&& ./generate.py $@ "$(ONLY_FUNCS)"
+else
+# Case if the <gensrc> directory is pre-generated and checked in the repo as ./gen:
+$(OBJDIR)/gensrc:
+	@mkdir -p $(OBJDIR); ln -srfn ./gen $@
+endif
+
+# The trailing ";" is necessary to make this an "empty recipe":
+# https://www.gnu.org/software/make/manual/html_node/Empty-Recipes.html
+$(OBJDIR)/gensrc/rules.mk: $(OBJDIR)/gensrc ;
+
+-include $(OBJDIR)/gensrc/rules.mk
+# "gensrc/rules.mk" populates $(LIB_OBJS_GEN)
+
+SRCS = common.cu onerank.cu
+
+LIB_OBJS = $(patsubst %, $(OBJDIR)/%.o, $(SRCS)) $(LIB_OBJS_GEN)
+# Objects for in-tree sources build from ./<name>; genobj objects build from $(OBJDIR)/gensrc/<name>. Each object also depends on its .d file.
+$(OBJDIR)/%.o: % $(OBJDIR)/%.d
+	$(call COMPILE,$@,$<)
+
+$(OBJDIR)/genobj/%.o: $(OBJDIR)/gensrc $(OBJDIR)/genobj/%.d
+	$(call COMPILE,$@,$(OBJDIR)/gensrc/$*)
+
+$(OBJDIR)/%.d: %
+	$(call DEPENDS,$@,$<)
+
+$(OBJDIR)/genobj/%.d: $(OBJDIR)/gensrc/%
+	$(call DEPENDS,$@,$<)
+# Device-link every object into $(DEVGLUE_OBJ), then write the list of all objects (including it) to the manifest file.
+$(DEVGLUE_OBJ): $(LIB_OBJS)
+	$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
+
+$(MANIFEST): $(LIB_OBJS) $(DEVGLUE_OBJ)
+	@echo $^ > $@
+
+-include $(wildcard $(OBJDIR)/*.d)
+-include $(wildcard $(OBJDIR)/genobj/*.d)
+
+.PHONY: clean
+clean:
+	rm -rf $(OBJDIR)
diff --git a/src/collectives/device/all_gather.h b/src/device/all_gather.h
similarity index 61%
rename from src/collectives/device/all_gather.h
rename to src/device/all_gather.h
index 76ae2a4182..0122499320 100644
--- a/src/collectives/device/all_gather.h
+++ b/src/device/all_gather.h
@@ -4,7 +4,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "devcomm.h"
+#include "device.h"
 #include "collectives.h"
 #include "primitives.h"
 
@@ -108,33 +108,65 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
     const ssize_t chunkSize = int(args->lastChunkSize);
     const ssize_t size = args->count;
     const ssize_t loopSize = nChannels*chunkSize;
+    const ssize_t rank = ncclShmem.comm.rank;
 
-    const int nThreadsGather = 128;
-    const int nThreadsBcast = 384 + WARP_SIZE;
+    const int nThreadsBcast = args->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : 4 * WARP_SIZE;
+    const int nThreadsGather = args->regUsed ? WARP_SIZE : NCCL_MAX_NTHREADS - nThreadsBcast;
     const int tidEndGather = nThreadsGather;
     const int tidEndBcast = tidEndGather + nThreadsBcast;
 
-    using Proto = ProtoSimple<1, 1>;
-
-    if (tid < tidEndGather) {
-      // Gather
-      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
-        prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
-           args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0);
-      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + bid*chunkSize;
-        int nelem = min(chunkSize, size-offset);
-        prims.gather(offset, nvls->nHeads*size, nelem, size, -1, 0);
+    if (!args->regUsed) {
+      if (tid < tidEndGather) {
+        // Gather
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
+        Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
+          prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
+            args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          ssize_t offset = gridOffset + bid * chunkSize;
+          int nelem = min(chunkSize, size - offset);
+          prims.gather(offset, nvls->nHeads * size, nelem, size, -1, 0);
+        }
+      } else if (tid < tidEndBcast) {
+        // Bcast through NVLS
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
+        Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
+          prims(tid - tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL,
+            args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          ssize_t offset = gridOffset + bid * chunkSize;
+          int nelem = min(chunkSize, size - offset);
+          prims.send(offset, nelem);
+        }
       }
-    } else if (tid < tidEndBcast) {
-      // Bcast through NVLS
-      Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
-        prims(tid-tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL,
-           args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
-      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + bid*chunkSize;
-        int nelem = min(chunkSize, size-offset);
-        prims.send(offset, nelem);
+    } else {
+      /* direct allgather */
+      if (tid < tidEndGather) {
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
+        Primitives<T, RedOp, FanSymmetric<NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
+          prims(tid, nThreadsGather, nvls->up, nvls->up, NULL, NULL,
+            args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
+
+        /* used as sync */
+        prims.scatter(0, 0, 0, 0, -1, 0);
+
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          prims.gather(0, 0, 0, 0, -1, 0);
+        }
+      } else if (tid < tidEndBcast) {
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
+        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
+          prims(tid - tidEndGather, nThreadsBcast, &nvls->down, &nvls->down, args->sendbuff, NULL,
+            args->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0, args);
+        /* used as sync */
+        prims.recv(0, 0);
+
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          ssize_t inpOffset = gridOffset + bid * chunkSize;
+          ssize_t outOffset = inpOffset + rank * size;
+          int nelem = min(chunkSize, size - inpOffset);
+          prims.directSend(inpOffset, outOffset, nelem);
+        }
       }
     }
   }
diff --git a/src/collectives/device/all_reduce.h b/src/device/all_reduce.h
similarity index 85%
rename from src/collectives/device/all_reduce.h
rename to src/device/all_reduce.h
index 32597f1769..bf37dfe962 100644
--- a/src/collectives/device/all_reduce.h
+++ b/src/device/all_reduce.h
@@ -4,7 +4,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "devcomm.h"
+#include "device.h"
 #include "collectives.h"
 #include "primitives.h"
 
@@ -377,7 +377,6 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
   __device__ __forceinline__ void run(ncclWorkElem *args) {
-  #if NCCL_NVLS_ENABLED
     const int tid = threadIdx.x;
     const int bid = args->bid;
     const int nChannels = args->nChannels;
@@ -387,10 +386,11 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
     const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize;
     const int nranks = ncclShmem.comm.nRanks;
     const bool hasOut = nvls->out != -1;
-    const int reduceWarps = hasOut ? 3 : nranks <= 6 ? 7 : 5;
-    const int bcastWarps = hasOut ? 2 : 0;
-    const int scatterWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps + 1)/2;
-    const int gatherWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps)/2;
+    const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE;
+    const int bcastWarps = hasOut ? (args->regUsed ? ((totalWarps - 2) >> 1) - 1 : 2) : 0;
+    const int reduceWarps = args->regUsed ? (totalWarps - bcastWarps - 2) : (hasOut ? 3 : nranks <= 6 ? 7 : 5);
+    const int scatterWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1;
+    const int gatherWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1;
 
     const int nThreadsScatter = scatterWarps*WARP_SIZE;
     const int nThreadsGather  = gatherWarps*WARP_SIZE;
@@ -406,67 +406,65 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
       using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
       Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
         prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
-           args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
+          args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
-        int nelem = min(nvls->nHeads*chunkSize, size-offset);
+        ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
+        int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
         prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
       }
     } else if (tid < tidEndGather) {
       // Gather
       using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
       Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
-        prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
-           args->redOpArg, 1*Proto::MaxGroupWidth, 1, 1);
+        prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
+          args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
-        int nelem = min(nvls->nHeads*chunkSize, size-offset);
+        ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
+        int nelem = args->regUsed ? 0 :min(nvls->nHeads * chunkSize, size - offset);
         prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
       }
     } else if (tid < tidEndReduce && nvls->headRank != -1) {
       if (!hasOut) {
         // Reduce, broadcast through NVLS
         using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
-        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
-          prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
-             args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
+        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
+          prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
+            args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
-          int nelem = min(chunkSize, size-offset);
-          prims.recvSend(nelem);
+          ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
+          int nelem = min(chunkSize, size - offset);
+          prims.directRecvDirectSend(offset, offset, nelem);
         }
       } else {
         // Reduce, send to network
         using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
-        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
-          prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL,
-             args->redOpArg, 2*Proto::MaxGroupWidth, 0, 1);
+        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
+          prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL,
+            args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, args);
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
-          int nelem = min(chunkSize, size-offset);
-          prims.recvSend(nelem);
+          ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
+          int nelem = min(chunkSize, size - offset);
+          prims.directRecvDirectSend(offset, offset, nelem);
         }
       }
     } else if (tid < tidEndBcast && nvls->headRank != -1) {
       // Recv from network, broadcast
       using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
-      Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
-        prims(tid-tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL,
-           args->redOpArg, 3*Proto::MaxGroupWidth, 0, 0);
+      Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
+        prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL,
+          args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
-        int nelem = min(chunkSize, size-offset);
-        prims.recvSend(nelem);
+        ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
+        int nelem = min(chunkSize, size - offset);
+        prims.directRecvDirectSend(offset, offset, nelem);
       }
     }
-  #endif // NCCL_NVLS_ENABLED
   }
 };
 
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PROTO_SIMPLE> {
   __device__ __forceinline__ void run(ncclWorkElem *args) {
-  #if NCCL_NVLS_ENABLED
     const int tid = threadIdx.x;
     const int bid = args->bid;
     const int nChannels = args->nChannels;
@@ -478,10 +476,11 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
     const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize;
     const int nranks = ncclShmem.comm.nRanks;
     const bool hasUp = treeUp != -1;
-    const int reduceWarps = hasUp ? 5 : nranks <= 6 ? 7 : 5;
-    const int bcastWarps = hasUp ? 4 : 0;
-    const int scatterWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps + 1)/2;
-    const int gatherWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps)/2;
+    const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE;
+    const int bcastWarps = hasUp ? (args->regUsed ? ((totalWarps - 2) >> 1) - 1 : 4) : 0;
+    const int reduceWarps = args->regUsed ? (totalWarps - bcastWarps - 2) : (hasUp ? 5 : nranks <= 6 ? 7 : 5);
+    const int scatterWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1;
+    const int gatherWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1;
 
     const int nThreadsScatter = scatterWarps*WARP_SIZE;
     const int nThreadsGather  = gatherWarps*WARP_SIZE;
@@ -497,60 +496,59 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
       using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
       Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
         prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
-           args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
+          args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
-        int nelem = min(nvls->nHeads*chunkSize, size-offset);
+        ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
+        int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
         prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
       }
     } else if (tid < tidEndGather) {
       // Gather
       using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
       Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
-        prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
-           args->redOpArg, 1*Proto::MaxGroupWidth, 1, 1);
+        prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
+          args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
-        int nelem = min(nvls->nHeads*chunkSize, size-offset);
+        ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
+        int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
         prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
       }
     } else if (tid < tidEndReduce && nvls->headRank != -1) {
       if (!hasUp) {
         // Reduce and Broadcast
         using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
-        Primitives<T, RedOp, FanSymmetric<3>, /*Direct=*/0, Proto, 0>
-          prims(tid-tidEndGather, nThreadsReduce, treeDown, treeDown, NULL, NULL,
-             args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
+        Primitives<T, RedOp, FanSymmetric<3>, /*Direct=*/1, Proto, 0>
+          prims(tid - tidEndGather, nThreadsReduce, treeDown, treeDown, NULL, NULL,
+            args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
-          int nelem = min(chunkSize, size-offset);
-          prims.recvSend(nelem);
+          ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
+          int nelem = min(chunkSize, size - offset);
+          prims.directRecvDirectSend(offset, offset, nelem);
         }
       } else {
         // Reduce, send to network
         using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
-        Primitives<T, RedOp, FanAsymmetric<3, 1>, /*Direct=*/0, Proto, 0>
-          prims(tid-tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL,
-              args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
+        Primitives<T, RedOp, FanAsymmetric<3, 1>, /*Direct=*/1, Proto, 0>
+          prims(tid - tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL,
+            args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
         for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
-          int nelem = min(chunkSize, size-offset);
-          prims.recvSend(nelem);
+          ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
+          int nelem = min(chunkSize, size - offset);
+          prims.directRecvDirectSend(offset, offset, nelem);
         }
       }
     } else if (tid < tidEndBcast && nvls->headRank != -1) {
       // Recv from network, broadcast
       using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
-      Primitives<T, RedOp, FanAsymmetric<1, 3>, /*Direct=*/0, Proto, 0>
-        prims(tid-tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL,
-           args->redOpArg, 3*Proto::MaxGroupWidth, 0, 0);
+      Primitives<T, RedOp, FanAsymmetric<1, 3>, /*Direct=*/1, Proto, 0>
+        prims(tid - tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL,
+          args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
-        int nelem = min(chunkSize, size-offset);
-        prims.recvSend(nelem);
+        ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
+        int nelem = min(chunkSize, size - offset);
+        prims.directRecvDirectSend(offset, offset, nelem);
       }
     }
-  #endif // NCCL_NVLS_ENABLED
   }
 };
 
diff --git a/src/collectives/device/broadcast.h b/src/device/broadcast.h
similarity index 99%
rename from src/collectives/device/broadcast.h
rename to src/device/broadcast.h
index ebe4381206..15bf841d50 100644
--- a/src/collectives/device/broadcast.h
+++ b/src/device/broadcast.h
@@ -4,7 +4,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "devcomm.h"
+#include "device.h"
 #include "collectives.h"
 #include "primitives.h"
 
diff --git a/src/device/common.cu b/src/device/common.cu
new file mode 100644
index 0000000000..d1b6acd1bc
--- /dev/null
+++ b/src/device/common.cu
@@ -0,0 +1,24 @@
+/*************************************************************************
+ * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "device.h"
+#include "collectives.h"
+#include "common.h"
+
+__shared__ ncclShmemData ncclShmem;  // Definition of the per-block shared-memory state that device code reads as ncclShmem.comm / ncclShmem.work.
+#if __CUDA_ARCH__ < 700  // NOTE(review): also true when __CUDA_ARCH__ is undefined, since an undefined macro evaluates to 0 in #if.
+  __shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)];  // ncclShmemScratchWarpSize() for each of the NCCL_MAX_NTHREADS/WARP_SIZE warps, counted in ulong2 elements.
+#endif
+
+struct RunWorkNop {  // Work runner that does nothing; used as the SpecializedRunWork template argument of ncclKernelMain.
+  __device__ void run(ncclWork *w) {}  // Same call shape as RunWork<...>().run(&ncclShmem.work), with an empty body.
+};
+
+__global__ void ncclDevKernel_Generic(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) {  // Kernel entry point that has no specialized function of its own.
+  ncclKernelMain<-1, RunWorkNop>(comm, channelMask, workHead);  // SpecializedFnId = -1 can never pass ncclKernelMain's `0 <= SpecializedFnId` test, so every work item is called through ncclDevFuncTable and RunWorkNop is never run.
+}
+
+__device__ void ncclDevFunc_Nop() {}  // Device function with an empty body. NOTE(review): presumably a filler entry for ncclDevFuncTable; confirm against the generated table.
diff --git a/src/collectives/device/common.h b/src/device/common.h
similarity index 61%
rename from src/collectives/device/common.h
rename to src/device/common.h
index accf8371a3..97581f738d 100644
--- a/src/collectives/device/common.h
+++ b/src/device/common.h
@@ -8,19 +8,23 @@
 #define NCCL_DEVICE_COMMON_H_
 
 #include "collectives.h"
-#include "devcomm.h"
+#include "device.h"
 #include "op128.h"
+#include "network/unpack/unpack_defs.h"
 
 #define COLL_UNROLL (ncclCollUnroll())
 
-typedef void(*ncclKern_t)();
-extern __device__ ncclKern_t ncclFuncs[];
+typedef void(*ncclDevFuncPtr_t)();
+extern __device__ ncclDevFuncPtr_t const ncclDevFuncTable[];
 
 struct ncclShmemGroup {
   ncclConnInfo *recvConns[NCCL_MAX_NVLS_ARITY];
   ncclConnInfo *sendConns[NCCL_MAX_NVLS_ARITY];
   void* srcs[NCCL_MAX_NVLS_ARITY+1];
   void* dsts[NCCL_MAX_NVLS_ARITY+1];
+  union {
+    unpackGroupShmem unpack;
+  } devicePlugin;
 };
 
 struct ncclShmemData {
@@ -31,6 +35,9 @@ struct ncclShmemData {
   alignas(16) struct ncclDevComm comm;
   alignas(16) struct ncclDevChannel channel;
   alignas(16) struct ncclWork work;
+  alignas(16) union {
+    unpackShmem unpack;
+  } devicePlugin;
 };
 static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned");
 
@@ -111,10 +118,8 @@ static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
   }
 }
 
-template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto, int FnIndex>
-__device__ void ncclKernel(
-    struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead
-  )  {
+template<int SpecializedFnId, typename SpecializedRunWork>
+__device__ void ncclKernelMain(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) {
   int tid = threadIdx.x;
 
   // To map blockId to channelId, we need the n'th set bit of channelMask which
@@ -166,7 +171,7 @@ __device__ void ncclKernel(
       bytes = 0;
       break;
     }
-    copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
+    if (bytes) copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
   }
   __syncthreads(); // publish ncclShmem
 
@@ -184,10 +189,10 @@ __device__ void ncclKernel(
     }
     __syncthreads();
 
-    if (ncclShmem.work.header.funcIndex == FnIndex) {
-      RunWork<Fn, T, RedOp, Algo, Proto>().run(&ncclShmem.work);
+    if (0 <= SpecializedFnId && ncclShmem.work.header.funcIndex == (unsigned)SpecializedFnId) {
+      SpecializedRunWork().run(&ncclShmem.work);
     } else {
-      ncclFuncs[ncclShmem.work.header.funcIndex]();
+      ncclDevFuncTable[ncclShmem.work.header.funcIndex]();
     }
 
     int workIxNext = ncclShmem.work.header.workNext;
@@ -204,94 +209,17 @@ __device__ void ncclKernel(
   }
 }
 
-// Only generate kernels for SUM
-#if NCCL_OP == 0
-#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \
-__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)( \
-    struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead \
-  ) { \
-  ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex> \
-    (comm, channelMask, workHead); \
-}
-#else
-#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fInded)
-#endif
+__global__ void ncclDevKernel_Generic(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead);  // Declaration only; the definition is in common.cu.
+__device__ void ncclDevFunc_Nop();  // Declaration only; common.cu defines it with an empty body.
 
-// Examples :     AllReduce, RING, LL,    Sum,   uint8
-#define IMPL_COLL_FUNC(func, algo, proto, devredop, type) \
-__device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \
-  RunWork<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem.work); \
-}
+#define DEFINE_ncclDevKernel(suffix, coll, redop, ty, algo, proto, specializedFnId) /* Emits kernel ncclDevKernel_<suffix>: work whose funcIndex equals specializedFnId runs RunWork<coll, ty, redop<ty>, algo, proto> directly; all other work is called through ncclDevFuncTable. */ \
+  __global__ void ncclDevKernel_##suffix(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
+    ncclKernelMain<specializedFnId, RunWork<coll, ty, redop<ty>, algo, proto>>(comm, channelMask, workHead); \
+  }
 
-// Only generate inline kernels for LL
-#define IMPL_COLL4(func, algo, devredop, type, ncclType) \
-  IMPL_COLL_FUNC(func, algo, LL,     devredop, type) \
-  IMPL_COLL_FUNC(func, algo, LL128,  devredop, type) \
-  IMPL_COLL_FUNC(func, algo, SIMPLE, devredop, type) \
-  IMPL_COLL_KERN(func, algo, LL,     devredop, type, FUNC_INDEX(ncclFunc##func, ncclDev##devredop, ncclType, NCCL_ALGO_##algo, NCCL_PROTO_LL)) \
-
-#define IMPL_COLL3(func, devredop, type, ncclType) \
-  IMPL_COLL4(func, TREE,    devredop, type, ncclType) \
-  IMPL_COLL4(func, RING,    devredop, type, ncclType) \
-  IMPL_COLL4(func, COLLNET_DIRECT, devredop, type, ncclType) \
-  IMPL_COLL4(func, COLLNET_CHAIN, devredop, type, ncclType) \
-  IMPL_COLL4(func, NVLS, devredop, type, ncclType) \
-  IMPL_COLL4(func, NVLS_TREE, devredop, type, ncclType)
-
-#if NCCL_TYPE == 0
-#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, int8_t,   ncclInt8)
-#elif NCCL_TYPE == 1
-#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, uint8_t,  ncclUint8)
-#elif NCCL_TYPE == 2
-#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, int32_t,  ncclInt32)
-#elif NCCL_TYPE == 3
-#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, uint32_t, ncclUint32)
-#elif NCCL_TYPE == 4
-#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, int64_t,  ncclInt64)
-#elif NCCL_TYPE == 5
-#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, uint64_t, ncclUint64)
-#elif NCCL_TYPE == 6
-#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, half,     ncclFloat16)
-#elif NCCL_TYPE == 7
-#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, float,    ncclFloat32)
-#elif NCCL_TYPE == 8
-#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, double,   ncclFloat64)
-#elif NCCL_TYPE == 9 && defined(__CUDA_BF16_TYPES_EXIST__)
-#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, __nv_bfloat16, ncclBfloat16)
-#endif
-
-// Reduction define all functions
-#if NCCL_OP == 0
-#define IMPL_COLL_R(func) IMPL_COLL2(func, Sum);
-#elif NCCL_OP == 1
-#define IMPL_COLL_R(func) IMPL_COLL2(func, Prod);
-#elif NCCL_OP == 2
-#define IMPL_COLL_R(func) IMPL_COLL2(func, Min);
-#elif NCCL_OP == 3
-#define IMPL_COLL_R(func) IMPL_COLL2(func, Max);
-#elif NCCL_OP == 4
-#define IMPL_COLL_R(func) IMPL_COLL2(func, PreMulSum);
-#elif NCCL_OP == 5
-  #if NCCL_TYPE < 6
-    #define IMPL_COLL_R(func) IMPL_COLL2(func, SumPostDiv);
-  #else
-    #define IMPL_COLL_R(func) // skip SumPostDiv for floating point
-  #endif
-#endif
-
-#if NCCL_OP == 0 && NCCL_TYPE == 0
-// Copy primitives only define one function for copy
-#define IMPL_COLL_C(func) IMPL_COLL3(func, Sum, int8_t, ncclInt8);
-
-// Point-to-point primitives only have one function/kernel.
-#define IMPL_COLL_P(func) \
-  IMPL_COLL_FUNC(func, RING, SIMPLE, Sum, int8_t); \
-  IMPL_COLL_KERN(func, RING, SIMPLE, Sum, int8_t, FUNC_INDEX_P2P);
-#else
-#define IMPL_COLL_C(func)
-#define IMPL_COLL_P(func)
-#endif
-
-#define NCCL_NVLS_ENABLED (__CUDA_ARCH__ >= 900 && NCCL_NVLS_SUPPORTS(NCCL_TYPE, NCCL_OP))
+#define DEFINE_ncclDevFunc(suffix, coll, redop, ty, algo, proto) /* Emits device function ncclDevFunc_<suffix>, which runs RunWork<coll, ty, redop<ty>, algo, proto> on ncclShmem.work. */ \
+  __device__ void ncclDevFunc_##suffix() { \
+    RunWork<coll, ty, redop<ty>, algo, proto>().run(&ncclShmem.work); \
+  }
 
 #endif
diff --git a/src/collectives/device/common_kernel.h b/src/device/common_kernel.h
similarity index 97%
rename from src/collectives/device/common_kernel.h
rename to src/device/common_kernel.h
index 6af8da57ea..bfeb87fdf3 100644
--- a/src/collectives/device/common_kernel.h
+++ b/src/device/common_kernel.h
@@ -7,7 +7,7 @@
 #ifndef NCCL_COMMON_KERNEL_H_
 #define NCCL_COMMON_KERNEL_H_
 
-#include "devcomm.h"
+#include "device.h"
 #include "op128.h"
 #include "reduce_kernel.h"
 #include <cstdio>
@@ -81,13 +81,13 @@ __device__ __forceinline__ void reduceCopyPacks(
       for (int u=0; u < Unroll; u++) {
         if (0 < MultimemSrcs) {
           // applyLoadMultimem uses relaxed semantics for same reason we use volatile below.
-          acc[u] = applyLoadMultimem<RedFn, BytePerPack>(preFn, minSrcs[0]);
+          acc[u] = applyLoadMultimem<RedFn, BytePerPack>(redFn, minSrcs[0]);
         } else {
           // Use volatile loads in case credits are polled for with volatile (instead of acquire).
           acc[u] = ld_volatile_global<BytePerPack>(minSrcs[0]);
+          if (0 < PreOpSrcs) acc[u] = applyPreOp(preFn, acc[u]);
         }
         minSrcs[0] += WARP_SIZE*BytePerPack;
-        if (0 < PreOpSrcs) acc[u] = applyPreOp(preFn, acc[u]);
       }
     }
 
@@ -99,7 +99,7 @@ __device__ __forceinline__ void reduceCopyPacks(
       for (int u=0; u < Unroll; u++) {
         if (s < MultimemSrcs) {
           // applyLoadMultimem uses relaxed semantics for same reason we use volatile below.
-          acc[u] = applyLoadMultimem<RedFn, BytePerPack>(preFn, minSrcs[s]);
+          acc[u] = applyLoadMultimem<RedFn, BytePerPack>(redFn, minSrcs[s]);
         } else {
           // Use volatile loads in case credits are polled for with volatile (instead of acquire).
           tmp[u] = ld_volatile_global<BytePerPack>(minSrcs[s]);
diff --git a/src/device/generate.py b/src/device/generate.py
new file mode 100755
index 0000000000..0b053de17e
--- /dev/null
+++ b/src/device/generate.py
@@ -0,0 +1,405 @@
+#!/usr/bin/env python3
+import os
+import sys
+
+# Order of redops, tys, protos, algos must match src/include/device.h
+all_colls =  ["Broadcast","Reduce","AllGather","ReduceScatter","AllReduce","SendRecv"]
+all_redops = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"]
+all_tys =    ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16"]
+all_protos = ["LL","LL128","SIMPLE"]
+all_algos =  ["TREE","RING","COLLNET_DIRECT","COLLNET_CHAIN","NVLS","NVLS_TREE"]
+
+################################################################################
+# The first command line argument is the path to the directory to generate and
+# populate.
+
+gensrc = sys.argv[1]
+
+if os.path.exists(gensrc):
+  for name in os.listdir(gensrc):
+    os.remove(os.path.join(gensrc, name))
+    #os.truncate(os.path.join(gensrc, name), 0)
+else:
+  os.mkdir(gensrc)
+
+################################################################################
+# The second command line argument is used as a regex to filter the functions
+# which make it into libnccl. This is helpful for reducing the binary when
+# developing device code. The regex supports non-space containing globs '*',
+# parentheses '(x)', and union 'a|b'. The string representing the function has
+# one of the forms:
+#
+# SendRecv
+# (AllGather|Broadcast) <algo> <proto>
+# (AllReduce|Reduce|ReduceScatter) <redop> <type> <algo> <proto>
+#
+# The possible values for redop, type, algo, proto can be found in the all_<foo>
+# lists at the top of this file.
+#
+# Since the Makefile forwards this from the ONLY_FUNCS variable, useful command
+# line examples are given:
+"""
+# Only send/recv:
+make ONLY_FUNCS="SendRecv"
+
+# Only non-reductions:
+make ONLY_FUNCS="AllGather * *|Broadcast * *|SendRecv"
+
+# Only AllReduce sum f32 (but all algos, protos)
+make ONLY_FUNCS="AllReduce Sum f32 * *"
+
+# Only AllReduce minmax i32 NVLS (but all protos)
+make ONLY_FUNCS="AllReduce MinMax i32 NVLS *"
+
+# AllReduce sum <all floats> RING LL128
+make ONLY_FUNCS="AllReduce Sum (f*|bf16) RING LL128"
+"""
+
+# Paste all non-None arguments together with `sep`.
+def paste(sep, *args):
+  return sep.join(x for x in args if x is not None)
+
+func_pattern = sys.argv[2:3]
+if func_pattern and func_pattern[0]:
+  import re
+  # Glob '*' matches a run of non-space chars. Group the pattern so the '$'
+  # anchor binds to every 'a|b' alternative, not only the last one.
+  func_pattern = "(?:%s)$" % func_pattern[0].replace("*", "[^ ]*")
+  def func_filter(*fn):
+    return None is not re.match(func_pattern, paste(" ", *fn), flags=re.IGNORECASE)
+else:
+  def func_filter(coll, redop, ty, algo, proto):
+    return True
+
+################################################################################
+
+algos_of_coll = {
+  "AllGather":     ["RING","NVLS"],
+  "AllReduce":     all_algos,
+  "Broadcast":     ["RING"],
+  "Reduce":        ["RING"],
+  "ReduceScatter": ["RING","NVLS"],
+  "SendRecv":      [None]
+}
+
+coll_camel_to_lower = {
+  "AllGather":     "all_gather",
+  "AllReduce":     "all_reduce",
+  "Broadcast":     "broadcast",
+  "Reduce":        "reduce",
+  "ReduceScatter": "reduce_scatter",
+  "SendRecv":      "sendrecv"
+}
+coll_lower_to_camel = {coll_camel_to_lower[x]: x for x in coll_camel_to_lower}
+
+################################################################################
+
+# Returns pair of minimum required values for (CUDART_VERSION, __CUDA_ARCH__)
+# or None if function is never supported. Note that (0, 0) encodes universal
+# support.
+def required_cuda(coll, redop, ty, algo, proto):
+  cudart, arch = 0, 0
+  # kernels mapped to by coll="Nop" functions have coll="Generic"
+  if coll in ("SendRecv", "Generic", "Nop"): return (cudart, arch)
+
+  if proto!="SIMPLE" and algo not in ("RING","TREE"): return None
+
+  if coll in ("AllReduce","Reduce","ReduceScatter"):
+    if redop=="SumPostDiv" and ty[0] not in ("i","u"): return None
+    if ty=="bf16": cudart = max(cudart, 11000)
+
+  if "NVLS" in algo:
+    if coll in ("AllReduce","Reduce","ReduceScatter"):
+      # Must match ncclNvlsSupported() in src/include/device.h
+      nvls_ok = ((ty in ("i32","u32","i64","u64") and redop in ("Sum","MinMax")) or
+                 (ty in ("f32","f64") and redop=="Sum") or
+                 (ty in ("f16","bf16") and redop in ("Sum","MinMax")))
+      if not nvls_ok: return None
+    cudart = max(cudart, 12010)
+    arch = max(arch, 900)
+
+  return (cudart, arch)
+
+# Maps functions to the chosen representative for the equivalence class it
+# belongs to. For instance (sum, signed int) maps to (sum, unsigned int).
+def equivalent_primary(coll, redop, ty, algo, proto):
+  if coll in ("AllReduce", "Reduce", "ReduceScatter"):
+    # map signed integer sum/prod to unsigned
+    if redop in ("Sum","Prod","PreMulSum") and ty[0]=="i":
+      return (coll, redop, "u"+ty[1:], algo, proto)
+    # map signed integer min/max to unsigned for non-NVLS
+    if redop=="MinMax" and ty[0]=="i" and ("NVLS" not in algo):
+      return (coll, redop, "u"+ty[1:], algo, proto)
+  return (coll, redop, ty, algo, proto)
+
+# Map to another func representing the best kernel to use. Every distinct value
+# returned will instantiate a ncclDevKernel specialized to run this func
+# without function call overhead.
+def best_kernel(coll, redop, ty, algo, proto):
+  def best(coll, redop, ty, algo, proto):
+    # Modify this logic to control how many kernels are specialized.
+    if coll=="Nop": return ("Generic", None, None, None, None)
+    if coll=="SendRecv": return ("SendRecv", None, None, None, None)
+    if coll in ("AllGather","Broadcast"): return (coll, None, None, "RING", "LL")
+    return (coll, "Sum", ty, ("TREE" if algo=="TREE" else "RING"), "LL")
+  # Need to ensure the kernel is specialized for a primary function
+  kfn = equivalent_primary(*best(coll, redop, ty, algo, proto))
+  # and that it isn't filtered out; otherwise fall back to the Generic kernel.
+  if not func_filter(*kfn): return ("Generic", None, None, None, None)
+  return kfn
+
+# Order rows are enumerated must match formula of `ncclDevFuncId()`:
+def enumerate_func_rows():
+  yield ("SendRecv", None, None, None, None)
+  for coll in ("AllGather", "Broadcast"):
+    algos = algos_of_coll[coll]
+    for algo in algos:
+      for proto in all_protos:
+        yield (coll, None, None, algo, proto)
+  for coll in ("AllReduce", "Reduce", "ReduceScatter"):
+    algos = algos_of_coll[coll]
+    for redop in all_redops:
+      for ty in all_tys:
+        for algo in algos:
+          for proto in all_protos:
+            yield (coll, redop, ty, algo, proto)
+
+################################################################################
+
+def is_built(coll, redop, ty, algo, proto):
+  built = required_cuda(coll, redop, ty, algo, proto)
+  built = built and func_filter(coll, redop, ty, algo, proto)
+  return built
+
+# Returns None if required_cuda(...) is None.
+# Returns the coll="Nop" function if developer has filtered it out.
+# Otherwise just returns func it was given.
+def validate(coll, redop, ty, algo, proto):
+  valid = required_cuda(coll, redop, ty, algo, proto)
+  built = valid and func_filter(coll, redop, ty, algo, proto)
+  if built: return (coll, redop, ty, algo, proto)
+  if valid: return ("Nop", None, None, None, None)
+  return None
+
+# Corresponds to ncclDevFuncRowToId[]
+func_rows = [validate(*fn) for fn in enumerate_func_rows()]
+
+# Corresponds to ncclDevFuncTable[]
+primary_funcs = sorted(set(equivalent_primary(*fn) for fn in func_rows if fn is not None))
+
+# primary_to_index[primary_funcs[i]] == i
+primary_to_index = {fn: i for (i,fn) in zip(range(len(primary_funcs)), primary_funcs)}
+
+kernel_funcs = sorted(set(best_kernel(*fn) for fn in primary_funcs))
+
+################################################################################
+
+# Generate <gensrc>/device_table.cu
+with open(os.path.join(gensrc, "device_table.cu"), "w") as f:
+  out = f.write
+  out('#include "common.h"\n')
+  out("\n")
+
+  for fn in primary_funcs:
+    sym = paste("_", "ncclDevFunc", *fn)
+    cudart, arch = required_cuda(*fn)
+    if (cudart, arch) != (0, 0):
+      out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch))
+    out("__device__ void %s();\n" % sym)
+    if (cudart, arch) != (0, 0):
+      out("#endif\n")
+  out("\n")
+
+  out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable[] = {\n");
+  index = 0
+  for fn in primary_funcs:
+    sym = paste("_", "ncclDevFunc", *fn)
+    cudart, arch = required_cuda(*fn)
+    if (cudart, arch) != (0, 0):
+      out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart ,arch))
+    out("/*%4d*/ %s,\n" % (index, sym))
+    if (cudart, arch) != (0, 0):
+      out("#else\n" "/*%4d*/ nullptr,\n" "#endif\n" % index)
+    index += 1
+  out("nullptr};\n")
+  out("\n")
+
+  out("// Workaround for https://reviews.llvm.org/D55580\n"
+      "__device__ void ncclWorkaroundClangD55580() {}\n")
+
+# Generate <gensrc>/host_table.cc
+with open(os.path.join(gensrc, "host_table.cc"), "w") as f:
+  out = f.write
+  out('#include "device.h"\n')
+  out("\n")
+
+  # The mapping from function rows to valid primary function ids.
+  out("extern int const ncclDevFuncRowToId[] = {\n")
+  index = 0
+  for fn in func_rows:
+    fn_id, comment = -1, ""
+    if fn is not None:
+      fn_id = primary_to_index[equivalent_primary(*fn)]
+      comment = " // " + paste(" ", *fn)
+    out("/*%4d*/ %d,%s\n" % (index, fn_id, comment))
+    index += 1
+  out("-1};\n")
+  out("\n")
+
+  # Forward declarations of kernels.
+  for kfn in kernel_funcs:
+    cudart, _ = required_cuda(*kfn)
+    sym = paste("_", "ncclDevKernel", *kfn)
+    if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart)
+    out("__global__ void %s(struct ncclDevComm*, uint64_t, struct ncclWork*);\n" % sym)
+    if cudart != 0: out("#endif\n")
+  out("\n")
+
+  # List of all kernel function pointers.
+  out("extern int const ncclDevKernelCount = %d;\n" % len(kernel_funcs))
+  out("extern void* const ncclDevKernelList[] = {\n")
+  index = 0
+  for kfn in kernel_funcs:
+    cudart, _ = required_cuda(*kfn)
+    sym = paste("_", "ncclDevKernel", *kfn)
+    if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart)
+    out("/*%4d*/ (void*)%s,\n" % (index, sym));
+    if cudart != 0: out("#else\n" "/*%4d*/ nullptr,\n" "#endif\n" % index)
+    index += 1
+  out("nullptr};\n")
+  out("\n")
+
+  # Maps primary id to kernel function pointer.
+  out("extern void* const ncclDevKernelForFunc[] = {\n")
+  index = 0
+  for fn in primary_funcs:
+    kfn = best_kernel(*fn)
+    sym = paste("_", "ncclDevKernel", *kfn)
+    cudart, _ = required_cuda(*kfn)
+    if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart)
+    out("/*%4d*/ (void*)%s,\n" % (index, sym))
+    if cudart != 0: out("#else\n" "/*%4d*/ nullptr,\n" "#endif\n" % index)
+    index += 1
+  out("nullptr};\n")
+  out("\n")
+
+  # Does the prior map use an explicitly specialized kernel.
+  out("extern bool const ncclDevKernelForFuncIsSpecialized[] = {\n")
+  index = 0
+  for fn in primary_funcs:
+    kfn = best_kernel(*fn)
+    specialized = "1" if fn == kfn else "0"
+    out("/*%4d*/ %s,\n" % (index, specialized))
+    index += 1
+  out("0};\n")
+
+# Maps to .cu filename which implements this func. The only constraint is that
+# "coll" is reflected in the name: formally that no two funcs having different
+# coll's map to the same filename.
+def impl_filename(coll, redop, ty, algo, proto):
+  return "%s.cu" % paste("_", coll_camel_to_lower[coll], redop and redop.lower(), ty)
+
+# Partition the functions and kernels to the .cu filenames. The partition is
+# a dictionary mapping filename to (coll, func-tuple list)
+def partition_by_name(fns):
+  ans = {}
+  for fn in fns:
+    name = impl_filename(*fn)
+    coll = fn[0]
+    if name not in ans:
+      ans[name] = (coll, [])
+    ans[name][1].append(fn)
+  return ans
+
+name_to_funcs = partition_by_name(fn for fn in primary_funcs if fn[0]!="Nop")
+name_to_kernels = partition_by_name(kfn for kfn in kernel_funcs if kfn[0]!="Generic")
+
+# Generate <gensrc>/rules.mk
+with open(os.path.join(gensrc, "rules.mk"), "w") as f:
+  out = f.write
+  impl_names = sorted(name_to_funcs.keys())
+  names = impl_names + ["host_table.cc", "device_table.cu"]
+  out("LIB_OBJS_GEN = $(patsubst %, $(OBJDIR)/genobj/%.o, {names})\n"
+      .format(names=" ".join(names)))
+  out("\n")
+
+  # For each <coll>_<op>_<ty>.cu compile to a .cu.o file. Notice the dependencies
+  # come from the suffix-erased file (e.g. 'gensrc/all_reduce.cu')
+  for name in impl_names:
+    coll = name_to_funcs[name][0]
+    out(
+      "$(OBJDIR)/genobj/{name}.o: $(OBJDIR)/gensrc $(OBJDIR)/genobj/{lower_coll}.cu.d\n"
+      "\t" "$(call COMPILE,$@,$(OBJDIR)/gensrc/{name})\n"
+      "\n"
+      .format(name=name, lower_coll=coll_camel_to_lower[coll])
+    )
+
+# Add the suffix-erased .cu's which are used only for dependency scraping.
+for coll in set(coll for (coll,_,_,_,_) in primary_funcs if coll!="Nop"):
+  name = impl_filename(coll, None, None, None, None)
+  if name not in name_to_funcs:
+    name_to_funcs[name] = (coll, [])
+
+redop_to_cxx = {
+  None: "FuncCopy",
+  "Sum": "FuncSum",
+  "Prod": "FuncProd",
+  "MinMax": "FuncMinMax",
+  "PreMulSum": "FuncPreMulSum",
+  "SumPostDiv": "FuncSumPostDiv"
+}
+
+ty_to_cxx = {
+  None: "int8_t",
+  "i8": "int8_t",
+  "u8": "uint8_t",
+  "i32": "int32_t",
+  "u32": "uint32_t",
+  "i64": "int64_t",
+  "u64": "uint64_t",
+  "f16": "half",
+  "f32": "float",
+  "f64": "double",
+  "bf16": "__nv_bfloat16"
+}
+
+# Generate each <gensrc>/<impl>.cu:
+for name in name_to_funcs.keys():
+  (coll, fns) = name_to_funcs[name]
+  with open(os.path.join(gensrc, name), "w") as f:
+    out = f.write
+    out(
+      '#include "common.h"\n'
+      '#include "{lower_coll}.h"\n'
+      .format(lower_coll=coll_camel_to_lower[coll])
+    )
+
+    (_, kfns) = name_to_kernels.get(name) or (None, [])
+    for kfn in kfns:
+      (coll, redop, ty, algo, proto) = kfn
+      sym = paste("_", coll, redop, ty, algo, proto)
+      fn_id = primary_to_index[kfn]
+      cudart, arch = required_cuda(*kfn)
+      if (cudart, arch) != (0, 0):
+        out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch))
+      out(
+        "DEFINE_ncclDevKernel({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto}, {fn_id})\n"
+        .format(sym=sym, coll=coll, redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty],
+                algo=(algo or "RING"), proto=(proto or "SIMPLE"), fn_id=fn_id)
+      )
+      if (cudart, arch) != (0, 0):
+        out("#endif\n")
+
+    for fn in fns:
+      (coll, redop, ty, algo, proto) = fn
+      sym = paste("_", coll, redop, ty, algo, proto)
+      cudart, arch = required_cuda(*fn)
+      if (cudart, arch) != (0, 0):
+        out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch))
+      out(
+        "DEFINE_ncclDevFunc({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto})\n"
+        .format(sym=sym, coll=coll, redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty],
+                algo=(algo or "RING"), proto=(proto or "SIMPLE"))
+      )
+      if (cudart, arch) != (0, 0):
+        out("#endif\n")
diff --git a/src/device/network/unpack/unpack.h b/src/device/network/unpack/unpack.h
new file mode 100644
index 0000000000..3bc910047d
--- /dev/null
+++ b/src/device/network/unpack/unpack.h
@@ -0,0 +1,280 @@
+/*************************************************************************
+ * Copyright (c) 2023, Google LLC.  All rights reserved.
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef NET_DEVICE_UNPACK_H
+#define NET_DEVICE_UNPACK_H
+
+#include "unpack_defs.h"
+
+#include "op128.h"
+#include "align.h"
+#include "device.h"
+#include "common.h"
+
+// #define ALIGNED_LOAD
+
+inline __device__ void load64gpu(const uint64_t* ptr, uint64_t &v) {
+  #if __CUDA_ARCH__ >= 700
+      asm volatile("ld.relaxed.gpu.u64 {%0}, [%1];"
+      : "=l"(v) : "l"(ptr));
+  #else
+      asm volatile("ld.volatile.global.u64 {%0}, [%1];"
+      : "=l"(v) : "l"(ptr));
+  #endif
+}
+
+#define PAGE_META_SIZE 16
+#define META_LOAD_SIZE 16
+#define DATA_LOAD_SIZE 16
+
+// Map internal association of handle with group and peer index (called once at init time)
+inline __device__ void ncclNetDeviceUnpackSetup(void* ohandle, const int group, const int index) {
+  struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle;
+  ncclShmem.groups[group].devicePlugin.unpack.g_meta[index] = handle->meta;
+  ncclShmem.devicePlugin.unpack.bounce_buf = handle->bounce_buf;
+  ncclShmem.groups[group].devicePlugin.unpack.head = handle->head;
+}
+
+inline __device__ void ncclNetDeviceIncrementHead(const int group) {
+  ncclShmem.groups[group].devicePlugin.unpack.head++;
+}
+
+inline __device__ void ncclNetDeviceSaveHead(void* ohandle, const int group) {
+  struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle;
+  handle->head = ncclShmem.groups[group].devicePlugin.unpack.head;
+}
+
+template <uint8_t sz>
+inline __device__ void bulkLoad(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<sz> *reg, const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){
+  bulkLoad<1>(t, len, cpy_src, cpy_dst, reg, w, g_meta, s_meta, src_off, dst_off);
+}
+
+template <>
+inline __device__ void bulkLoad<1>(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<1> reg[16], const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){
+  uint64_t data_s;
+  for (data_s = t * DATA_LOAD_SIZE; data_s + DATA_LOAD_SIZE - 1 < len; data_s += WARP_SIZE * DATA_LOAD_SIZE) {
+
+#ifdef ALIGNED_LOAD
+    load128 ((uint64_t*)(cpy_src + data_s), reg.u64[0], reg.u64[1]);
+#else
+#pragma unroll
+    for (int i=0; i<16; i++) {
+      reg[i] = ld_volatile_global<1>((uintptr_t)((uint8_t*)(cpy_src + data_s) + i));
+    }
+#endif
+
+#pragma unroll
+    for (int i=0; i<16; i++) {
+      st_global<1>((uintptr_t)((uint8_t*)(cpy_dst + data_s) + i), reg[i]);
+    }
+  }
+}
+
+template <>
+inline __device__ void bulkLoad<2>(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<2> reg[8], const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){
+  uint64_t data_s;
+  for (data_s = t * DATA_LOAD_SIZE; data_s + DATA_LOAD_SIZE - 1 < len; data_s += WARP_SIZE * DATA_LOAD_SIZE) {
+#ifdef ALIGNED_LOAD
+    load128 ((uint64_t*)(cpy_src + data_s), reg.u64[0], reg.u64[1]);
+#else
+#pragma unroll
+    for (int i=0; i<8; i++) {
+      reg[i] = ld_volatile_global<2>((uintptr_t)((uint16_t*)(cpy_src + data_s) + i));
+    }
+#endif
+
+
+#pragma unroll
+    for (int i=0; i<8; i++) {
+      st_global<2>((uintptr_t)((uint16_t*)(cpy_dst + data_s) + i), reg[i]);
+    }
+  }
+}
+
+template <>
+inline __device__ void bulkLoad<4>(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<4> reg[4], const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){
+  uint64_t data_s;
+  for (data_s = t * DATA_LOAD_SIZE; data_s + DATA_LOAD_SIZE - 1 < len; data_s += WARP_SIZE * DATA_LOAD_SIZE) {
+#ifdef ALIGNED_LOAD
+    load128 ((uint64_t*)(cpy_src + data_s), reg.u64[0], reg.u64[1]);
+#else
+#pragma unroll
+    for (int i=0; i<4; i++) {
+      reg[i] = ld_volatile_global<4>((uintptr_t)((uint32_t *)(cpy_src + data_s) + i));
+    }
+#endif
+
+#pragma unroll
+    for (int i=0; i<4; i++) {
+      st_global<4>((uintptr_t)((uint32_t*)(cpy_dst + data_s) + i), reg[i]);
+    }
+  }
+}
+
+template <>
+inline __device__ void bulkLoad<8>(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<8> reg[2], const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){
+  uint64_t data_s;
+  for (data_s = t * DATA_LOAD_SIZE; data_s + DATA_LOAD_SIZE - 1 < len; data_s += WARP_SIZE * DATA_LOAD_SIZE) {
+#ifdef ALIGNED_LOAD
+    load128 ((uint64_t*)(cpy_src + data_s), reg.u64[0], reg.u64[1]);
+#else
+#pragma unroll
+    for (int i=0; i<2; i++) {
+      reg[i] = ld_volatile_global<8>((uintptr_t)((uint64_t*)(cpy_src + data_s) + i));
+    }
+#endif
+
+#pragma unroll
+    for (int i=0; i<2; i++) {
+      st_global<8>((uintptr_t)((uint64_t*)(cpy_dst + data_s) + i), reg[i]);
+    }
+  }
+}
+
+template <>
+inline __device__ void bulkLoad<16>(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<16> reg[1], const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){
+  uint64_t data_s;
+  for (data_s = t * DATA_LOAD_SIZE; data_s + DATA_LOAD_SIZE - 1 < len; data_s += WARP_SIZE * DATA_LOAD_SIZE) {
+    reg[0] = ld_volatile_global<16>((uintptr_t)(cpy_src + data_s));
+    st_global<16>((uintptr_t)(cpy_dst + data_s), reg[0]);
+  }
+}
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE 4096
+#endif
+inline __device__ int ppw(const int nbytes, int nw) {
+  int v = DIVUP(nbytes, SLICE_PAGE_SIZE);
+  v = DIVUP(v, nw);
+  while (v > WARP_SHM_PAGE_CNT) {
+    v = DIVUP(v, 2);
+  }
+  return v;
+}
+
+// This function is called by all threads
+// Pack data from the internal iovec to the supplied flat buffer using all the
+// threads
+template <int Recv>
+inline __device__ void ncclNetDeviceUnpack(
+    const int tid, const int tidInBlock, const int nworkers, const int group, int mask, int Src, int workSize);
+
+template <>
+inline __device__ void ncclNetDeviceUnpack</*Recv=*/0>(
+    const int tid, const int tidInBlock, const int nworkers, const int group, int mask, int Src, int workSize) {
+  // send unpack empty
+}
+
+inline __device__ void ncclNetDeviceUnpackInner(
+    const int tid, const int tidInBlock, const int nworkers, const int group, const int index,
+    void *src, const int nbytes, const uint64_t step);
+
+template <>
+inline __device__ void ncclNetDeviceUnpack</*Recv=*/1>(
+    const int tid, const int tidInBlock, const int nworkers, const int group, int mask, int Src, int workSize) {
+
+  while (mask != 0) {
+    int ix = __ffs(mask)-1; // Get the first set bit of the mask (this should correlate to a peer index)
+    mask &= mask-1; // Drop the first set bit of the mask
+
+    // Pack data from the internal iovec to the supplied flat srcs buffer using all the threads
+    // + Src is necessary in the case of accessing the user buffer directly
+    ncclNetDeviceUnpackInner(tid, tidInBlock, nworkers, group /* in case they need to use split warps shared memory partitioning*/,
+        ix, ncclShmem.groups[group].srcs[ix + Src], workSize, ncclShmem.groups[group].devicePlugin.unpack.head);
+  }
+}
+
+inline __device__ void ncclNetDeviceUnpackInner(
+    const int tid, const int tidInBlock, const int nworkers, const int group, const int index,
+    void *src, const int nbytes, const uint64_t step) {
+  // from src/collectives/device/common_kernel.h
+  const int w = tid / WARP_SIZE;        // Warp number
+  const int nw = nworkers / WARP_SIZE;  // Number of warps
+  const int t = tid % WARP_SIZE;        // Thread (inside the warp)
+
+  BytePack<16> reg;
+  loadMeta meta;
+
+  uint64_t head;
+  struct netUnpackMeta* g_meta_struct;
+  void* bounce_buf;
+
+  loadMeta* g_meta;
+  loadMeta* s_meta;
+  uint64_t meta_cnt;
+
+  // hack head use per-warp
+  head          = step;
+  g_meta_struct = ncclShmem.groups[group].devicePlugin.unpack.g_meta[index];
+  bounce_buf    = ncclShmem.devicePlugin.unpack.bounce_buf;
+
+  __syncwarp();
+
+  head %= NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH;
+
+  g_meta = g_meta_struct->mem[head];
+
+  // Currently, even/odd groups perform send/recv separately. We don't really need space for send side.
+  // Total size is N page per warp * 16 B per page * 20 WARPS max = 320 * N bytes, N == WARP_SHM_PAGE_CNT
+  static_assert(ncclShmemScratchWarpSize() >= WARP_SHM_SIZE, "Each warp must have enough scratch space");
+  s_meta = (loadMeta*) ncclScratchForWarp(tidInBlock / WARP_SIZE); // (loadMeta*) (ncclShmem.devicePlugin.unpack.meta + shm_off);
+
+  load64gpu(g_meta_struct->cnt + head, meta_cnt);
+
+  int PPW = ppw(nbytes, nw);
+
+  for (uint64_t meta_s = w * PPW; meta_s < meta_cnt; meta_s += nw * PPW) {
+
+    uint64_t iter_meta_cnt = meta_cnt - meta_s;
+    iter_meta_cnt = iter_meta_cnt < PPW ? iter_meta_cnt : PPW;
+
+    // TODO: this load size needs to work if not aligned, but since the two are both 16...
+    if (t < PPW * PAGE_META_SIZE / META_LOAD_SIZE && t < iter_meta_cnt) {  // avoid last iter load garbage data
+      load128((const uint64_t*) (g_meta + (meta_s + t)), reg.u64[0], reg.u64[1]);
+
+      storeShmem128(shmemCvtPtr((uint64_t *)(s_meta + (w * PPW + t))), reg.u64[0], reg.u64[1]);
+    }
+
+    __syncwarp();
+
+    for (int x = 0; x < iter_meta_cnt; x++) {
+      int meta_idx = x + w * PPW;
+      
+      // load page offs
+      loadShmem128(shmemCvtPtr((uint64_t*) (s_meta + meta_idx)), meta.r64[0], meta.r64[1]);
+
+      if (meta.len >= DATA_LOAD_SIZE) {
+        // fast path, but need to adapt to alignment issue
+
+        // bulk copy data
+        uint8_t align_off = (meta.src_off | meta.dst_off) % DATA_LOAD_SIZE;
+        align_off = align_off & -align_off;  // keep the lowest bit
+        if (align_off == 0) {  // 0x16
+          bulkLoad<16>(t, meta.len, (char*) bounce_buf + meta.src_off, (char*) src + meta.dst_off, &reg, w, g_meta, s_meta, meta.src_off, meta.dst_off);
+        } else if (align_off & 0x8) {
+          bulkLoad<8>(t, meta.len, (char*) bounce_buf + meta.src_off, (char*) src + meta.dst_off, (BytePack<8>*) &reg, w, g_meta, s_meta, meta.src_off, meta.dst_off);
+        } else if (align_off & 0x4) {
+          bulkLoad<4>(t, meta.len, (char*) bounce_buf + meta.src_off, (char*) src + meta.dst_off, (BytePack<4>*) &reg, w, g_meta, s_meta, meta.src_off, meta.dst_off);
+        } else if (align_off & 0x2) {
+          bulkLoad<2>(t, meta.len, (char*) bounce_buf + meta.src_off, (char*) src + meta.dst_off, (BytePack<2>*) &reg, w, g_meta, s_meta, meta.src_off, meta.dst_off);
+        } else { // if (align_off & 0x1)
+          bulkLoad<1>(t, meta.len, (char*) bounce_buf + meta.src_off, (char*) src + meta.dst_off, (BytePack<1>*) &reg, w, g_meta, s_meta, meta.src_off, meta.dst_off);
+        }
+      }
+
+      // must be less than 16 bytes
+      if (t < meta.len % DATA_LOAD_SIZE) {
+        volatile char* cpy_src = (char*) bounce_buf + meta.src_off + (meta.len / DATA_LOAD_SIZE) * DATA_LOAD_SIZE + t;
+        volatile char* cpy_dst = (char*) src        + meta.dst_off + (meta.len / DATA_LOAD_SIZE) * DATA_LOAD_SIZE + t;
+        *cpy_dst = *cpy_src;
+      }
+    }
+
+    __syncwarp();
+  }
+}
+
+#endif  // NET_DEVICE_UNPACK_H
diff --git a/src/device/network/unpack/unpack_defs.h b/src/device/network/unpack/unpack_defs.h
new file mode 100644
index 0000000000..9be1c5e424
--- /dev/null
+++ b/src/device/network/unpack/unpack_defs.h
@@ -0,0 +1,61 @@
+/*************************************************************************
+ * Copyright (c) 2023, Google LLC.  All rights reserved.
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef NET_DEVICE_UNPACK_DEFS_H
+#define NET_DEVICE_UNPACK_DEFS_H
+
+#include <stdint.h>
+
+#include "device.h"
+
+#define NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH 16
+
+union alignas(16) loadMeta {  // one 16-byte descriptor, moved as two 64-bit words
+  uint64_t r64[2];            // raw view of the whole descriptor
+  struct {
+    uint32_t src_off;         // byte offset added to bounce_buf by the unpack code
+    uint32_t len;             // number of bytes to copy
+    uint64_t dst_off;         // byte offset added to the destination buffer
+  };
+};
+static_assert(sizeof(union loadMeta) == 16, "loadMeta must be exactly 16 bytes");
+
+/****** global memory ******/
+
+#define NET_UNPACK_MAX_QUEUE_DEPTH 16  // MAX_REQUESTS
+#define NET_UNPACK_MAX_SLICE_SIZE 4194304  // 4MB per Irecv call
+#define SLICE_PAGE_SIZE 4096
+#define NET_UNPACK_MAX_SLICE_PAGES \
+  (NET_UNPACK_MAX_SLICE_SIZE / SLICE_PAGE_SIZE * 2)  // * 2 for slack, wasteful..
+
+struct netUnpackMeta {
+  loadMeta mem[NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH][NET_UNPACK_MAX_SLICE_PAGES];
+  uint64_t cnt[NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH];
+};
+
+struct unpackNetDeviceHandle {
+  struct netUnpackMeta *meta;  // mapped
+  void* bounce_buf;
+  uint64_t head;
+};
+
+/****** shared memory ******/
+
+#define NET_UNPACK_MAX_GROUPS 16 // Forked from NCCL_MAX_GROUPS in devcomm.h
+#define NET_UNPACK_MAX_NPEERS 2  // The most you should have is 2 network peers per-group (indexed by index)
+#define WARP_SHM_PAGE_CNT 4
+#define WARP_SHM_SIZE (WARP_SHM_PAGE_CNT * sizeof(union loadMeta))
+struct unpackShmem {
+  void* bounce_buf;
+};
+
+struct unpackGroupShmem {
+  int unpackNetDeviceIndexMask; // We store a single unpackNetDeviceIndex because only one peer can be network recv
+  uint64_t head;
+  struct netUnpackMeta* g_meta[NET_UNPACK_MAX_NPEERS]; // head of handle to index into meta for meta copy
+};
+
+#endif // NET_DEVICE_UNPACK_DEFS_H
diff --git a/src/device/onerank.cu b/src/device/onerank.cu
new file mode 100644
index 0000000000..5ff4a85b10
--- /dev/null
+++ b/src/device/onerank.cu
@@ -0,0 +1,79 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "alloc.h"
+#include "collectives.h"
+#include "common_kernel.h"
+#include "common.h"
+#include <cuda_runtime.h>
+
+namespace {
+  // Single-rank kernel: streams nElts elements of type RedOp::EltType from src
+  // to dst through reduceCopy with RedOp's pre-op enabled (PreOpSrcs=1).
+  // redOpArg is the op's scalar, or its address when redOpArgIsPtr is set.
+  template<typename RedOp>
+  __global__ __launch_bounds__(512, 1)
+  void oneRankReduce(void* dst, void* src, size_t nElts, uint64_t redOpArg, bool redOpArgIsPtr) {
+    using T = typename RedOp::EltType;
+    int tid = threadIdx.x, tn = blockDim.x;
+    int bid = blockIdx.x, bn = gridDim.x;
+
+    // Each block gets a roughly equal segment of 16 byte packs. The per-block
+    // share is rounded UP before aligning so that bn segments always cover
+    // all nElts (truncating nElts/bn could drop up to bn-1 tail elements).
+    constexpr int EltPerPack = 16/sizeof(T);
+    size_t perBlock = alignUp((nElts + bn-1)/bn, EltPerPack);
+    intptr_t i0 = (bid+0)*perBlock;
+    intptr_t i1 = (bid+1)*perBlock;
+    i0 = min(i0, nElts);
+    i1 = min(i1, nElts);
+    src = (T*)src + i0;
+    dst = (T*)dst + i0;
+
+    if (redOpArgIsPtr) {
+      // Pick the widest load the pointer's alignment allows.
+      if (redOpArg%2 != 0) redOpArg = *reinterpret_cast<uint8_t*>(redOpArg);
+      else if (redOpArg%4 != 0) redOpArg = *reinterpret_cast<uint16_t*>(redOpArg);
+      else if (redOpArg%8 != 0) redOpArg = *reinterpret_cast<uint32_t*>(redOpArg);
+      else redOpArg = *reinterpret_cast<uint64_t*>(redOpArg);
+    }
+    reduceCopy<COLL_UNROLL, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/1>
+      (tid, tn, redOpArg, &redOpArg, true, 1, &src, 1, &dst, i1-i0);
+  }
+}
+
+// Single-rank path: only PreMulSum changes the data; any other op is a copy.
+ncclResult_t ncclLaunchOneRank(void* dst, void const* src, size_t nElts, struct ncclDevRedOpFull redOp, ncclDataType_t eltType, cudaStream_t stream) {
+  size_t eltSize = ncclTypeSize(eltType);
+  if (redOp.op != ncclDevPreMulSum) {
+    if (dst != src) { NCCLCHECK(ncclCudaMemcpyAsync((char*)dst, (char*)src, nElts*eltSize, stream)); }
+    return ncclSuccess;
+  }
+  void const* kernel;
+  switch (eltType) {
+  case ncclInt8:     kernel = (void const*)&oneRankReduce<FuncPreMulSum<int8_t>>; break;
+  case ncclUint8:    kernel = (void const*)&oneRankReduce<FuncPreMulSum<uint8_t>>; break;
+  case ncclInt32:    kernel = (void const*)&oneRankReduce<FuncPreMulSum<int32_t>>; break;
+  case ncclUint32:   kernel = (void const*)&oneRankReduce<FuncPreMulSum<uint32_t>>; break;
+  case ncclInt64:    kernel = (void const*)&oneRankReduce<FuncPreMulSum<int64_t>>; break;
+  case ncclUint64:   kernel = (void const*)&oneRankReduce<FuncPreMulSum<uint64_t>>; break;
+  case ncclFloat16:  kernel = (void const*)&oneRankReduce<FuncPreMulSum<half>>; break;
+  #if defined(__CUDA_BF16_TYPES_EXIST__)
+  case ncclBfloat16: kernel = (void const*)&oneRankReduce<FuncPreMulSum<__nv_bfloat16>>; break;
+  #endif
+  case ncclFloat32:  kernel = (void const*)&oneRankReduce<FuncPreMulSum<float>>; break;
+  case ncclFloat64:  kernel = (void const*)&oneRankReduce<FuncPreMulSum<double>>; break;
+  default: return ncclInvalidArgument;
+  }
+  // A zero-sized grid is an invalid launch configuration, so skip empty work.
+  if (nElts == 0) return ncclSuccess;
+  dim3 grid = {0, 1, 1};
+  grid.x = std::min(32, (int)divUp(nElts*eltSize, 16<<10));
+  dim3 block = {512, 1, 1};
+  void* args[5] = {&dst, &src, &nElts, &redOp.scalarArg, &redOp.scalarArgIsPtr};
+  CUDACHECK(cudaLaunchKernel(kernel, grid, block, args, 0, stream));
+  return ncclSuccess;
+}
diff --git a/src/collectives/device/op128.h b/src/device/op128.h
similarity index 82%
rename from src/collectives/device/op128.h
rename to src/device/op128.h
index 8ce18ef600..b2f8227b05 100644
--- a/src/collectives/device/op128.h
+++ b/src/device/op128.h
@@ -161,21 +161,25 @@ __device__ __forceinline__ T fromPack(typename BytePackOf<T>::Pack pack)  {
 // Load/store of BytePack<?> using integral addresses.
 
 template<int Size> __device__ BytePack<Size> ld_global(uintptr_t addr);
-template<int Size> __device__ BytePack<Size> ld_volatile_global(uintptr_t addr);
 template<int Size> __device__ BytePack<Size> ld_shared(uint32_t addr);
+template<int Size> __device__ BytePack<Size> ld_volatile_global(uintptr_t addr);
 template<int Size> __device__ BytePack<Size> ld_volatile_shared(uint32_t addr);
+template<int Size> __device__ BytePack<Size> ld_relaxed_gpu_global(uintptr_t addr);
 template<int Size> __device__ void st_global(uintptr_t addr, BytePack<Size> value);
 template<int Size> __device__ void st_shared(uint32_t addr, BytePack<Size> value);
+template<int Size> __device__ void st_relaxed_gpu_global(uintptr_t addr, BytePack<Size> value);
 
 template<> __device__ __forceinline__ BytePack<0> ld_global<0>(uintptr_t addr) { return {}; }
-template<> __device__ __forceinline__ BytePack<0> ld_volatile_global<0>(uintptr_t addr) { return {}; }
 template<> __device__ __forceinline__ BytePack<0> ld_shared<0>(uint32_t addr) { return {}; }
+template<> __device__ __forceinline__ BytePack<0> ld_volatile_global<0>(uintptr_t addr) { return {}; }
 template<> __device__ __forceinline__ BytePack<0> ld_volatile_shared<0>(uint32_t addr) { return {}; }
+template<> __device__ __forceinline__ BytePack<0> ld_relaxed_gpu_global<0>(uintptr_t addr) { return {}; }
 template<> __device__ __forceinline__ void st_global<0>(uintptr_t addr, BytePack<0> value) {}
 template<> __device__ __forceinline__ void st_shared<0>(uint32_t addr, BytePack<0> value) {}
+template<> __device__ __forceinline__ void st_relaxed_gpu_global<0>(uintptr_t addr, BytePack<0> value) {}
 
 // Used to define implementations for above prototypes.
-#define DEFINE_ld_st(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, space, addr_cxx_ty, addr_reg_ty) \
+#define DEFINE_ld_st__size_space(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, space, addr_cxx_ty, addr_reg_ty) \
   template<> \
   __device__ __forceinline__ BytePack<bytes> ld_##space<bytes>(addr_cxx_ty addr) { \
     data_cxx_ty tmp; \
@@ -197,19 +201,44 @@ template<> __device__ __forceinline__ void st_shared<0>(uint32_t addr, BytePack<
     data_cxx_ty tmp = value.native; \
     asm volatile("st." #space "." #data_ptx_ty " [%0], %1;" :: #addr_reg_ty(addr), #data_reg_ty(tmp) : "memory"); \
   }
+
+#if __CUDA_ARCH__ >= 700
+  #define PTX_relaxed_gpu "relaxed.gpu"
+#else
+  #define PTX_relaxed_gpu "volatile"
+#endif
+
+#define DEFINE_ld_st_gpu_relaxed__size(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty) /* specializes ld_/st_relaxed_gpu_global for one access size */ \
+  template<> \
+  __device__ __forceinline__ BytePack<bytes> ld_relaxed_gpu_global<bytes>(uintptr_t addr) { \
+    data_cxx_ty tmp; \
+    asm volatile("ld." PTX_relaxed_gpu ".global." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : "l"(addr) : "memory"); /* volatile + "memory": the compiler must not merge, hoist or drop this load */ \
+    BytePack<bytes> ans; \
+    ans.native = tmp; \
+    return ans; \
+  } \
+  template<> \
+  __device__ __forceinline__ void st_relaxed_gpu_global<bytes>(uintptr_t addr, BytePack<bytes> value) { \
+    data_cxx_ty tmp = value.native; \
+    asm volatile("st." PTX_relaxed_gpu ".global." #data_ptx_ty " [%0], %1;" :: "l"(addr), #data_reg_ty(tmp) : "memory"); \
+  }
+
+#define DEFINE_ld_st__size(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty) \
+  DEFINE_ld_st__size_space(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, global, uintptr_t, l) \
+  DEFINE_ld_st__size_space(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, shared, uint32_t, r) \
+  DEFINE_ld_st_gpu_relaxed__size(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty)
+
 // Single-byte types use 4-byte registers since there is no 1-byte register
 // character for asm blocks. See https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html#constraints
-DEFINE_ld_st(1, uint32_t, b8, r, global, uintptr_t, l)
-DEFINE_ld_st(1, uint32_t, b8, r, shared, uint32_t, r)
-DEFINE_ld_st(2, uint16_t, b16, h, global, uintptr_t, l)
-DEFINE_ld_st(2, uint16_t, b16, h, shared, uint32_t, r)
-DEFINE_ld_st(4, uint32_t, b32, r, global, uintptr_t, l)
-DEFINE_ld_st(4, uint32_t, b32, r, shared, uint32_t, r)
-DEFINE_ld_st(8, uint64_t, b64, l, global, uintptr_t, l)
-DEFINE_ld_st(8, uint64_t, b64, l, shared, uint32_t, r)
-#undef DEFINE_ld_st
+DEFINE_ld_st__size(1, uint32_t, b8, r)
+DEFINE_ld_st__size(2, uint16_t, b16, h)
+DEFINE_ld_st__size(4, uint32_t, b32, r)
+DEFINE_ld_st__size(8, uint64_t, b64, l)
 
-#define DEFINE_ld_st_16(space, addr_cxx_ty, addr_reg_ty) \
+#undef DEFINE_ld_st__size_space
+#undef DEFINE_ld_st_gpu_relaxed__size // helper macro is only needed for the instantiations above
+#undef DEFINE_ld_st__size
+#define DEFINE_ld_st_16__space(space, addr_cxx_ty, addr_reg_ty) \
   template<> \
   __device__ __forceinline__ BytePack<16> ld_##space<16>(addr_cxx_ty addr) { \
     BytePack<16> ans; \
@@ -226,10 +255,23 @@ DEFINE_ld_st(8, uint64_t, b64, l, shared, uint32_t, r)
   __device__ __forceinline__ void st_##space<16>(addr_cxx_ty addr, BytePack<16> value) { \
     asm("st." #space ".v2.b64 [%0], {%1,%2};" :: #addr_reg_ty(addr), "l"(value.u64[0]), "l"(value.u64[1]) : "memory"); \
   }
-DEFINE_ld_st_16(global, uintptr_t, l)
-DEFINE_ld_st_16(shared, uint32_t, r)
-#undef DEFINE_ld_st_16
+DEFINE_ld_st_16__space(global, uintptr_t, l)
+DEFINE_ld_st_16__space(shared, uint32_t, r)
+#undef DEFINE_ld_st_16__space // undefine the macro under the name it is actually defined with
 
+template<>
+__device__ __forceinline__ BytePack<16> ld_relaxed_gpu_global<16>(uintptr_t addr) { // 16-byte load issued as one v2.b64 access
+  BytePack<16> ans;
+  asm volatile("ld." PTX_relaxed_gpu ".global.v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : "l"(addr) : "memory"); // volatile + "memory": the compiler must not merge, hoist or drop this load
+  return ans;
+}
+template<>
+__device__ __forceinline__ void st_relaxed_gpu_global<16>(uintptr_t addr, BytePack<16> value) { // 16-byte store issued as one v2.b64 access
+  asm volatile("st." PTX_relaxed_gpu ".global.v2.b64 [%0], {%1,%2};" :: "l"(addr), "l"(value.u64[0]), "l"(value.u64[1]) : "memory"); // PTX_relaxed_gpu is "relaxed.gpu" when __CUDA_ARCH__ >= 700, "volatile" otherwise
+}
+
+#undef PTX_relaxed_gpu
+
 ////////////////////////////////////////////////////////////////////////////////
 // Atomic load/store using c++ pointers.
 
@@ -247,6 +289,15 @@ __device__ __forceinline__ uint64_t ld_relaxed_sys_global(uint64_t *ptr) {
   #endif
   return ans;
 }
+__device__ __forceinline__ uint64_t ld_relaxed_gpu_global(uint64_t *ptr) { // 64-bit load of *ptr from global memory
+  uint64_t ans;
+  #if __CUDA_ARCH__ >= 700
+    asm volatile("ld.relaxed.gpu.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory"); // volatile + "memory": never merged, hoisted or dropped by the compiler
+  #else
+    asm volatile("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory"); // no relaxed.gpu qualifier below sm_70; fall back to a volatile load
+  #endif
+  return ans;
+}
 __device__ __forceinline__ uint64_t ld_acquire_sys_global(uint64_t *ptr) {
   uint64_t ans;
   #if __CUDA_ARCH__ >= 700
diff --git a/src/collectives/device/primitives.h b/src/device/primitives.h
similarity index 100%
rename from src/collectives/device/primitives.h
rename to src/device/primitives.h
diff --git a/src/collectives/device/prims_ll.h b/src/device/prims_ll.h
similarity index 99%
rename from src/collectives/device/prims_ll.h
rename to src/device/prims_ll.h
index 5389cc4fae..f341d6fb81 100644
--- a/src/collectives/device/prims_ll.h
+++ b/src/device/prims_ll.h
@@ -323,7 +323,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
   __device__  Primitives(
       const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
       void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
-      uint8_t connIndexRecv=0, uint8_t connIndexSend=0
+      uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, int stepSize_=0
     ):
     redOp(redOpArg),
     tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group),
diff --git a/src/collectives/device/prims_ll128.h b/src/device/prims_ll128.h
similarity index 99%
rename from src/collectives/device/prims_ll128.h
rename to src/device/prims_ll128.h
index cd50942bbe..43e01c485d 100644
--- a/src/collectives/device/prims_ll128.h
+++ b/src/device/prims_ll128.h
@@ -364,7 +364,7 @@ public:
   __device__ Primitives(
       const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
       void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
-      uint8_t connIndexRecv=0, uint8_t connIndexSend=0
+      uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, int stepSize_=0
     ):
     redOp(redOpArg),
     tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE),
diff --git a/src/collectives/device/prims_simple.h b/src/device/prims_simple.h
similarity index 79%
rename from src/collectives/device/prims_simple.h
rename to src/device/prims_simple.h
index 19cecf97b8..048052eef1 100644
--- a/src/collectives/device/prims_simple.h
+++ b/src/device/prims_simple.h
@@ -4,6 +4,8 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
+#include "network/unpack/unpack.h"
+
 template<typename T, typename RedOp, typename Fan, int Direct,
          int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, int MultimemSrcs, int MultimemDsts>
 class Primitives<
@@ -23,7 +25,11 @@ class Primitives<
                        DirectWrite = 0x200,
                        DirectRead = 0x400,
                        ThreadsSynced = 0x800,
-                       NvlsMinPolling = 0x1000;
+                       NvlsMinPolling = 0x1000,
+                       NetDeviceUnpack = 0x2000,
+                       AnyNetDeviceUnpack = 0x4000,
+                       NvlsDirectRead = 0x8000,
+                       NvlsDirectWrite = 0x10000;
   const int tid, tidInBlock;
   const int nthreads;
   int nworkers;
@@ -44,6 +50,8 @@ class Primitives<
   };
   uint64_t *connStepPtr;
   uint64_t connStepCache; // Cache last seen value of (*connStepPtr)
+  void*    mhandle;
+  void*    netDeviceHandle;
 
   // Don't use barrier 0 as it's used by the final sync
   __device__ void barrier() {
@@ -141,7 +149,7 @@ class Primitives<
       if (flags & OffsFifoEnabled)
         ptrs[index] = connEltsFifo + loadInt(connOffsFifoPtr + (step%NCCL_STEPS))/sizeof(T);
       else if (isSendNotRecv && DirectSend) {
-        if (flags & DirectWrite) {
+        if (flags & (DirectWrite | NvlsDirectWrite)) {
           ptrs[index] = directBuff + dstIx + offset;
         } else if (flags & DirectRead) {  // empty send
           ptrs[index] = nullptr;
@@ -149,7 +157,7 @@ class Primitives<
           ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
         }
       } else if (!isSendNotRecv && DirectRecv) {
-        if (flags & DirectRead) {
+        if (flags & (DirectRead | NvlsDirectRead)) {
           ptrs[index] = directBuff + srcIx + offset;
         } else if (flags & DirectWrite) {
           ptrs[index] = directBuff + dstIx + offset;  // send to next from my output buffer
@@ -160,6 +168,9 @@ class Primitives<
       else {
         ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
       }
+      if ((flags & (AnyNetDeviceUnpack)) && (flags & (Recv*RoleWaitRecv))) {
+        ncclNetDeviceIncrementHead(group);
+      }
       step += StepPerSlice;
     }
   }
@@ -229,7 +240,16 @@ class Primitives<
         /* if user abort the kernel, we don't need to actually perform copy/reduce; just set size
          * to 0 to avoid unnecessary workload. */
         int workSize = ncclShmem.aborted ? 0 : sliceSize;
-        if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
+        if (flags & AnyNetDeviceUnpack) {
+          ncclNetDeviceUnpack<Recv>(tid, tidInBlock, nworkers, group, ncclShmem.groups[group].devicePlugin.unpack.unpackNetDeviceIndexMask, Src, workSize);
+          // Sync here to make sure all workers are reading from the updated srcs
+          subBarrier();
+        }
+
+        if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]
+            /* NVLS can have srcs[0] == dsts[0], but we cannot enter this "if branch",
+             * so we need to check whether MultimemSrcs and MultimemDsts are 0. */
+            && MultimemSrcs == 0 && MultimemDsts == 0) {
           // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
           if (Send) {
             reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, MaxSend, /*PreOpSrcs*/0>
@@ -286,7 +306,7 @@ class Primitives<
   // shift: peer offset to avoid all ranks sending to or receiving from same peer
   template <int DirectRecv1, int DirectSend1, int Recv, int Send>
   __device__ __forceinline__ void
-  ScatterGatherOp(intptr_t inpIx, intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift, bool postOp) {
+  ScatterGatherOp(intptr_t inpIx, intptr_t outIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift, bool postOp) {
     constexpr int DirectRecv = 1 && Direct && DirectRecv1;
     constexpr int DirectSend = 1 && Direct && DirectSend1;
     int offset = 0; // slice offset
@@ -295,7 +315,7 @@ class Primitives<
 
     #pragma unroll
     for (int slice=0; slice<SlicePerChunk; ++slice) {
-      int realSize = max(0, min(dataSize, peerElem-offset));
+      ssize_t realSize = max(0, min(dataSize, peerElem-offset));
       bool fenceNeeded = false;
       if (tid < nworkers) {
         if (Send) {
@@ -309,11 +329,11 @@ class Primitives<
           // Loop over peers
           for (int j=0; j<fan.nsend(); j++) {
             int i = (j+shift)%fan.nsend();
-            int pOffset = i*peerOffset;
+            ssize_t pOffset = i*peerOffset;
             // Skip the data I am responsible of reducing myself
             if (skip >= 0 && i >= skip) pOffset += peerElem;
             void* src0 = (T*)ncclShmem.groups[group].srcs[0] + pOffset;
-            int realPeerSize = min(realSize, totalElem-pOffset);
+            ssize_t realPeerSize = min(realSize, totalElem-pOffset);
             if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) {
               reduceCopy<Unroll, RedOp, T, 0,1,1, 0,1,1, PreOpSrcs>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, &src0, 1, ncclShmem.groups[group].dsts+i, realPeerSize);
               // Mark for threadfence at the end
@@ -322,10 +342,10 @@ class Primitives<
           }
         } else if (Recv) {
           if (flags & RoleOutput) ncclShmem.groups[group].dsts[0] = userBuff + outIx + offset;
-          int pOffset = index*peerOffset;
+          ssize_t pOffset = index*peerOffset;
           if (skip >= 0 && index >= skip) pOffset += peerElem;
           // Adjust remote index with peer offset in case we are directly pulling from peer's output buffer
-          waitPeer<DirectRecv, 0, 1, 0, 0, 1>(outIx, outIx+pOffset, offset, realSize);
+          waitPeer<DirectRecv, 0, 1, 0, 0, 1>(outIx+pOffset, outIx+pOffset, offset, realSize);
           subBarrier();
           #pragma unroll
           for (int j=0; j<fan.nrecv(); j++) {
@@ -333,7 +353,7 @@ class Primitives<
             pOffset = i*peerOffset;
             if (skip >= 0 && i >= skip) pOffset += peerElem;
             void* dst0 = (T*)ncclShmem.groups[group].dsts[0] + pOffset;
-            int realPeerSize = min(realSize, totalElem-pOffset);
+            ssize_t realPeerSize = min(realSize, totalElem-pOffset);
             if (DirectRecv && ncclShmem.groups[group].srcs[i] == dst0) realPeerSize = 0;
             if (realPeerSize > 0) reduceCopy<Unroll, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/0>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, 1, ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize);
           }
@@ -348,6 +368,13 @@ class Primitives<
   __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) {
     if (flags & (RoleWaitRecv|RolePostRecv)) {
       auto *conn = &peer->recv[connIndex];
+      if (conn->netDeviceHandle.netDeviceType == NCCL_NET_DEVICE_UNPACK) {
+        // handle must be a device ptr
+        netDeviceHandle = conn->netDeviceHandle.handle;
+        // Cache the handle
+        ncclNetDeviceUnpackSetup(netDeviceHandle, group, index);
+        flags |= NetDeviceUnpack;
+      }
       step = conn->step;
       step = roundUp(step, SlicePerChunk*StepPerSlice);
       if (flags & RolePostRecv) {
@@ -377,6 +404,9 @@ class Primitives<
               // otherwise, in one-to-multi send, we could mix empty send and intermediate send
               flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
             }
+          } else if ((conn->flags & NCCL_NVLS_MIN_POLL) && e != nullptr && e->regUsed) {
+            /* NVLS direct */
+            flags |= NvlsDirectRead;
           }
         }
         if (flags & OffsFifoEnabled)
@@ -393,6 +423,7 @@ class Primitives<
       step = roundUp(step, SlicePerChunk*StepPerSlice);
       if (flags & RolePostSend) {
         connStepPtr = conn->tail;
+        connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
       }
       if (flags & RoleWaitSend) {
         ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
@@ -424,6 +455,9 @@ class Primitives<
               // otherwise, in one-to-multi send, we could mix empty send and intermediate send
               flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
             }
+          } else if ((conn->flags & NCCL_NVLS_MIN_POLL) && e != nullptr && e->regUsed) {
+            /* NVLS direct */
+            flags |= NvlsDirectWrite;
           }
         }
       }
@@ -434,10 +468,10 @@ class Primitives<
   __device__ Primitives(
       int tid, int nthreads, int const *recvPeers, int const *sendPeers,
       void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
-      uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr
+      uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr, int stepSize_=0
     ):
     tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group),
-    stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) {
+    stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) {
 
     // For send operations, we need an extra warp to overlap the threadfence and the copy
     this->nworkers = nthreads - (MaxSend > 0 && nthreads-WARP_SIZE >= 64 ? WARP_SIZE : 0);
@@ -473,6 +507,20 @@ class Primitives<
     loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e);
     loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e);
 
+    if (barrierAny(flags & NetDeviceUnpack)) {
+      flags |= AnyNetDeviceUnpack;
+      // g == 0 is the first ThreadPerSync # of threads of this warp
+      // g == 0 is also the RoleWaitRecv threads of this group, thus the thread ID will correlate to the peer index
+      if (g == 0) {
+        uint32_t mask = __ballot_sync((1U << ThreadPerSync) - 1, (flags & NetDeviceUnpack) ? 1 : 0);
+
+        // We only want to update the shared memory variable with a single thread
+        if (tid == 0) {
+          ncclShmem.groups[this->group].devicePlugin.unpack.unpackNetDeviceIndexMask = mask;
+        }
+      }
+    }
+
     setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e);
   }
 
@@ -485,8 +533,10 @@ class Primitives<
       auto *conns = (flags & RolePostSend) ? ncclShmem.groups[group].sendConns : ncclShmem.groups[group].recvConns;
       conns[index]->step = step;
     }
-    // Make sure all threads are done writing back conn->step and done using
-    // ncclShmem.groups[group]
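+    // The barrier() below makes sure all threads are done writing back conn->step and done using ncclShmem.groups[group]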
+    if ((flags & (AnyNetDeviceUnpack)) && (flags & (RoleWaitRecv))) {
+      ncclNetDeviceSaveHead(netDeviceHandle, group);
+    }
     barrier();
   }
 
@@ -497,33 +547,41 @@ class Primitives<
     }
     if (flags & RoleOutput) userBuff = (T*)outputBuf;
     bool recvProvider = flags == (flags|RoleWaitRecv|DirectWrite);
-    bool sendAcceptor = flags == (flags|RoleWaitSend|DirectWrite);
+    bool sendAcceptor = (flags == (flags|RoleWaitSend|DirectWrite)) || (flags == (flags|RoleWaitSend|NvlsDirectWrite));
     bool sendProvider = flags == (flags|RoleWaitSend|DirectRead); // sender provides direct buffer (to be fetched)
-    bool recvAcceptor = flags == (flags|RoleWaitRecv|DirectRead); // receiver accepts direct buffer
+    bool recvAcceptor = flags == (flags|RoleWaitRecv|DirectRead) || (flags == (flags|RoleWaitRecv|NvlsDirectRead)); // receiver accepts direct buffer
     int regUsed = e != nullptr ? e->elem.regUsed : 0;
 
     if (Direct && recvProvider) {
       int spins = 0;
       void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
       // Wait for consumer to consume previous value before trampling it.
-      while (*slot != nullptr && !checkAbort(spins));
-      directBuff = (T*)outputBuf;
-      // Encode pointer by XOR'ing against some address they definitely wouldn't send
-      // since we want to allow them sending us nullptr while not colliding with
-      // the empty slot value.
-      *slot = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(directBuff) ^ reinterpret_cast<uintptr_t>(slot));
+      if (slot) {
+        while (*slot != nullptr && !checkAbort(spins));
+        directBuff = (T*)outputBuf;
+        // Encode pointer by XOR'ing against some address they definitely wouldn't send
+        // since we want to allow them sending us nullptr while not colliding with
+        // the empty slot value.
+        *slot = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(directBuff) ^ reinterpret_cast<uintptr_t>(slot));
+      }
     }
     if (Direct && sendAcceptor) {
       int spins = 0;
       void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
       void *ptr;
-      while (true) {
+      while (slot) {
         ptr = *slot;
         if (ptr != nullptr || checkAbort(spins)) break;
       }
-      directBuff = regUsed ? (T*)(e->dnOutputs[index]) :
+
+      if (slot) {
+        directBuff = regUsed ? (T*)(e->dnOutputs[index]) :
                    reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(ptr) ^ reinterpret_cast<uintptr_t>(slot));
-      *slot = nullptr;
+        *slot = nullptr;
+      } else {
+        /* slot is NULL, it must be regUsed == 1 */
+        directBuff = (T*)e->dnOutputs[index];
+      }
     }
     if (Direct && sendProvider) {
       int spins = 0;
@@ -531,17 +589,19 @@ class Primitives<
       volatile uint64_t* argSlot0 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange;
       volatile uint64_t* argSlot1 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange+1;
       // Wait for consumer to consume previous value before trampling it.
-      while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 !=0) && !checkAbort(spins));
-      // If there is no recv, then we are directly pulling from input buffer (e.g. directScatter)
-      // Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend)
-      directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf;
-      // Exchange pre-scalers for use in direct pull
-      *argSlot0 = (uint64_t(1)<<32) | (uint32_t)redOpArg;
-      *argSlot1 = (uint64_t(1)<<32) | (uint32_t)(redOpArg>>32);
-      // Encode pointer by XOR'ing against some address they definitely wouldn't send
-      // since we want to allow them sending us nullptr while not colliding with
-      // the empty slot value.
-      *slot = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(directBuff) ^ reinterpret_cast<uintptr_t>(slot));
+      if (slot && argSlot0 && argSlot1) {
+        while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 !=0) && !checkAbort(spins));
+        // If there is no recv, then we are directly pulling from input buffer (e.g. directScatter)
+        // Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend)
+        directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf;
+        // Exchange pre-scalers for use in direct pull
+        *argSlot0 = (uint64_t(1)<<32) | (uint32_t)redOpArg;
+        *argSlot1 = (uint64_t(1)<<32) | (uint32_t)(redOpArg>>32);
+        // Encode pointer by XOR'ing against some address they definitely wouldn't send
+        // since we want to allow them sending us nullptr while not colliding with
+        // the empty slot value.
+        *slot = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(directBuff) ^ reinterpret_cast<uintptr_t>(slot));
+      }
     }
     if (Direct && recvAcceptor) {
       int spins = 0;
@@ -549,24 +609,29 @@ class Primitives<
       volatile uint64_t* argSlot0 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange;
       volatile uint64_t* argSlot1 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange+1;
       void *ptr;
-      while (true) {
+      while (slot) {
         ptr = *slot;
         if (ptr != nullptr || checkAbort(spins)) break;
       }
-      directBuff = regUsed ? (T*)(MaxSend == 0 ? e->upOutputs[index] : e->dnInputs[index]) :
-                   reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(ptr) ^ reinterpret_cast<uintptr_t>(slot));
-      if (MaxSend != 0) { // reduce group rather than gather group
-        // Store scalers for remote inputs
-        uint64_t arg0, arg1;
-        while (true) {
-          arg0 = *argSlot0;
-          arg1 = *argSlot1;
-          if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break;
+
+      if (slot && argSlot0 && argSlot1) {
+        directBuff = regUsed ? (T*)(MaxSend == 0 ? e->upOutputs[index] : e->dnInputs[index]) :
+          reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(ptr) ^ reinterpret_cast<uintptr_t>(slot));
+        if (MaxSend != 0) { // reduce group rather than gather group
+          // Store scalers for remote inputs
+          uint64_t arg0, arg1;
+          while (true) {
+            arg0 = *argSlot0;
+            arg1 = *argSlot1;
+            if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break;
+          }
+          ncclShmem.redOpArgs[1 + index] = ((arg1 & 0xffffffff) << 32) | (arg0 & 0xffffffff);
         }
-        ncclShmem.redOpArgs[1+index] = ((arg1 & 0xffffffff)<<32) | (arg0 & 0xffffffff);
+        *argSlot0 = 0; *argSlot1 = 0;
+        *slot = nullptr;
+      } else {
+        directBuff = (T*)e->dnInputs[index];
       }
-      *argSlot0 = 0; *argSlot1 = 0;
-      *slot = nullptr;
     }
   }
 
@@ -594,6 +659,9 @@ class Primitives<
   __device__ __forceinline__ void directRecv(intptr_t outIx, int eltN) {
     genericOp<1, 0, 1, 0, -1, Output>(-1, outIx, eltN, /*postOp=*/false);
   }
+  __device__ __forceinline__ void directRecvCopy(intptr_t inpIx, intptr_t outIx, int eltN) { // same genericOp instantiation as directRecv, but forwards inpIx instead of -1
+    genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, /*postOp=*/false);
+  }
 
   __device__ __forceinline__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
     genericOp<0, 0, 0, 1, Input, Output>(inpIx, outIx, eltN, postOp);
@@ -611,6 +679,9 @@ class Primitives<
   __device__ __forceinline__ void directRecvCopySend(intptr_t outIx, int eltN) {
     genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, false);
   }
+  __device__ __forceinline__ void directRecvDirectSend(intptr_t inpIx, intptr_t outIx, int eltN) { // like directRecvCopySend, but the last template argument is -1 rather than Output and inpIx is forwarded
+    genericOp<1, 1, 1, 1, -1, -1>(inpIx, outIx, eltN, /*postOp=*/false);
+  }
   __device__ __forceinline__ void recvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) {
     genericOp<0, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
   }
@@ -635,20 +706,20 @@ class Primitives<
   }
 
   __device__ __forceinline__ void
-  scatter(intptr_t inpIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
+  scatter(intptr_t inpIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift) {
     ScatterGatherOp<0, 0, 0, 1>(inpIx, -1, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
   }
   __device__ __forceinline__ void
-  directScatter(intptr_t inpIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
+  directScatter(intptr_t inpIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift) {
     ScatterGatherOp<0, 1, 0, 1>(inpIx, -1, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
   }
 
   __device__ __forceinline__ void
-  gather(intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift, bool postOp=false) {
+  gather(intptr_t outIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift, bool postOp=false) {
     ScatterGatherOp<0, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, postOp);
   }
   __device__ __forceinline__ void
-  directGather(intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
+  directGather(intptr_t outIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift) {
     ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
   }
 };
diff --git a/src/collectives/device/reduce.h b/src/device/reduce.h
similarity index 99%
rename from src/collectives/device/reduce.h
rename to src/device/reduce.h
index 0927037e93..627d9b119b 100644
--- a/src/collectives/device/reduce.h
+++ b/src/device/reduce.h
@@ -4,7 +4,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "devcomm.h"
+#include "device.h"
 #include "collectives.h"
 #include "primitives.h"
 
diff --git a/src/collectives/device/reduce_kernel.h b/src/device/reduce_kernel.h
similarity index 67%
rename from src/collectives/device/reduce_kernel.h
rename to src/device/reduce_kernel.h
index c1a39cedfe..66e9516cd3 100644
--- a/src/collectives/device/reduce_kernel.h
+++ b/src/device/reduce_kernel.h
@@ -12,6 +12,19 @@
 #include <limits>
 #include <type_traits>
 
+template<typename T> // Type trait: true only for half, __nv_bfloat16 (when available), float and double
+struct IsFloatingPoint: std::false_type {}; // primary template: false unless specialized below
+template<>
+struct IsFloatingPoint<half>: std::true_type {};
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+template<>
+struct IsFloatingPoint<__nv_bfloat16>: std::true_type {}; // only compiled when the bf16 types exist
+#endif
+template<>
+struct IsFloatingPoint<float>: std::true_type {};
+template<>
+struct IsFloatingPoint<double>: std::true_type {};
+
 ////////////////////////////////////////////////////////////////////////////////
 // The reduction function classes. All classes must:
 //  1. Expose the `EltType` typedef.
@@ -19,16 +32,21 @@
 //  3. Have constructor taking `uint64_t opArg`.
 
 template<typename T>
-struct FuncNull { using EltType = T; __device__ FuncNull(uint64_t opArg=0) {}; };
+struct FuncCopy { using EltType = T; __device__ FuncCopy(uint64_t opArg=0) {}; };
 template<typename T>
 struct FuncSum  { using EltType = T; __device__ FuncSum(uint64_t opArg=0) {}; };
 template<typename T>
 struct FuncProd { using EltType = T; __device__ FuncProd(uint64_t opArg=0) {}; };
 template<typename T>
-struct FuncMin  { using EltType = T; __device__ FuncMin(uint64_t opArg=0) {}; };
-template<typename T>
-struct FuncMax  { using EltType = T; __device__ FuncMax(uint64_t opArg=0) {}; };
-
+struct FuncMinMax { // single functor for both min and max; opArg selects which one
+  using EltType = T;
+  BytePack<sizeof(T)> xormask; // only used by integers
+  bool isMinNotMax; // only used by floats
+  __device__ FuncMinMax(uint64_t opArg=0) {
+    xormask.native = opArg; // XORed into both operands before the `<` compare in Apply_Reduce<FuncMinMax<T>, 1>
+    isMinNotMax = (opArg&1)==0; // low bit of opArg clear => min
+  }
+};
 template<typename T> struct FuncPreMulSum;
 template<typename T> struct FuncSumPostDiv;
 
@@ -127,8 +145,8 @@ struct Apply_Reduce {
 
 // Base case definitions (EltPerPack == 1)
 template<typename T>
-struct Apply_Reduce<FuncNull<T>, /*EltPerPack=*/1> {
-  __device__ static BytePack<sizeof(T)> reduce(FuncSum<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
+struct Apply_Reduce<FuncCopy<T>, /*EltPerPack=*/1> {
+  __device__ static BytePack<sizeof(T)> reduce(FuncCopy<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
     return a;
   }
 };
@@ -145,15 +163,9 @@ struct Apply_Reduce<FuncProd<T>, /*EltPerPack=*/1> {
   }
 };
 template<typename T>
-struct Apply_Reduce<FuncMin<T>, /*EltPerPack=*/1> {
-  __device__ static BytePack<sizeof(T)> reduce(FuncMin<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
-    return toPack<T>(min(fromPack<T>(a), fromPack<T>(b)));
-  }
-};
-template<typename T>
-struct Apply_Reduce<FuncMax<T>, /*EltPerPack=*/1> {
-  __device__ static BytePack<sizeof(T)> reduce(FuncMax<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
-    return toPack<T>(max(fromPack<T>(a), fromPack<T>(b)));
+struct Apply_Reduce<FuncMinMax<T>, /*EltPerPack=*/1> {
+  __device__ static BytePack<sizeof(T)> reduce(FuncMinMax<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
+    return (a.native ^ fn.xormask.native) < (b.native ^ fn.xormask.native) ? a : b;
   }
 };
 
@@ -161,57 +173,55 @@ struct Apply_Reduce<FuncMax<T>, /*EltPerPack=*/1> {
 template<>
 struct Apply_Reduce<FuncSum<uint8_t>, /*EltPerPack=*/4> {
   __device__ static BytePack<4> reduce(FuncSum<uint8_t> fn, BytePack<4> a, BytePack<4> b) {
-    constexpr uint32_t lo = 0x00ff00ff;
-    constexpr uint32_t hi = ~lo;
-    uint32_t x = a.u32;
-    uint32_t y = b.u32;
-    a.u32 = (((x&lo) + (y&lo))&lo) + (((x&hi) + (y&hi))&hi);
+    constexpr uint32_t even = 0x00ff00ffu;
+    uint32_t x = (a.native &  even) + (b.native &  even);
+    uint32_t y = (a.native & ~even) + (b.native & ~even);
+    //a.native = (x & even) | (y & ~even);
+    a.native = __byte_perm(x, y, 0x7250);
     return a;
   }
 };
+
 template<>
-struct Apply_Reduce<FuncSum<int8_t>, /*EltPerPack=*/4> {
-  __device__ static BytePack<4> reduce(FuncSum<int8_t> fn, BytePack<4> a, BytePack<4> b) {
-    return Apply_Reduce<FuncSum<uint8_t>, 4>::reduce(FuncSum<uint8_t>(), a, b);
+struct Apply_Reduce<FuncMinMax<uint8_t>, /*EltPerPack=*/4> {
+  __device__ static BytePack<4> reduce(FuncMinMax<uint8_t> fn, BytePack<4> a, BytePack<4> b) {
+    constexpr uint32_t ones = 0x01010101u;
+    constexpr uint32_t even = 0x00ff00ffu; // even byte mask
+    // Replicate xormask to all bytes
+    uint32_t x = fn.xormask.native * ones;
+    // Transform inputs by xormask
+    uint32_t ax = a.native ^ x;
+    uint32_t bx = b.native ^ x;
+    // Use 9-bit arithmetic to compute d=a-b
+    uint32_t d0 = (ax    & even) + (~bx      & even) + ones;
+    uint32_t d1 = (ax>>8 & even) + (~(bx>>8) & even) + ones;
+    // Move the sign bit of each 9-bit delta into the least-significant bit of its origin byte
+    //uint32_t s = (d0>>8 & ones & even) | (d1 & ones & ~even);
+    uint32_t s = __byte_perm(d0, d1, 0x7351) & ones;
+    // Broadcast least bit across whole byte
+    s *= 0xffu;
+    // Compose result by selecting bytes via: signbit(a-b)==1 ? a : b
+    a.native = (a.native & s) | (b.native & ~s);
+    return a;
   }
 };
 
-#if 300 <= __CUDA_ARCH__ && __CUDA_ARCH__ < 500
-  template<>
-  struct Apply_Reduce<FuncMin<uint8_t>, /*EltPerPack=*/4> {
-    __device__ static BytePack<4> reduce(FuncMin<uint8_t> fn, BytePack<4> a, BytePack<4> b) {
-      uint32_t z=0;
-      asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z));
-      return a;
-    }
-  };
-  template<>
-  struct Apply_Reduce<FuncMin<int8_t>, /*EltPerPack=*/4> {
-    __device__ static BytePack<4> reduce(FuncMin<int8_t> fn, BytePack<4> a, BytePack<4> b) {
-      int32_t z=0;
-      asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z));
-      return a;
-    }
-  };
-  template<>
-  struct Apply_Reduce<FuncMax<uint8_t>, /*EltPerPack=*/4> {
-    __device__ static BytePack<4> reduce(FuncMax<uint8_t> fn, BytePack<4> a, BytePack<4> b) {
-      uint32_t z=0;
-      asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z));
-      return a;
-    }
-  };
-  template<>
-  struct Apply_Reduce<FuncMax<int8_t>, /*EltPerPack=*/4> {
-    __device__ static BytePack<4> reduce(FuncMax<int8_t> fn, BytePack<4> a, BytePack<4> b) {
-      int32_t z=0;
-      asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z));
-      return a;
-    }
-  };
-#endif
+template<>
+struct Apply_Reduce<FuncProd<uint8_t>, /*EltPerPack=*/4> {
+  __device__ static BytePack<4> reduce(FuncProd<uint8_t> fn, BytePack<4> apack, BytePack<4> bpack) {
+    uint32_t a = apack.native;
+    uint32_t b = bpack.native;
+    uint32_t ab0 = (a*b) & 0xffu;
+    asm("mad.lo.u32 %0, %1, %2, %0;" : "+r"(ab0) : "r"(a&0xff00u), "r"(b&0xff00u));
+    uint32_t ab1;
+    asm("mul.hi.u32 %0, %1, %2;"     : "=r"(ab1) : "r"(a&0xff0000), "r"(b&0xff0000));
+    asm("mad.hi.u32 %0, %1, %2, %0;" : "+r"(ab1) : "r"(a&0xff000000u), "r"(b&0xff000000u));
+    apack.native = __byte_perm(ab0, ab1, 0x6420);
+    return apack;
+  }
+};
 
-#define SPECIALIZE_REDUCE(Fn, T, EltPerPack, Vec, expr_of_x_y) \
+#define SPECIALIZE_REDUCE(Fn, T, EltPerPack, Vec, expr_of_fn_x_y) \
   template<> \
   struct Apply_Reduce<Fn<T>, EltPerPack> { \
     __device__ __forceinline__ static BytePack<sizeof(Vec)> reduce( \
@@ -219,10 +229,13 @@ struct Apply_Reduce<FuncSum<int8_t>, /*EltPerPack=*/4> {
       ) { \
       Vec x = fromPack<Vec>(a); \
       Vec y = fromPack<Vec>(b); \
-      return toPack<Vec>(expr_of_x_y); \
+      return toPack<Vec>(expr_of_fn_x_y); \
     } \
   };
 
+SPECIALIZE_REDUCE(FuncMinMax, float, 1, float, fn.isMinNotMax ? fminf(x, y) : fmaxf(x, y))
+SPECIALIZE_REDUCE(FuncMinMax, double, 1, double, fn.isMinNotMax ? fmin(x, y) : fmax(x, y))
+
 #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
   SPECIALIZE_REDUCE(FuncSum, half, 1, half, __hadd(x, y))
   SPECIALIZE_REDUCE(FuncSum, half, 2, half2, __hadd2(x, y))
@@ -234,13 +247,10 @@ struct Apply_Reduce<FuncSum<int8_t>, /*EltPerPack=*/4> {
 #endif
 
 #if __CUDA_ARCH__ >= 800
-  SPECIALIZE_REDUCE(FuncMin, half, 1, half, __hmin(x, y))
-  SPECIALIZE_REDUCE(FuncMin, half, 2, half2, __hmin2(x, y))
-  SPECIALIZE_REDUCE(FuncMax, half, 1, half, __hmax(x, y))
-  SPECIALIZE_REDUCE(FuncMax, half, 2, half2, __hmax2(x, y))
+  SPECIALIZE_REDUCE(FuncMinMax, half, 1, half, fn.isMinNotMax ? __hmin(x, y) : __hmax(x, y))
+  SPECIALIZE_REDUCE(FuncMinMax, half, 2, half2, fn.isMinNotMax ? __hmin2(x, y) : __hmax2(x, y))
 #else
-  SPECIALIZE_REDUCE(FuncMin, half, 1, half, __float2half(fminf(__half2float(x), __half2float(y))))
-  SPECIALIZE_REDUCE(FuncMax, half, 1, half, __float2half(fmaxf(__half2float(x), __half2float(y))))
+  SPECIALIZE_REDUCE(FuncMinMax, half, 1, half, __float2half(fn.isMinNotMax ? fminf(__half2float(x), __half2float(y)) : fmaxf(__half2float(x), __half2float(y))))
 #endif
 
 #if defined(__CUDA_BF16_TYPES_EXIST__)
@@ -249,15 +259,12 @@ struct Apply_Reduce<FuncSum<int8_t>, /*EltPerPack=*/4> {
   SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 2, __nv_bfloat162, __hadd2(x, y))
   SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 1, __nv_bfloat16, __hmul(x, y))
   SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 2, __nv_bfloat162, __hmul2(x, y))
-  SPECIALIZE_REDUCE(FuncMin, __nv_bfloat16, 1, __nv_bfloat16, __hmin(x, y))
-  SPECIALIZE_REDUCE(FuncMin, __nv_bfloat16, 2, __nv_bfloat162, __hmin2(x, y))
-  SPECIALIZE_REDUCE(FuncMax, __nv_bfloat16, 1, __nv_bfloat16, __hmax(x, y))
-  SPECIALIZE_REDUCE(FuncMax, __nv_bfloat16, 2, __nv_bfloat162, __hmax2(x, y))
+  SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 1, __nv_bfloat16, fn.isMinNotMax ? __hmin(x, y) : __hmax(x, y))
+  SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 2, __nv_bfloat162, fn.isMinNotMax ? __hmin2(x, y) : __hmax2(x, y))
 #else
   SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(__bfloat162float(x) + __bfloat162float(y)))
   SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(__bfloat162float(x) * __bfloat162float(y)))
-  SPECIALIZE_REDUCE(FuncMin, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(fminf(__bfloat162float(x), __bfloat162float(y))))
-  SPECIALIZE_REDUCE(FuncMax, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(fmaxf(__bfloat162float(x), __bfloat162float(y))))
+  SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(fn.isMinNotMax ? fminf(__bfloat162float(x), __bfloat162float(y)) : fmaxf(__bfloat162float(x), __bfloat162float(y))))
 #endif
 #endif
 
@@ -479,19 +486,6 @@ struct Apply_PreOp<FuncPreMulSum<half>, /*EltPerPack=*/1> {
 ////////////////////////////////////////////////////////////////////////////////
 // FuncSumPostDiv
 
-template<typename T>
-struct IsFloatingPoint: std::false_type {};
-template<>
-struct IsFloatingPoint<half>: std::true_type {};
-#if defined(__CUDA_BF16_TYPES_EXIST__)
-template<>
-struct IsFloatingPoint<__nv_bfloat16>: std::true_type {};
-#endif
-template<>
-struct IsFloatingPoint<float>: std::true_type {};
-template<>
-struct IsFloatingPoint<double>: std::true_type {};
-
 template<typename T, bool IsFloating=IsFloatingPoint<T>::value>
 struct FuncSumPostDiv_IntOnly;
 
@@ -543,25 +537,44 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
 #define SIZEOF_BytePack_field_u64 8
 #define PTX_REG_BytePack_field_u64 "l"
 
-#define DEFINE_Apply_LoadMultimem(Fn, T, op, ptx_ty, pack_field) \
+#define DEFINE_Apply_LoadMultimem_sum(T, ptx_ty, pack_field) \
   template<> \
-  struct Apply_LoadMultimem<Fn<T>, SIZEOF_BytePack_field_##pack_field> { \
+  struct Apply_LoadMultimem<FuncSum<T>, SIZEOF_BytePack_field_##pack_field> { \
     static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \
-    __device__ static BytePack<PackSize> load(Fn<T> fn, uintptr_t addr) { \
+    __device__ static BytePack<PackSize> load(FuncSum<T> fn, uintptr_t addr) { \
       BytePack<PackSize> ans; \
-      asm("multimem.ld_reduce.relaxed.sys.global." #op "." #ptx_ty " %0, [%1];" \
+      asm("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
         : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
         : "l"(addr)); \
       return ans; \
     } \
   };
-#define DEFINE_Apply_LoadMultimem_v4(Fn, T, op, ptx_ty, pack_field) \
+#define DEFINE_Apply_LoadMultimem_minmax(T, ptx_ty, pack_field) \
   template<> \
-  struct Apply_LoadMultimem<Fn<T>, 4*(SIZEOF_BytePack_field_##pack_field)> { \
-    static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \
-    __device__ static BytePack<PackSize> load(Fn<T> fn, uintptr_t addr) { \
+  struct Apply_LoadMultimem<FuncMinMax<T>, SIZEOF_BytePack_field_##pack_field> { \
+    static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \
+    __device__ static BytePack<PackSize> load(FuncMinMax<T> fn, uintptr_t addr) { \
       BytePack<PackSize> ans; \
-      asm("multimem.ld_reduce.relaxed.sys.global." #op ".v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
+      if (fn.isMinNotMax) { \
+        asm("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
+          : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
+          : "l"(addr)); \
+      } else { \
+        asm("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
+          : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
+          : "l"(addr)); \
+      } \
+      return ans; \
+    } \
+  };
+
+#define DEFINE_Apply_LoadMultimem_sum_v4(T, ptx_ty, pack_field) \
+  template<> \
+  struct Apply_LoadMultimem<FuncSum<T>, 4*(SIZEOF_BytePack_field_##pack_field)> { \
+    static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \
+    __device__ static BytePack<PackSize> load(FuncSum<T> fn, uintptr_t addr) { \
+      BytePack<PackSize> ans; \
+      asm("multimem.ld_reduce.relaxed.sys.global.add.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
         : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
           "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
           "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
@@ -570,18 +583,61 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
       return ans; \
     } \
   };
-#define DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(Fn, T, op, ptx_ty, pack_field) \
-  DEFINE_Apply_LoadMultimem_v4(Fn, T, op, ptx_ty, pack_field) \
+#define DEFINE_Apply_LoadMultimem_minmax_v4(T, ptx_ty, pack_field) \
   template<> \
-  struct Apply_LoadMultimem<Fn<T>, sizeof(T)> { \
-    __device__ static BytePack<sizeof(T)> load(Fn<T> fn, uintptr_t addr) { \
+  struct Apply_LoadMultimem<FuncMinMax<T>, 4*(SIZEOF_BytePack_field_##pack_field)> { \
+    static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \
+    __device__ static BytePack<PackSize> load(FuncMinMax<T> fn, uintptr_t addr) { \
+      BytePack<PackSize> ans; \
+      if (fn.isMinNotMax) { \
+        asm("multimem.ld_reduce.relaxed.sys.global.min.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
+          : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
+            "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
+            "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
+            "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \
+          : "l"(addr)); \
+      } else { \
+        asm("multimem.ld_reduce.relaxed.sys.global.max.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
+          : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
+            "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
+            "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
+            "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \
+          : "l"(addr)); \
+      } \
+      return ans; \
+    } \
+  };
+
+#define DEFINE_Apply_LoadMultimem_sum_v4x2_and_subhalf(T, ptx_ty, pack_field) \
+  DEFINE_Apply_LoadMultimem_sum_v4(T, ptx_ty, pack_field) \
+  template<> \
+  struct Apply_LoadMultimem<FuncSum<T>, sizeof(T)> { \
+    __device__ static BytePack<sizeof(T)> load(FuncSum<T> fn, uintptr_t addr) { \
       BytePack<2*sizeof(T)> tmp; \
-      asm("multimem.ld_reduce.relaxed.sys.global." #op "." #ptx_ty " %0, [%1];" \
+      asm("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
         : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
         : "l"(addr & -uintptr_t(sizeof(T)))); \
       return tmp.half[(addr/sizeof(T))%2]; \
     } \
   };
+#define DEFINE_Apply_LoadMultimem_minmax_v4x2_and_subhalf(T, ptx_ty, pack_field) \
+  DEFINE_Apply_LoadMultimem_minmax_v4(T, ptx_ty, pack_field) \
+  template<> \
+  struct Apply_LoadMultimem<FuncMinMax<T>, sizeof(T)> { \
+    __device__ static BytePack<sizeof(T)> load(FuncMinMax<T> fn, uintptr_t addr) { \
+      BytePack<2*sizeof(T)> tmp; \
+      if (fn.isMinNotMax) { \
+        asm("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
+          : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
+          : "l"(addr & -uintptr_t(sizeof(T)))); \
+      } else { \
+        asm("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
+          : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
+          : "l"(addr & -uintptr_t(sizeof(T)))); \
+      } \
+      return tmp.half[(addr/sizeof(T))%2]; \
+    } \
+  };
 
 template<typename Fn, int BytePerPack>
 struct Apply_LoadMultimem {
@@ -598,46 +654,39 @@ struct Apply_LoadMultimem {
     static constexpr bool IsSum = std::is_same<Fn, FuncSum<T>>::value ||
                                   std::is_same<Fn, FuncPreMulSum<T>>::value ||
                                   std::is_same<Fn, FuncSumPostDiv<T>>::value;
-    static constexpr bool IsMinOrMax = std::is_same<Fn, FuncMin<T>>::value ||
-                                       std::is_same<Fn, FuncMax<T>>::value;
+    static constexpr bool IsMinMax = std::is_same<Fn, FuncMinMax<T>>::value;
     static constexpr bool IsFloat = IsFloatingPoint<T>::value;
     static constexpr int BigPackSize =
       IsFloat && IsSum && sizeof(T) < 8 ? 16 :
       IsFloat && IsSum ? 8 :
-      IsFloat && IsMinOrMax && sizeof(T)==2 ? 16 :
-      !IsFloat && (IsSum||IsMinOrMax) && sizeof(T)>=4 ? sizeof(T) :
+      IsFloat && IsMinMax && sizeof(T)==2 ? 16 :
+      !IsFloat && (IsSum||IsMinMax) && sizeof(T)>=4 ? sizeof(T) :
       /*multimem.ld_reduce not supported:*/ 0;
   };
 
-  DEFINE_Apply_LoadMultimem(FuncSum, uint32_t, add, u32, u32)
-  DEFINE_Apply_LoadMultimem(FuncMin, uint32_t, min, u32, u32)
-  DEFINE_Apply_LoadMultimem(FuncMax, uint32_t, max, u32, u32)
+  DEFINE_Apply_LoadMultimem_sum(uint32_t, u32, u32)
+  DEFINE_Apply_LoadMultimem_minmax(uint32_t, u32, u32)
 
-  DEFINE_Apply_LoadMultimem(FuncSum, int32_t, add, s32, u32)
-  DEFINE_Apply_LoadMultimem(FuncMin, int32_t, min, s32, u32)
-  DEFINE_Apply_LoadMultimem(FuncMax, int32_t, max, s32, u32)
+  DEFINE_Apply_LoadMultimem_sum(int32_t, s32, u32)
+  DEFINE_Apply_LoadMultimem_minmax(int32_t, s32, u32)
 
-  DEFINE_Apply_LoadMultimem(FuncSum, uint64_t, add, u64, u64)
-  DEFINE_Apply_LoadMultimem(FuncMin, uint64_t, min, u64, u64)
-  DEFINE_Apply_LoadMultimem(FuncMax, uint64_t, max, u64, u64)
+  DEFINE_Apply_LoadMultimem_sum(uint64_t, u64, u64)
+  DEFINE_Apply_LoadMultimem_minmax(uint64_t, u64, u64)
 
-  DEFINE_Apply_LoadMultimem(FuncSum, int64_t, add, u64, u64)
-  DEFINE_Apply_LoadMultimem(FuncMin, int64_t, min, s64, u64)
-  DEFINE_Apply_LoadMultimem(FuncMax, int64_t, max, s64, u64)
+  DEFINE_Apply_LoadMultimem_sum(int64_t, u64, u64)
+  DEFINE_Apply_LoadMultimem_minmax(int64_t, s64, u64)
 
-  DEFINE_Apply_LoadMultimem(FuncSum, float, add, f32, u32)
-  DEFINE_Apply_LoadMultimem_v4(FuncSum, float, add, f32, u32)
+  DEFINE_Apply_LoadMultimem_sum(float, f32, u32)
+  DEFINE_Apply_LoadMultimem_sum_v4(float, f32, u32)
 
-  DEFINE_Apply_LoadMultimem(FuncSum, double, add, f64, u64)
+  DEFINE_Apply_LoadMultimem_sum(double, f64, u64)
 
-  DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncSum, half, add, f16x2, u32)
-  DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMin, half, min, f16x2, u32)
-  DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMax, half, max, f16x2, u32)
+  DEFINE_Apply_LoadMultimem_sum_v4x2_and_subhalf(half, f16x2, u32)
+  DEFINE_Apply_LoadMultimem_minmax_v4x2_and_subhalf(half, f16x2, u32)
 
   #if defined(__CUDA_BF16_TYPES_EXIST__)
-    DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncSum, __nv_bfloat16, add, bf16x2, u32)
-    DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMin, __nv_bfloat16, min, bf16x2, u32)
-    DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMax, __nv_bfloat16, max, bf16x2, u32)
+    DEFINE_Apply_LoadMultimem_sum_v4x2_and_subhalf(__nv_bfloat16, bf16x2, u32)
+    DEFINE_Apply_LoadMultimem_minmax_v4x2_and_subhalf(__nv_bfloat16, bf16x2, u32)
   #endif
 #else
   template<typename Fn>
diff --git a/src/collectives/device/reduce_scatter.h b/src/device/reduce_scatter.h
similarity index 58%
rename from src/collectives/device/reduce_scatter.h
rename to src/device/reduce_scatter.h
index d2026e678c..6660cc0adc 100644
--- a/src/collectives/device/reduce_scatter.h
+++ b/src/device/reduce_scatter.h
@@ -4,7 +4,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "devcomm.h"
+#include "device.h"
 #include "collectives.h"
 #include "primitives.h"
 
@@ -98,33 +98,69 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
     const ssize_t chunkSize = int(args->lastChunkSize);
     const ssize_t size = args->count;
     const ssize_t loopSize = nChannels*chunkSize;
+    const int rank = ncclShmem.comm.rank;
+    const int nranks = ncclShmem.comm.nRanks;
 
-    const int nThreadsScatter = 128 + WARP_SIZE;
-    const int nThreadsReduce = 384;
+    /* If args->regUsed is set (direct NVLS), only 1 warp is allocated to scatter, for sync;
+     * otherwise 7 warps (nranks <= 6) or 5 warps (nranks > 6) are allocated to reduce to
+     * saturate bandwidth, and the remaining warps are allocated to scatter. */
+    const int nThreadsReduce = args->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : (nranks <= 6 ? 7 * WARP_SIZE : 5 * WARP_SIZE);
+    const int nThreadsScatter = args->regUsed ? WARP_SIZE : (NCCL_MAX_NTHREADS - nThreadsReduce);
     const int tidEndScatter = nThreadsScatter;
     const int tidEndReduce = tidEndScatter + nThreadsReduce;
 
-    using Proto = ProtoSimple<1, 1>;
-
-    if (tid < tidEndScatter) {
-      // Scatter
-      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
-        prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
-            args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0);
-      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + bid*chunkSize;
-        int nelem = min(chunkSize, size-offset);
-        prims.scatter(offset, nvls->nHeads*size, nelem, size, -1, 0);
+    if (!args->regUsed) {
+      if (tid < tidEndScatter) {
+        // Scatter
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
+        Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
+          prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
+            args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          ssize_t offset = gridOffset + bid * chunkSize;
+          int nelem = min(chunkSize, size - offset);
+          prims.scatter(offset, nvls->nHeads * size, nelem, size, -1, 0);
+        }
+      } else if (tid < tidEndReduce) {
+        // Reduce through NVLS
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
+        Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
+          prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff,
+            args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          ssize_t offset = gridOffset + bid * chunkSize;
+          int nelem = min(chunkSize, size - offset);
+          prims.recv(offset, nelem);
+        }
       }
-    } else if (tid < tidEndReduce) {
-      // Reduce through NVLS
-      Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
-        prims(tid-tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff,
-           args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
-      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + bid*chunkSize;
-        int nelem = min(chunkSize, size-offset);
-        prims.recv(offset, nelem);
+    } else {
+      if (tid < tidEndScatter) {
+        // Scatter
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
+        Primitives<T, RedOp, FanSymmetric<NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
+          prims(tid, nThreadsScatter, nvls->up, nvls->up, NULL, NULL,
+            args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          prims.scatter(0, 0, 0, 0, -1, 0);
+        }
+
+        /* gather used as sync */
+        prims.gather(0, 0, 0, 0, -1, 0);
+      } else if (tid < tidEndReduce) {
+        // Reduce through NVLS
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
+        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
+          prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->down, NULL, args->recvbuff,
+            args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          ssize_t outOffset = gridOffset + bid * chunkSize;
+          ssize_t inpOffset = outOffset + rank * size;
+          int nelem = min(chunkSize, size - outOffset);
+          prims.directRecvCopy(inpOffset, outOffset, nelem);
+        }
+
+        /* send for sync */
+        prims.send(0, 0);
       }
     }
   }
diff --git a/src/collectives/device/sendrecv.h b/src/device/sendrecv.h
similarity index 95%
rename from src/collectives/device/sendrecv.h
rename to src/device/sendrecv.h
index 42d9b550db..5401f0542c 100644
--- a/src/collectives/device/sendrecv.h
+++ b/src/device/sendrecv.h
@@ -4,7 +4,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "devcomm.h"
+#include "device.h"
 #include "collectives.h"
 #include "primitives.h"
 
@@ -26,7 +26,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
       if (args->proto == NCCL_PROTO_LL) chunkSize /= 2;
       int const peer = args->peer;
       Primitives<T, RedOp, FanAsymmetric<0, 1>, 1, Proto, 1> prims
-        (tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, 1, 1);
+        (tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, ncclShmem.comm.p2pChunkSize/sizeof(T));
       size_t offset = 0;
       do {
         int nelem = min(size_t(chunkSize), count-offset);
@@ -45,7 +45,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
       if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; // This is to account for chunkEffectiveSize
       int const peer = args->peer;
       Primitives<T, RedOp, FanAsymmetric<1, 0>, 1, Proto, 1> prims
-        (tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, 1, 1);
+        (tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, ncclShmem.comm.p2pChunkSize/sizeof(T));
       size_t offset = 0;
       do {
         int nelem = min(size_t(chunkSize), count-offset);
diff --git a/src/enqueue.cc b/src/enqueue.cc
index 43d0ba109a..dbb9865bcf 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -11,83 +11,16 @@
 #include "bootstrap.h"
 #include "channel.h"
 #include "cudawrap.h"
+#include "transport.h"
 
 #include <cstring> // std::memcpy
 #include <cinttypes> // PRIx64
 
-static void* const ncclKernelGeneric = (void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t);
-
-struct ncclKernelMatch {
-  void* kernelFn;
-  bool specialized;
-};
-
-// Only generate inline kernels for LL
-#define NCCL_FUNC5(func, algo, devredop, dtype, specialized) \
-  /*LL    */{(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), true && specialized}, \
-  /*LL128 */{(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), false && specialized}, \
-  /*SIMPLE*/{(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), false && specialized}
-
-#define NCCL_FUNC4(func, devredop, type, specialized) \
-  NCCL_FUNC5(func, TREE,           devredop, type, specialized), \
-  NCCL_FUNC5(func, RING,           devredop, type, specialized), \
-  NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, specialized), \
-  NCCL_FUNC5(func, COLLNET_CHAIN,  devredop, type, specialized), \
-  NCCL_FUNC5(func, NVLS,           devredop, type, specialized), \
-  NCCL_FUNC5(func, NVLS_TREE,      devredop, type, specialized)
-
-#ifdef __CUDA_BF16_TYPES_EXIST__
-  #define HAVE_BFLOAT16 1
-#else
-  #define HAVE_BFLOAT16 0
-#endif
-
-// Must be consistent with ncclDataType_t
-#define NCCL_FUNCS3(func, devredop, reduction, specialized) \
-  NCCL_FUNC4(func, devredop, MACRO_IF(reduction, int8_t, int8_t), specialized), \
-  NCCL_FUNC4(func, devredop, MACRO_IF(reduction, uint8_t, int8_t), specialized), \
-  NCCL_FUNC4(func, devredop, MACRO_IF(reduction, int32_t, int8_t), specialized), \
-  NCCL_FUNC4(func, devredop, MACRO_IF(reduction, uint32_t, int8_t), specialized), \
-  NCCL_FUNC4(func, devredop, MACRO_IF(reduction, int64_t, int8_t), specialized), \
-  NCCL_FUNC4(func, devredop, MACRO_IF(reduction, uint64_t, int8_t), specialized), \
-  NCCL_FUNC4(func, devredop, MACRO_IF(reduction, half, int8_t), specialized), \
-  NCCL_FUNC4(func, devredop, MACRO_IF(reduction, float, int8_t), specialized), \
-  NCCL_FUNC4(func, devredop, MACRO_IF(reduction, double, int8_t), specialized) \
-  MACRO_IF(HAVE_BFLOAT16, \
-    SINGLE_ARG(, NCCL_FUNC4(func, devredop, MACRO_IF(reduction, __nv_bfloat16, int8_t), specialized)), \
-    /*nothing*/ \
-  )
-
-// Must be consistent with ncclDevRedOp_t -- but we only generate kernel for sums.
-#define NCCL_FUNCS2(func, reduction) \
-  NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/1), /*Sum*/ \
-  NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*Prod*/ \
-  NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*Max*/ \
-  NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*Min*/ \
-  NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*PreMulSum*/ \
-  NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0)  /*SumPostDiv*/
-
-// Must be consistent with the ncclFuncSet enum
-static const ncclKernelMatch ncclKerns[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
-  {(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), true},
-  // We don't bake special kernels for the one-rank reductions
-  {/*int8*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
-  {/*uint8*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
-  {/*int32*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
-  {/*uint32*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
-  {/*int64*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
-  {/*uint64*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
-  {/*half*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
-  {/*float*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
-  {/*double*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
-  #if HAVE_BFLOAT16
-    {/*bfloat16*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
-  #endif
-  NCCL_FUNCS2(Broadcast, /*reduction=*/0),
-  NCCL_FUNCS2(Reduce, /*reduction=*/1),
-  NCCL_FUNCS2(AllGather, /*reduction=*/0),
-  NCCL_FUNCS2(ReduceScatter, /*reduction=*/1),
-  NCCL_FUNCS2(AllReduce, /*reduction=*/1)
+enum ncclRegBufferType {
+  NCCL_REGULAR_BUFFER = 0,
+  NCCL_IPC_REG_BUFFER = 1,
+  NCCL_NVLS_REG_BUFFER = 2,
+  NCCL_REG_BUFFER_NUM = 3
 };
 
 static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */);
@@ -96,19 +29,14 @@ NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0);
 
 // Returns maximum kernel stack size of all CUDA kernels
 ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) {
-  constexpr int KernelCount = sizeof(ncclKerns)/sizeof(ncclKerns[0]);
   ncclResult_t result = ncclSuccess;
 
   if (maxStackSize) *maxStackSize = 0;
   int carveout = ncclParamL1SharedMemoryCarveout();
 
-  // Keep track if we already visited a function pointer.
-  void* lru[2] = {nullptr, nullptr};
-  for (int i=0; i < KernelCount; i++) {
-    void* fn = ncclKerns[i].kernelFn;
-    if (fn == lru[0] || fn == lru[1]) goto next_kernel;
-    lru[1] = lru[0];
-    lru[0] = fn;
+  for (int k=0; k < ncclDevKernelCount; k++) {
+    void* fn = ncclDevKernelList[k];
+    if (fn == nullptr) continue;
 
     if (maxStackSize) {
       cudaFuncAttributes attr = {0};
@@ -116,14 +44,12 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) {
       if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes;
     ignore0:;
     }
-
     if (carveout) {
       CUDACHECKGOTO(cudaFuncSetAttribute(fn,
         cudaFuncAttributePreferredSharedMemoryCarveout, carveout),
         result, ignore1);
     ignore1:;
     }
-
     if (ncclShmemDynamicSize(cudaArch) != 0) {
       CUDACHECKGOTO(cudaFuncSetAttribute(fn,
         cudaFuncAttributeMaxDynamicSharedMemorySize, ncclShmemDynamicSize(cudaArch)),
@@ -218,7 +144,7 @@ static void appendWorkElemP2p(
     struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId,
     struct ncclWorkElemP2p const *elem, bool fuseOk
   ) {
-  constexpr int funcIndex = FUNC_INDEX_P2P;
+  int funcIndex = ncclDevFuncId_P2p();
   struct ncclKernelPlan::Channel* chan = &plan->channels[channelId];
   struct ncclWorkList* q = ncclIntruQueueTail(&chan->workQueue);
   if (q && funcIndex == q->work.header.funcIndex) {
@@ -240,7 +166,7 @@ static void appendWorkElemP2p(
   }
   q = ncclMemoryStackAlloc<struct ncclWorkList>(&comm->memScoped);
   q->work.header.type = ncclWorkTypeP2p;
-  q->work.header.funcIndex = FUNC_INDEX_P2P;
+  q->work.header.funcIndex = ncclDevFuncId_P2p();
   chan->p2pTailElem[ncclWorkP2pTypeRecv-1] = 0;
   chan->p2pTailElem[ncclWorkP2pTypeSend-1] = 1;
   q->work.p2pElems[chan->p2pTailElem[elem->p2pType-1]] = *elem; // C++ struct assignment
@@ -265,7 +191,7 @@ static ncclResult_t addProxyOpIfNeeded(struct ncclComm* comm, struct ncclKernelP
 static ncclResult_t addCollToPlan(
     struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget, int funcIndex,
     struct ncclWorkElem const* workElem, struct ncclProxyOp const* proxyOp,
-    int nCollChannels, int nBid, size_t bytes, bool regBufUsed, void* regBufSend[], void* regBufRecv[]
+    int nCollChannels, int nBid, size_t bytes, ncclRegBufferType regBufType, void* regBufSend[], void* regBufRecv[]
   ) {
   struct ncclKernelPlan::Channel *chans = plan->channels;
 
@@ -307,10 +233,9 @@ static ncclResult_t addCollToPlan(
 
     // Add work elem
     *nWorkBudget += chans[c].nWork;
-    if (!regBufUsed) {
+    if (regBufType == NCCL_REGULAR_BUFFER) {
       appendWorkElemColl(comm, plan, c, funcIndex, workElem, bid);
-    } else {
-      // Buffer registration in play which could only for CollNet at the moment.
+    } else if (regBufType == NCCL_IPC_REG_BUFFER) {
       struct ncclChannel* channel = &comm->channels[c];
       struct ncclWorkElemReg workElemReg;
       workElemReg.elem = *workElem; // C++ struct assignment
@@ -330,6 +255,18 @@ static ncclResult_t addCollToPlan(
         workElemReg.upOutputs[i] = regBufRecv[j];
       }
       appendWorkElemColl(comm, plan, c, funcIndex, &workElemReg, bid);
+    } else if (regBufType == NCCL_NVLS_REG_BUFFER) {
+      struct ncclWorkElemReg workElemReg;
+      workElemReg.elem = *workElem; // C++ struct assignment
+      workElemReg.elem.regUsed = 1;
+      /* NVLS only has one send and recv buffer registered */
+      workElemReg.dnInputs[0] = regBufSend[0];
+      workElemReg.dnOutputs[0] = regBufRecv[0];
+      appendWorkElemColl(comm, plan, c, funcIndex, &workElemReg, bid);
+    } else {
+      /* impossible value */
+      WARN("Invalid regBufType %d\n", regBufType);
+      return ncclInvalidArgument;
     }
     *nWorkBudget -= chans[c].nWork; // subtract delta of chans[c].nWork
 
@@ -417,68 +354,118 @@ static void finishPlan(struct ncclKernelPlan* plan) {
   plan->threadPerBlock = std::max(plan->threadPerBlock, 3*WARP_SIZE);
 }
 
+int64_t ncclParamLocalRegister();
+NCCL_PARAM(GraphRegister, "GRAPH_REGISTER", 1);
+
 static ncclResult_t registerIntraNodeBuffers(
     struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclInfo* info,
-    bool* outRegBufUsed,
     void* outRegBufSend[NCCL_MAX_LOCAL_RANKS],
-    void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS]
+    void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS],
+    ncclRegBufferType *outRegBufType
   ) {
-  *outRegBufUsed = false;
   ncclResult_t result = ncclSuccess;
 
+  *outRegBufType = NCCL_REGULAR_BUFFER;
 #if CUDART_VERSION >= 11030
-  int localRank = comm->localRank;
+  if ((info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) && comm->nvlsRegSupport) {
+    bool regBufUsed = false;
+    const void *sendbuff = info->sendbuff;
+    void *recvbuff = info->recvbuff;
+    cudaPointerAttributes sattr, rattr;
+    bool query = false;
+    
+    if (info->coll == ncclFuncAllGather)
+      sendbuff = NULL;
+    else if (info->coll == ncclFuncReduceScatter)
+      recvbuff = NULL;
 
-  if (CUPFN(cuMemGetAddressRange) == nullptr) return ncclSuccess;
+    /* first try local registration. */
+    if (ncclParamLocalRegister()) {
+      CUDACHECK(cudaPointerGetAttributes(&sattr, info->sendbuff));
+      CUDACHECK(cudaPointerGetAttributes(&rattr, info->recvbuff));
+      query = true;
+      if (sattr.type == cudaMemoryTypeDevice && rattr.type == cudaMemoryTypeDevice)
+        ncclNvlsLocalRegisterBuffer(comm, sendbuff, recvbuff, info->sendbuffSize, info->recvbuffSize, &regBufUsed, outRegBufSend, outRegBufRecv);
+    }
 
-  struct HandlePair {
-    cudaIpcMemHandle_t ipc[2]; // {send, recv}
-    size_t offset[2]; // {send, recv}
-  };
-  struct HandlePair handles[NCCL_MAX_LOCAL_RANKS];
+    if (regBufUsed == false && plan->persistent && ncclParamGraphRegister()) {
+      if (!query) {
+        CUDACHECK(cudaPointerGetAttributes(&sattr, info->sendbuff));
+        CUDACHECK(cudaPointerGetAttributes(&rattr, info->recvbuff));
+      }
+      if (sattr.type == cudaMemoryTypeDevice && rattr.type == cudaMemoryTypeDevice)
+        ncclNvlsGraphRegisterBuffer(comm, plan, sendbuff, recvbuff, info->sendbuffSize, info->recvbuffSize, &regBufUsed, outRegBufSend, outRegBufRecv);
+    }
 
-  CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[0], (void*)info->sendbuff), result, fallback);
-  CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[1], (void*)info->recvbuff), result, fallback);
+    if (regBufUsed) {
+      /* Tweak NVLS channel usage: with registered NVLS buffers, 4 channels (5 for
+       * ReduceScatter) are enough to saturate bandwidth. */
+      if (info->coll == ncclFuncReduceScatter)
+        info->nChannels = std::min(5, comm->nvlsChannels);
+      else
+        info->nChannels = std::min(4, comm->nvlsChannels);
+      *outRegBufType = NCCL_NVLS_REG_BUFFER;
+    }
+  } else if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT &&   // limited to CollNetDirect for now
+    comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other
+    comm->intraRanks < comm->localRanks &&  // only with inter-process & intra-node peers
+    plan->persistent && 0) {
+    /* Disable CollnetDirect registration since it does not support cuMem* allocated memory. */
+    int localRank = comm->localRank;
+    cudaPointerAttributes sattr, rattr;
 
-  void *baseSend, *baseRecv;
-  size_t size;
-  CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &size, (CUdeviceptr)info->sendbuff));
-  handles[localRank].offset[0] = (char*)info->sendbuff - (char*)baseSend;
-  CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &size, (CUdeviceptr)info->recvbuff));
-  handles[localRank].offset[1] = (char*)info->recvbuff - (char*)baseRecv;
+    CUDACHECK(cudaPointerGetAttributes(&sattr, info->sendbuff));
+    CUDACHECK(cudaPointerGetAttributes(&rattr, info->recvbuff));
+    if (sattr.type != cudaMemoryTypeDevice || rattr.type != cudaMemoryTypeDevice) return ncclSuccess;
 
-  NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, handles, sizeof(struct HandlePair)));
+    if (CUPFN(cuMemGetAddressRange) == nullptr) return ncclSuccess;
 
-  // Open handles locally
-  for (int i=0; i < comm->localRanks; i++) {
-    if (i == localRank) { // Skip self
-      outRegBufSend[i] = nullptr;
-      outRegBufRecv[i] = nullptr;
-    } else {
-      for (int sr=0; sr < 2; sr++) {
-        // Get base address of mapping
-        void* base;
-        CUDACHECK(cudaIpcOpenMemHandle(&base, handles[i].ipc[sr], cudaIpcMemLazyEnablePeerAccess));
-        // Get real buffer address by adding offset in the mapping
-        (sr==0 ? outRegBufSend : outRegBufRecv)[i] = (char*)base + handles[i].offset[sr];
-        // Enqueue reminder to close memory handle
-        struct ncclPointerList* q = ncclMemoryPoolAlloc<struct ncclPointerList>(&comm->memPool_ncclPointerList, &comm->memPermanent);
-        q->ptr = base;
-        ncclIntruQueueEnqueue(&plan->ipcMemQueue, q);
+    struct HandlePair {
+      cudaIpcMemHandle_t ipc[2]; // {send, recv}
+      size_t offset[2]; // {send, recv}
+    };
+    struct HandlePair handles[NCCL_MAX_LOCAL_RANKS];
+
+    CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[0], (void*)info->sendbuff), result, fallback);
+    CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[1], (void*)info->recvbuff), result, fallback);
+
+    void *baseSend, *baseRecv;
+    size_t size;
+    CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &size, (CUdeviceptr)info->sendbuff));
+    handles[localRank].offset[0] = (char*)info->sendbuff - (char*)baseSend;
+    CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &size, (CUdeviceptr)info->recvbuff));
+    handles[localRank].offset[1] = (char*)info->recvbuff - (char*)baseRecv;
+
+    NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, handles, sizeof(struct HandlePair)));
+
+    // Open handles locally
+    for (int i=0; i < comm->localRanks; i++) {
+      if (i == localRank) { // Skip self
+        outRegBufSend[i] = nullptr;
+        outRegBufRecv[i] = nullptr;
+      } else {
+        for (int sr=0; sr < 2; sr++) {
+          // Get base address of mapping
+          void* base;
+          CUDACHECK(cudaIpcOpenMemHandle(&base, handles[i].ipc[sr], cudaIpcMemLazyEnablePeerAccess));
+          // Get real buffer address by adding offset in the mapping
+          (sr==0 ? outRegBufSend : outRegBufRecv)[i] = (char*)base + handles[i].offset[sr];
+          // Enqueue reminder to close memory handle
+          struct ncclPointerList* q = ncclMemoryPoolAlloc<struct ncclPointerList>(&comm->memPool_ncclPointerList, &comm->memPermanent);
+          q->ptr = base;
+          ncclIntruQueueEnqueue(&plan->ipcMemQueue, q);
+        }
       }
     }
+    *outRegBufType = NCCL_IPC_REG_BUFFER;
   }
-  *outRegBufUsed = true;
-
 fallback:
 #endif
   return result;
 }
 
-NCCL_PARAM(GraphRegister, "GRAPH_REGISTER", 0);
-
-static ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetTypeSupport);
-static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, int numPipeOps);
+static ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetSupport);
+static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetSupport, int nvlsSupport, int numPipeOps);
 
 static ncclResult_t scheduleCollTasksToPlan(
     struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget
@@ -517,6 +504,7 @@ static ncclResult_t scheduleCollTasksToPlan(
     int nAggChannels = 0;
     int nAggOps = 1;
     struct ncclTaskColl* aggEnd = head->next;
+    int nvlsSupport = comm->nvlsSupport && ncclNvlsSupported(aggInfo.opFull.op, aggInfo.datatype);
     int collNetSupport = 0;
     NCCLCHECK(getCollNetSupport(&aggInfo, &collNetSupport));
 
@@ -537,7 +525,7 @@ static ncclResult_t scheduleCollTasksToPlan(
       NCCLCHECK(ncclInfoSetDerived(&aggInfo, comm->nRanks));
       aggInfo.nChannels = std::min(comm->nChannels, nAggChannels);
       int opPerChannel = DIVUP(nAggChannels, aggInfo.nChannels);
-      NCCLCHECK(getAlgoInfo(&aggInfo, collNetSupport, opPerChannel));
+      NCCLCHECK(getAlgoInfo(&aggInfo, collNetSupport, nvlsSupport, opPerChannel));
     }
 
     while (head != aggEnd) {
@@ -566,23 +554,26 @@ static ncclResult_t scheduleCollTasksToPlan(
       int workFuncIndex;
       struct ncclWorkElem workElem = {};
       struct ncclProxyOp proxyOp = {};
-      NCCLCHECK(computeColl(&info, &workFuncIndex, &workElem, &proxyOp));
+      // Check whether algo and proto have been preset (as in aggregation case)
+      // If so, skip the calculation
+      if (info.nChannels <= 0 || info.nThreads <= 0) {
+        NCCLCHECK(getAlgoInfo(&info, collNetSupport, nvlsSupport, 1));
+      }
 
       if (*nWorkBudget < info.nChannels) return ncclSuccess; // Ensure room for addCollToPlan()
 
-      bool regBufUsed = false;
+      /* if possible, start registration  */
+      ncclRegBufferType regBufType = NCCL_REGULAR_BUFFER;
       void* regBufSend[NCCL_MAX_LOCAL_RANKS];
       void* regBufRecv[NCCL_MAX_LOCAL_RANKS];
-      if (plan->persistent && ncclParamGraphRegister() &&
-          info.algorithm == NCCL_ALGO_COLLNET_DIRECT &&   // limited to CollNetDirect for now
-          comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other
-          comm->intraRanks < comm->localRanks) { // only with inter-process & intra-node peers
-        NCCLCHECK(registerIntraNodeBuffers(comm, plan, &info, &regBufUsed, regBufSend, regBufRecv));
-      }
+
+      registerIntraNodeBuffers(comm, plan, &info, regBufSend, regBufRecv, &regBufType);
+
+      NCCLCHECK(computeColl(&info, &workFuncIndex, &workElem, &proxyOp));
 
       int maxChannels = info.algorithm == NCCL_ALGO_NVLS || aggInfo.algorithm == NCCL_ALGO_NVLS_TREE ? comm->nvlsChannels : comm->nChannels;
       NCCLCHECK(addCollToPlan(comm, plan, nWorkBudget, workFuncIndex, &workElem, &proxyOp,
-        maxChannels, info.nChannels, info.nBytes, regBufUsed, regBufSend, regBufRecv));
+        maxChannels, info.nChannels, info.nBytes, regBufType, regBufSend, regBufRecv));
       tasks->nTasksColl -= 1;
       tasks->collBytesTotal -= info.nBytes;
       ncclIntruQueueDequeue(&tasks->collQueue);
@@ -590,8 +581,8 @@ static ncclResult_t scheduleCollTasksToPlan(
 
       plan->threadPerBlock = std::max(plan->threadPerBlock, info.nThreads);
       if (!plan->kernelSpecialized) {
-        plan->kernelFn = ncclKerns[workFuncIndex].kernelFn;
-        plan->kernelSpecialized = ncclKerns[workFuncIndex].specialized;
+        plan->kernelFn = ncclDevKernelForFunc[workFuncIndex];
+        plan->kernelSpecialized = ncclDevKernelForFuncIsSpecialized[workFuncIndex];
       }
     }
   }
@@ -619,8 +610,8 @@ static ncclResult_t scheduleP2pTasksToPlan(
 
   plan->threadPerBlock = std::max(plan->threadPerBlock, NCCL_MAX_NTHREADS);
   if (!plan->kernelSpecialized) {
-    plan->kernelFn = ncclKerns[FUNC_INDEX_P2P].kernelFn;
-    plan->kernelSpecialized = ncclKerns[FUNC_INDEX_P2P].specialized;
+    plan->kernelFn = ncclDevKernelForFunc[ncclDevFuncId_P2p()];
+    plan->kernelSpecialized = ncclDevKernelForFuncIsSpecialized[ncclDevFuncId_P2p()];
   }
 
   // Compute how much to split operations
@@ -893,6 +884,13 @@ static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback*
       CUDACHECKIGNORE(cudaIpcCloseMemHandle(q->ptr));
       ncclMemoryPoolFree(&comm->memPool_ncclPointerList, q);
     }
+    /* free mcHandle */
+    while (!ncclIntruQueueEmpty(&plan->nvlsMcHandleQueue)) {
+      struct ncclNvlsMcHandleList* obj = ncclIntruQueueDequeue(&plan->nvlsMcHandleQueue);
+      NCCLCHECK(ncclNvlsDeregBuffer(&obj->mcHandle, obj->ptr, obj->dev, obj->size));
+      INFO(NCCL_NVLS, "rank %d - deregistered buffer %p on device %d, size %ld", comm->rank, (void*)obj->ptr, obj->dev, obj->size);
+      ncclMemoryPoolFree(&comm->memPool_ncclNvlsHandleList, obj);
+    }
   }
   ncclMemoryPoolTakeAll(&comm->memPool_ncclProxyOp, &plan->memPool_ncclProxyOp);
   ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan);
@@ -1142,45 +1140,64 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
 /* Enqueueing system : computation of kernel and proxy operations parameters */
 /*****************************************************************************/
 
-static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetTypeSupport) {
+static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetSupport) {
   // Translate ncclAvg and PreMulSum
   ncclRedOp_t netOp = info->op == ncclAvg || info->op >= ncclNumOps ? ncclSum : info->op;
-  *collNetTypeSupport = info->comm->collNetSupportMatrix[netOp][info->datatype];
+  *collNetSupport = info->comm->collNetSupport && info->comm->collNetSupportMatrix[netOp][info->datatype];
   return ncclSuccess;
 }
 
 // numPipeOps: number of pipelined ops. Can be greater than 1 in aggregation mode. Used to adjust latency.
-static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, int numPipeOps) {
+static ncclResult_t topoGetAlgoInfo(struct ncclInfo* info, int collNetSupport, int nvlsSupport, int numPipeOps) {
   struct ncclComm* comm = info->comm;
   if (comm->nRanks == 1) {
     info->algorithm = NCCL_ALGO_RING;
     info->protocol = NCCL_PROTO_SIMPLE;
   }
-  else {
+  else if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) {
     float minTime = 3600000000.0; // Hopefully no operation will take an hour to complete.
+    float backupMinTime = 3600000000.0;
+    bool backup = false;
+    int backupAlgo = NCCL_ALGO_UNDEF; // back up algo and proto if no algo/proto is picked up.
+    int backupProto = NCCL_PROTO_UNDEF;
     // Find algorithm / protocol.
     info->algorithm = -1;
     info->protocol = -1;
     int nAlgos = NCCL_NUM_ALGORITHMS;
     for (int a=0; a<nAlgos; a++) {
-      if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetTypeSupport != 1) continue;
-      if (a == NCCL_ALGO_NVLS && !NCCL_NVLS_SUPPORTS(info->datatype, info->opFull.op)) continue;
-      if (a == NCCL_ALGO_NVLS && collNetTypeSupport != 1 && comm->nNodes > 1) continue;
-      if (a == NCCL_ALGO_NVLS_TREE && !NCCL_NVLS_SUPPORTS(info->datatype, info->opFull.op)) continue;
+      if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetSupport != 1) continue;
+      if (a == NCCL_ALGO_NVLS && nvlsSupport != 1 && info->coll != ncclFuncAllGather) continue;
+      if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue;
+      /* For now, NVLS allgather and reducescatter are only supported on a single node. */
+      if (a == NCCL_ALGO_NVLS && (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) && comm->nNodes > 1) continue;
+      if (a == NCCL_ALGO_NVLS_TREE && nvlsSupport != 1) continue;
 
       for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
         float time;
-        NCCLCHECK(ncclTopoGetAlgoTime(info, a, p, numPipeOps, &time));
-        if (time >= 0 && time < minTime) {
-          info->algorithm = a;
-          info->protocol = p;
-          minTime = time;
+        NCCLCHECK(ncclTopoGetAlgoTime(info, a, p, numPipeOps, &time, &backup));
+        if (!backup) {
+          if (time >= 0 && time < minTime) {
+            info->algorithm = a;
+            info->protocol = p;
+            minTime = time;
+          }
+        } else {
+          if (time >= 0 && time < backupMinTime) {
+            backupAlgo = a;
+            backupProto = p;
+            backupMinTime = time;
+          }
         }
       }
     }
-    if (info->algorithm == -1 || info->protocol == -1) {
-      WARN("Error : no algorithm/protocol available");
-      return ncclInternalError;
+
+    if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) {
+      if (backupAlgo == NCCL_ALGO_UNDEF || backupProto == NCCL_PROTO_UNDEF) {
+        WARN("Error : no algorithm/protocol available");
+        return ncclInternalError;
+      }
+      info->algorithm = backupAlgo;
+      info->protocol = backupProto;
     }
     //if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
     TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
@@ -1222,6 +1239,25 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
   return ncclSuccess;
 }
 
+// Use the default topo-based tuner if tuner plugin is not successful.
+// Call the plugin first. Let it set algo+proto, and/or nChannels.
+// Then, topoGetAlgoInfo will set algo/proto if not set, then nChannels and nThreads based on algo/proto.
+// Finally, nChannels will be overridden by the plugin setting.
+static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetSupport, int nvlsSupport, int numPipeOps) {
+  info->algorithm = NCCL_ALGO_UNDEF;
+  info->protocol = NCCL_PROTO_UNDEF;
+  int nChannels = 0;
+  if (info->comm->tuner != NULL) {
+    NCCLCHECK(info->comm->tuner->getCollInfo(
+          info->coll, info->nBytes,
+          collNetSupport, nvlsSupport, numPipeOps,
+          &info->algorithm, &info->protocol, &nChannels));
+  }
+  NCCLCHECK(topoGetAlgoInfo(info, collNetSupport, nvlsSupport, numPipeOps));
+  if (nChannels) info->nChannels = nChannels; // Set by plugin; override default.
+  return ncclSuccess;
+}
+
 static ncclResult_t getPatternInfo(struct ncclInfo* info) {
   switch (info->coll) {
     case ncclFuncBroadcast:
@@ -1275,14 +1311,6 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
 }
 
 static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */) {
-  int collNetTypeSupport = 0;
-  // Check whether algo and proto have been preset (as in aggregation case)
-  // If so, skip the calculation
-  if (info->nChannels > 0 && info->nThreads > 0) goto comp_next;
-  NCCLCHECK(getCollNetSupport(info, &collNetTypeSupport));
-  NCCLCHECK(getAlgoInfo(info, collNetTypeSupport, 1));
-
-comp_next:
   // Set nstepsPerLoop and nchunksPerLoop
   NCCLCHECK(getPatternInfo(info));
   NCCLCHECK(getLoopInfo(info));
@@ -1295,14 +1323,7 @@ comp_next:
   work->nWarps = info->nThreads / WARP_SIZE;
   work->redOpArg = info->opFull.scalarArg;
   work->redOpArgIsPtr = info->opFull.scalarArgIsPtr;
-
-  if (info->comm->nRanks == 1) {
-    // one-rank reduce index
-    *workFuncIndex = 1 + int(info->datatype);
-    return ncclSuccess;
-  }
-
-  *workFuncIndex = FUNC_INDEX(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol);
+  *workFuncIndex = ncclDevFuncId(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol);
 
   int stepSize   = info->comm->buffSizes[info->protocol]/NCCL_STEPS;
   int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1;
@@ -1337,6 +1358,7 @@ comp_next:
     work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
   } else if (info->algorithm == NCCL_ALGO_NVLS) {
     int maxChunkSize = 131072;
+    if (info->comm->nNodes > 1 && info->comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768;
     if (chunkSize > maxChunkSize) chunkSize = maxChunkSize;
     // Use uint64_t so that concurrentOps*chunkSize*X does not overflow
     uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads;
@@ -1347,6 +1369,7 @@ comp_next:
   } else if (info->algorithm == NCCL_ALGO_NVLS_TREE) {
     // Use uint64_t so that concurrentOps*chunkSize*X does not overflow
     uint64_t concurrentOps = info->nChannels*info->comm->channels[0].nvls.nHeads;
+    if (info->comm->nNodes >= 4) chunkSize = 65536;
     if ((info->nBytes < (32 * (concurrentOps*chunkSize))) && (chunkSize > 262144)) chunkSize = 262144;
     if ((info->nBytes < (16 * (concurrentOps*chunkSize))) && (chunkSize > 131072)) chunkSize = 131072;
     if ((info->nBytes < (4 * (concurrentOps*chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
@@ -1381,7 +1404,7 @@ comp_next:
   proxyOp->protocol = info->protocol;
   proxyOp->dtype = info->datatype;
   proxyOp->redOp = info->opFull.op==ncclDevPreMulSum || info->opFull.op==ncclDevSumPostDiv ? ncclSum : // Network sees avg as sum
-                     info->op;
+                     info->opFull.proxyOp;
   proxyOp->pattern = info->pattern;
   proxyOp->root = info->root;
   // This is used by P2P to reduce the receive buffer size. We don't use it in collectives
@@ -1399,27 +1422,37 @@ static ncclResult_t hostToDevRedOp(
     ncclDevRedOpFull *opFull, ncclRedOp_t op, ncclDataType_t datatype, ncclComm *comm
   ) {
   union {
-    int8_t i8;
-    uint8_t u8;
-    int32_t i32;
-    uint32_t u32;
-    int64_t i64;
-    uint64_t u64;
-    half f16;
+    int8_t   i8; uint8_t   u8;
+    int32_t i32; uint32_t u32;
+    int64_t i64; uint64_t u64;
+    half f16; float f32; double f64;
     #if defined(__CUDA_BF16_TYPES_EXIST__)
       __nv_bfloat16 bf16;
     #endif
-    float f32;
-    double f64;
     void *ptr;
   };
   u64 = 0;
   opFull->scalarArgIsPtr = false;
+  opFull->proxyOp = op;
+
+  int nbits = 8*ncclTypeSize(datatype);
+  uint64_t allBits = uint64_t(-1)>>(64-nbits);
+  uint64_t signBit = allBits^(allBits>>1);
+
   switch (int(op)) {
   case ncclSum:  opFull->op = ncclDevSum;  break;
   case ncclProd: opFull->op = ncclDevProd; break;
-  case ncclMax:  opFull->op = ncclDevMax;  break;
-  case ncclMin:  opFull->op = ncclDevMin;  break;
+  case ncclMin:
+  case ncclMax:
+    opFull->op = ncclDevMinMax;
+    opFull->scalarArg = 0;
+    // The xormask used by ncclFuncMinMax<[u]int> is the XOR of the sign bit
+    // for signed (opposed to unsigned) types and all the bits for max (opposed to min).
+    if (datatype==ncclInt8 || datatype==ncclInt32 || datatype==ncclInt64) {
+      opFull->scalarArg ^= signBit;
+    }
+    opFull->scalarArg ^= (op == ncclMax) ? allBits : 0;
+    break;
   case ncclAvg:
     switch ((int)datatype) {
     case ncclInt8:  case ncclInt32:  case ncclInt64:
@@ -1513,12 +1546,8 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo const* inf
     struct ncclDevRedOpFull opFull;
     NCCLCHECK(hostToDevRedOp(&opFull, info->op, info->datatype, comm));
 
-    // User-defined reduction ops may need alter the data even for unitary reductions
-    if (comm->nRanks == 1 && opFull.op < ncclDevPreMulSum) {
-      if (info->sendbuff != info->recvbuff) {
-        size_t bytes = info->count*ncclTypeSize(info->datatype);
-        CUDACHECK(cudaMemcpyAsync(info->recvbuff, info->sendbuff, bytes, cudaMemcpyDeviceToDevice, info->stream));
-      }
+    if (comm->nRanks == 1) {
+      NCCLCHECK(ncclLaunchOneRank(info->recvbuff, info->sendbuff, info->count, opFull, info->datatype, info->stream));
       return ncclSuccess;
     } else {
       // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
diff --git a/src/graph/connect.cc b/src/graph/connect.cc
index a71045e628..5af0020eda 100644
--- a/src/graph/connect.cc
+++ b/src/graph/connect.cc
@@ -370,13 +370,18 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
       treeToParent[c*nNodes+n] = allTopoRanks[r]->treeToParent[c];
       treeToChild0[c*nNodes+n] = allTopoRanks[r]->treeToChild0[c];
       treeToChild1[c*nNodes+n] = allTopoRanks[r]->treeToChild1[c];
-      nvlsHeads[c*nNodes+n] = allTopoRanks[r]->nvlsHeads[c];
     }
     for (int r=0; r<nranks; r++) {
       ringPrev[c*nranks+r] = allTopoRanks[r]->ringPrev[c];
       ringNext[c*nranks+r] = allTopoRanks[r]->ringNext[c];
     }
   }
+  for (int c=0; c<graphs[NCCL_ALGO_NVLS]->nChannels; c++) {
+    for (int n=0; n<nNodes; n++) {
+      int r = firstRanks[n];
+      nvlsHeads[c*nNodes+n] = allTopoRanks[r]->nvlsHeads[c];
+    }
+  }
 
   // Connect rings and trees. This should also duplicate the channels.
   NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext));
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index 450ba658f1..42be5919ed 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -70,7 +70,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
         if ((remPath->bw == 0 || remPath->count > path->count) && remPath->bw < bw) {
           // Find reverse link
           for (int l=0; l<remNode->nlinks; l++) {
-            if (remNode->links[l].remNode == node) {
+            if (remNode->links[l].remNode == node && remNode->links[l].type == link->type) {
               remPath->list[0] = remNode->links+l;
               break;
             }
@@ -126,7 +126,7 @@ static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* n
       for (int i=0; i<node->paths[t][n].count; i++) {
         struct ncclTopoLink* link = node->paths[t][n].list[i];
         struct ncclTopoNode* remNode = link->remNode;
-        sprintf(line+offset, "--%s->%s/%lX", topoLinkTypeStr[link->type], topoNodeTypeStr[remNode->type], remNode->id);
+        sprintf(line+offset, "--%s(%g)->%s/%lX", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[remNode->type], remNode->id);
         offset = strlen(line);
       }
       INFO(NCCL_GRAPH, "%s (%f)", line, node->paths[t][n].bw);
@@ -212,14 +212,14 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
   if (*level == -1) {
     int l = -1;
     if (disableEnv) {
-      char* str = getenv(disableEnv);
+      const char* str = ncclGetEnv(disableEnv);
       if (str) {
         int disable = strtol(str, NULL, 0);
         if (disable == 1) l = 0;
       }
     }
     if (l == -1) {
-      char* str = getenv(levelEnv);
+      const char* str = ncclGetEnv(levelEnv);
       if (str) {
         for (int i=0; i<=PATH_SYS; i++) {
           if (strcmp(str, topoPathTypeStr[i]) == 0) {
@@ -318,14 +318,15 @@ compare:
         status = ncclNvmlDevicePairs[indexes[i-1]][indexes[i-0]].p2pStatusWrite;
         good &= status == NVML_P2P_STATUS_OK;
         if (!good) {
-          if (ncclParamIgnoreDisabledP2p()) {
-            *p2p = 0;
-          } else if (path->type <= PATH_NVB) {
-            WARN("P2P is disabled between NVLINK connected GPUs %d and %d. This should not be the case given their connectivity, and is probably due to a hardware issue. If you still want to proceed, you can set NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]);
-            return ncclUnhandledCudaError;
-          } else if (path->type < PATH_SYS) {
-            INFO(NCCL_INIT, "P2P is disabled between connected GPUs %d and %d. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]);
+          if (!ncclParamIgnoreDisabledP2p()) {
+            if (path->type <= PATH_NVB) {
+              WARN("P2P is disabled between NVLINK connected GPUs %d and %d. This should not be the case given their connectivity, and is probably due to a hardware issue. If you still want to proceed, you can set NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]);
+              return ncclUnhandledCudaError;
+            } else if (path->type < PATH_SYS) {
+              INFO(NCCL_INIT, "P2P is disabled between connected GPUs %d and %d. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]);
+            }
           }
+          *p2p = 0;
         }
       }
     }
@@ -360,7 +361,8 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
   if (read) { // For reads (sends) only enable under certain conditions
     int gdrReadParam = ncclParamNetGdrRead();
     if (gdrReadParam == 0) return ncclSuccess;
-    if (gdrReadParam < 0) {
+    // Disable GDR Reads pre-Ampere when we have other PCI flows
+    if (gdrReadParam < 0 && gpu->gpu.cudaCompCap < 80) {
       int nvlink = 0;
       // Since we don't know whether there are other communicators,
       // it's better to keep things local if we have a single GPU.
@@ -400,7 +402,7 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
 }
 
 // Set to 0 to disable the flush on Hopper when using GDR
-NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 1);
+NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 0);
 
 // Determine whether we need to flush the GDR recv buffers
 ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush) {
diff --git a/src/graph/search.cc b/src/graph/search.cc
index dd8896bd20..3ebb0d4204 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -49,10 +49,10 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
   return ncclSuccess;
 }
 
-static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode* node2, struct ncclTopoLink** revLink) {
+static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode* node2, int type, struct ncclTopoLink** revLink) {
   for (int l=0; l<node2->nlinks; l++) {
     struct ncclTopoLink* link = node2->links+l;
-    if (link->remNode == node1) {
+    if (link->remNode == node1 && link->type == type) {
       *revLink = link;
       return ncclSuccess;
     }
@@ -85,11 +85,11 @@ static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNod
     float fwBw = link->type == LINK_PCI ? pciBw : bw;
     float revBw = 0;
     if (link->remNode->type == GPU && link->remNode->gpu.cudaCompCap < 80 && start->type != GPU) {
-      if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink));
+      if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, link->type, &revLink));
       revBw += fwBw/8;
     }
-    if (link->remNode->type == CPU && link->type == LINK_NVL) {
-      if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink));
+    if (link->remNode->type == CPU && link->remNode->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER && link->type == LINK_NVL) {
+      if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, link->type, &revLink));
       revBw += fwBw;
     }
     if (link->bw < fwBw || (revBw && revLink->bw < revBw)) { *steps = step; return ncclSuccess; }
@@ -260,6 +260,32 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
   } else {
     for (int i=0; i<count; i++) next[i] = scores[i].g;
   }
+
+  if (system->nodes[NVS].count) {
+    // NVSwitches prefer when we talk to a limited set of peers. Try to use neighbors first.
+    int index = gpu-system->nodes[GPU].nodes;
+    int i;
+    int prevGpu = (index-1+ngpus)%ngpus;
+    int nextGpu = (index+1)%ngpus;
+    int firstGpus[2];
+    int firstGpuCount = 0;
+    if (graph->pattern == NCCL_TOPO_PATTERN_RING) {
+      firstGpus[0] = nextGpu; firstGpus[1] = prevGpu; firstGpuCount = 2;
+    } else if (graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ||
+        graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) {
+      firstGpus[0] = prevGpu; firstGpus[1] = nextGpu; firstGpuCount = 2;
+    } else {
+      firstGpus[0] = nextGpu; firstGpuCount = 1;
+    }
+    for (int g=0; g<firstGpuCount; g++) {
+      for (i=0; i<count && next[i] != firstGpus[g]; i++);
+      if (i<count) {
+        for (; i>0; i--) next[i] = next[i-1];
+        next[0] = firstGpus[g];
+      }
+    }
+  }
+
   *countPtr = count;
   return ncclSuccess;
 }
@@ -267,7 +293,7 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
 ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time);
 
 // Try to keep all searchs within one second
-#define NCCL_SEARCH_GLOBAL_TIMEOUT (5ULL<<16)
+#define NCCL_SEARCH_GLOBAL_TIMEOUT (1ULL<<19)
 #define NCCL_SEARCH_TIMEOUT (1<<14)
 #define NCCL_SEARCH_TIMEOUT_TREE (1<<14)
 #define NCCL_SEARCH_TIMEOUT_SAMECHANNELS (1<<8)
@@ -342,6 +368,7 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
 
   if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) { // NVLS channels correspond to GPUs pulling from NVLS. So the more the better.
     if (graph->nChannels > refGraph->nChannels && graph->nChannels <= system->nodes[GPU].count) *copy = 1;
+    if (graph->nChannels*graph->bwInter > refGraph->nChannels*refGraph->bwInter) *copy = 1;
     return ncclSuccess;
   }
   // 2. Try to get better bandwidth
@@ -358,30 +385,27 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
   return ncclSuccess;
 }
 
-// Build a list of the best NETs to try.
+// Build a sorted list of the NETs to try.
 //
 // "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu
 //  index when trying to get back to the NIC.
 //
 // The list is built the following way:
 // 1. Select NETs starting with those close to GPU(s), based on paths[n].type.
-// 2. For each GPU, once that list of NICs with a given distance is prepared, shuffle the list
-//    based on the GPU NVML index so that e.g. GPU 1 chooses NIC 1 first instead of NIC 0 which
-//    might have been choosen by GPU 0 (case with multiple independent communicators per node)
-// 3. Then add the NETs to the final list if they were not already added by another closer GPU.
+// 2. Add other NETs satisfying typeInter that are not already in the list.
 
 ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) {
   int netCount = 0;
   int localNetCount;
   int* localNets;
-  NCCLCHECK(ncclCalloc(&localNets, system->nodes[NET].count));
+  NCCLCHECK(ncclCalloc(&localNets, MAXCHANNELS));
 
   // First add the preferred NICs
   for (int g=0; g<system->nodes[GPU].count; g++) {
     if (gpu != -1 && gpu != g) continue;
     localNetCount = 0;
     struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
-    for (int c = 0;; c++) {
+    for (int c = 0; c<MAXCHANNELS; c++) {
       int netId;
       NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId));
       NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount));
@@ -451,11 +475,11 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
       int startNetIndex;
       NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex));
       struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex;
-      int netcount;
+      int netCount;
       int* nets;
       NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
-      NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netcount));
-      for (int i=0; i<netcount; i++) {
+      NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount));
+      for (int i=0; i<netCount; i++) {
         int n = nets[i];
         struct ncclTopoNode* net = system->nodes[NET].nodes+n;
         if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric
@@ -523,12 +547,12 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
   const int bw = graph->bwInter;
   int* nets;
   NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
-  int netcount;
-  NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netcount));
-  for (int i=0; i<netcount; i++) {
-    int n = nets[i];
+  int netCount;
+  NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount));
+  for (int i=0; i<netCount; i++) {
+    if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && i>0) continue;
+    int n = nets[(graph->nChannels+i)%netCount];
     struct ncclTopoNode* net = system->nodes[NET].nodes+n;
-    struct ncclTopoNode* gpu;
     if (graph->collNet && net->net.collSupport == 0) continue;
     if (net->net.bw < bw) continue;
 
@@ -542,12 +566,12 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
       }
     }
 
-    // NVLS needs to balance on all NICs
     if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
-      if (graph->nChannels < netcount) {
+      // NVLS search only tries to find NIC:GPU combinations to compute the heads.
+      if (graph->nChannels < netCount) {
         int gpu;
-        NCCLCHECK(ncclTopoGetLocalGpu(system, system->nodes[NET].nodes[nets[graph->nChannels]].id, &gpu));
-        if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, gpu));
+        NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu));
+        if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu));
       }
     } else {
       if (graph->nChannels > 0) {
@@ -557,7 +581,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
         NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
       }
       if (graph->nChannels == 0 || graph->sameChannels == 0) {
-        if (graph->nChannels == 0) {
+        if (graph->nChannels == 0 && system->nodes[NVS].count == 0) {
           // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long
           int t = 1 << 10;
           NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0));
@@ -577,18 +601,10 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
           }
         }
         if (maxBw >= bw) {
-          // In the first loop, avoid using GPUs in both directions between channels (one channel
-          // sending from that GPU and one channel receiving to that GPU), since that usually leads
-          // to lower BW.
-          for (int tryGpuBidir=0; tryGpuBidir<2; tryGpuBidir++) {
-            for (int g=0; g<system->nodes[GPU].count; g++) {
-              if (paths[g].bw == maxBw && paths[g].count == minHops) {
-                gpu = system->nodes[GPU].nodes+g;
-                int gpuUsed = gpuPciBw(gpu) > 0 ? 0 : 1;
-                if (tryGpuBidir == gpuUsed) {
-                  NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
-                }
-              }
+          for (int i=0; i<system->nodes[GPU].count; i++) {
+            int g = (graph->nChannels+i)%system->nodes[GPU].count;
+            if (paths[g].bw == maxBw && paths[g].count == minHops) {
+              NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
             }
           }
         }
@@ -804,33 +820,50 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs
   return ncclSuccess;
 }
 
+// Duplicate the channels already in "graph" (capped at maxChannels) and divide bwIntra/bwInter to match.
+ncclResult_t ncclTopoDupChannels(struct ncclTopoGraph* graph, int ccMin, int ngpus) {
+  const int nCh = graph->nChannels;
+  if (nCh == 0 || graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->bwIntra < 25.0) return ncclSuccess;
+  if (ccMin > 80 && graph->bwIntra < 50.0 && nCh > 4) return ncclSuccess;
+  const int total = std::min(nCh*2, graph->maxChannels);
+  const int extra = total-nCh; // channels appended right after the existing ones
+  memcpy(graph->intra+nCh*ngpus, graph->intra, extra*ngpus*sizeof(int));
+  memcpy(graph->inter+nCh*2, graph->inter, extra*2*sizeof(int));
+  graph->bwIntra /= DIVUP(total, nCh);
+  graph->bwInter /= DIVUP(total, nCh);
+  graph->nChannels = total;
+  return ncclSuccess;
+}
+
 float speedArrayIntra[] = { 40.0, 30.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0 };
 float speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
 #define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float))
 #define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float))
 
-float sm90SpeedArrayIntra[] = { 60.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0 };
-float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
+float sm90SpeedArrayIntra[] = { 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0 };
+float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
 #define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra)/sizeof(float))
 #define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float))
 
 ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
   int ngpus = system->nodes[GPU].count;
-  graph->crossNic = ncclParamCrossNic();
-  int crossNic = (system->nodes[NET].count > 1) && graph->crossNic &&
+  int crossNic = (system->nodes[NET].count > 1) &&
 	 (graph->pattern == NCCL_TOPO_PATTERN_RING ||
-	  graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE ||
-	  graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) ? 1 : 0;
+          graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE ||
+          graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) ? ncclParamCrossNic() : 0;
+  graph->crossNic = crossNic == 1 ? 1 : 0;
   graph->bwIntra = graph->bwInter = 0;
   graph->latencyInter = 0;
-  if (graph->crossNic == 2) graph->crossNic = 0;
   graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
   graph->typeInter = PATH_PIX;
   graph->nChannels = 0;
   int trySameChannels = graph->pattern == NCCL_TOPO_PATTERN_NVLS ? 0 : 1;
   graph->sameChannels = trySameChannels;
 
-  char* str = getenv("NCCL_GRAPH_FILE");
+  int cpuArch, cpuVendor, cpuModel;
+  NCCLCHECK(ncclTopoCpuType(system, &cpuArch, &cpuVendor, &cpuModel));
+
+  const char* str = ncclGetEnv("NCCL_GRAPH_FILE");
   if (str) {
     INFO(NCCL_ENV, "NCCL_GRAPH_FILE set by environment to %s", str);
     struct ncclXml* xml;
@@ -846,6 +879,8 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
   int ccMin;
   NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL));
   if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90)) return ncclSuccess;
+  // NVLS search must have ngpus heads at most.
+  if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) graph->maxChannels = system->nodes[GPU].count;
 
   if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
 
@@ -884,7 +919,7 @@ search:
 
   NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, &time));
 #if 0
-  printf("Pattern %d, crossNic %d, Bw %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.bwInter, tmpGraph.bwIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->bwInter, graph->bwIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : "");
+  printf("Id %d Pattern %d, crossNic %d, Bw %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.id, tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.bwInter, tmpGraph.bwIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->bwInter, graph->bwIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : "");
   for (int c=0; c<graph->nChannels; c++) {
     printf("%2d : ", c);
     for (int g=0; g<ngpus; g++) {
@@ -901,8 +936,9 @@ search:
   if (pass == 1) {
     // First pass, we don't have a solution yet ; try other options
 
-    // Try having different channels
-    if (tmpGraph.sameChannels == 1) {
+    // Try having different channels (except when going through AMD CPUs)
+    if (tmpGraph.sameChannels == 1 &&
+        !(cpuArch == NCCL_TOPO_CPU_ARCH_X86 && cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD && tmpGraph.typeIntra == PATH_SYS)) {
       tmpGraph.sameChannels = 0;
       goto search;
     }
@@ -932,12 +968,12 @@ search:
     }
     tmpGraph.typeInter = PATH_PIX;
 
-    if (crossNic && tmpGraph.crossNic == 0) {
+    if (crossNic == 2 && tmpGraph.crossNic == 0) {
       // Try again with crossNic if permitted
-      tmpGraph.crossNic = crossNic;
+      tmpGraph.crossNic = 1;
       goto search;
     }
-    tmpGraph.crossNic = 0;
+    tmpGraph.crossNic = crossNic == 1 ? 1 : 0;
 
     // Decrease bw until we find a solution
     if ((speedIndex < nspeeds-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->bwInter > .49))) {
@@ -954,6 +990,7 @@ done:
   // We have a solution. Start from that solution and move to pass 2.
   if (pass == 1) {
     time = -1;
+    NCCLCHECK(ncclTopoDupChannels(graph, ccMin, ngpus));
     memcpy(&tmpGraph, graph, sizeof(tmpGraph));
     speedIndex = 0;
     while (speedArray[speedIndex] > graph->bwInter && speedIndex < nspeeds-1) speedIndex++;
@@ -962,13 +999,22 @@ done:
     pass = 2;
   }
 
-  // 3. See if we can increase bwIntra for trees (2 nodes or collnet)
   if (pass == 2) {
-    if (time != 0 && graph->pattern != NCCL_TOPO_PATTERN_RING &&
-        tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter*2 &&
-        speedIndex > 0) {
-      tmpGraph.bwIntra = speedArray[--speedIndex];
-      goto search;
+    // See if we can increase bw
+    if (time != 0 && speedIndex > 0) {
+      if (graph->pattern == NCCL_TOPO_PATTERN_RING) {
+        // increase bw for Ring
+        tmpGraph.bwIntra = tmpGraph.bwInter = speedArray[--speedIndex];
+        goto search;
+      } else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && tmpGraph.bwInter == graph->bwInter && tmpGraph.bwInter < tmpGraph.bwIntra*2) {
+        tmpGraph.minChannels = tmpGraph.maxChannels = graph->nChannels;
+        tmpGraph.bwInter = speedArray[--speedIndex];
+        goto search;
+      } else if (tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter*2) {
+        // increase bwIntra for trees (2 nodes or collnet)
+        tmpGraph.bwIntra = speedArray[--speedIndex];
+        goto search;
+      }
     }
     time = -1;
     memcpy(&tmpGraph, graph, sizeof(tmpGraph));
@@ -982,18 +1028,6 @@ done:
     graph->typeIntra = graph->typeInter = PATH_SYS;
     graph->nChannels = 1;
   }
-
-  if (graph->nChannels == 0) return ncclSuccess;
-  if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) return ncclSuccess;
-  if (graph->bwIntra < 25.0) return ncclSuccess;
-  if (ccMin > 80 && graph->bwIntra < 50.0 && graph->nChannels > 4) return ncclSuccess;
-
-  int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
-  memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
-  memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
-  graph->bwIntra /= DIVUP(dupChannels, graph->nChannels);
-  graph->bwInter /= DIVUP(dupChannels, graph->nChannels);
-  graph->nChannels = dupChannels;
   return ncclSuccess;
 }
 
@@ -1023,7 +1057,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
 }
 
 ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs) {
-  char* str = getenv("NCCL_GRAPH_DUMP_FILE");
+  const char* str = ncclGetEnv("NCCL_GRAPH_DUMP_FILE");
   if (str) {
     INFO(NCCL_ENV, "NCCL_GRAPH_DUMP_FILE set by environment to %s", str);
     struct ncclXml* xml;
diff --git a/src/graph/topo.cc b/src/graph/topo.cc
index cdcc0664f7..481def486b 100644
--- a/src/graph/topo.cc
+++ b/src/graph/topo.cc
@@ -72,6 +72,9 @@ static ncclResult_t ncclTopoGetInterCpuBw(struct ncclTopoNode* cpu, float* bw) {
   if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
     *bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_BW : QPI_BW;
   }
+  if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_AMD) {
+    *bw = AMD_BW;
+  }
   if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
     *bw = cpu->cpu.model ==  NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_BW : ZPI_BW;
   }
@@ -540,6 +543,36 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
   return ncclSuccess;
 }
 
+// Recursively walk the XML tree; for every "c2c" node, link the GPU found via the enclosing busid to its local CPU (both directions).
+ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId) {
+  if (strcmp(node->name, "c2c") == 0) {
+    struct ncclTopoNode* gpu = NULL;
+    int64_t pBusId;
+    NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
+    NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
+    if (gpu == NULL) {
+      WARN("Add C2C error : could not find GPU %lx", pBusId);
+      return ncclInternalError;
+    }
+    int count = 0, bw = 0;
+    NCCLCHECK(xmlGetAttrInt(node, "count", &count));
+    NCCLCHECK(xmlGetAttrInt(node, "bw", &bw));
+    double c2cBw = (bw*count)/1000.0; // total over "count" links, scaled by 1/1000
+    struct ncclTopoNode* cpu = NULL;
+    NCCLCHECK(findLocalCpu(gpu, &cpu));
+    if (cpu == NULL) return ncclSuccess; // no local CPU found: nothing to connect, not an error
+    NCCLCHECK(ncclTopoConnectNodes(gpu, cpu, LINK_NVL, c2cBw));
+    NCCLCHECK(ncclTopoConnectNodes(cpu, gpu, LINK_NVL, c2cBw));
+  } else {
+    const char* busId;
+    NCCLCHECK(xmlGetAttr(node, "busid", &busId));
+    for (int s=0; s<node->nSubs; s++) {
+      NCCLCHECK(ncclTopoAddC2c(node->subs[s], system, busId ? busId : parentBusId)); // children inherit the nearest busid
+    }
+  }
+  return ncclSuccess;
+}
+
 ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem) {
   NCCLCHECK(ncclCalloc(topoSystem, 1));
   struct ncclXmlNode* topNode;
@@ -549,6 +582,7 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem
     if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem));
   }
   NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL));
+  NCCLCHECK(ncclTopoAddC2c(topNode, *topoSystem, NULL));
 
   NCCLCHECK(ncclTopoFlattenBcmSwitches(*topoSystem));
   NCCLCHECK(ncclTopoConnectCpus(*topoSystem));
@@ -595,7 +629,7 @@ static ncclResult_t xmlInitAttrFloat(struct ncclXmlNode* node, const char* attrN
 ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
   struct ncclXml* xml;
   NCCLCHECK(ncclCalloc(&xml, 1));
-  char* xmlTopoFile = getenv("NCCL_TOPO_FILE");
+  const char* xmlTopoFile = ncclGetEnv("NCCL_TOPO_FILE");
   if (xmlTopoFile) {
     INFO(NCCL_ENV, "NCCL_TOPO_FILE set by environment to %s", xmlTopoFile);
     NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml, 1));
@@ -668,7 +702,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
   // Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
   NCCLCHECK(ncclTopoTrimXml(xml));
 
-  xmlTopoFile = getenv("NCCL_TOPO_DUMP_FILE");
+  xmlTopoFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
   if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
     INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
     NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml));
@@ -704,7 +738,7 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch
   int* localNets;
   int localNetCount;
   NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL));
-  int* localGpus;
+  int* localGpus = NULL;
   int localGpuCount;
   NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL));
   int net = system->nodes[GPU].nodes[gpu].gpu.dev;
@@ -717,17 +751,25 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch
 }
 
 ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex) {
+  int netIndex; // filled by ncclTopoIdToIndex from the NET id "net"
+  NCCLCHECK(ncclTopoIdToIndex(system, NET, net, &netIndex));
+  int* localGpus = NULL; // filled by ncclTopoGetLocal below; released with free() on both normal return paths
+  int localGpuCount; // NOTE(review): if NCCLCHECK returns early on failure inside the loop, free(localGpus) is skipped - confirm this error path is acceptable
+  NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, &localGpus, &localGpuCount, NULL));
   for (int c=0; c<MAXCHANNELS; c++) {
-    for (int g=0; g<system->nodes[GPU].count; g++) {
+    for (int lg=0; lg<localGpuCount; lg++) { // only scan the GPUs returned for this NET rather than every GPU
+      int g = localGpus[lg]; // entry is used as an index into system->nodes[GPU].nodes
       struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
       int id;
       NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id));
       if (net == id) {
         *gpuIndex = g;
+        free(localGpus);
         return ncclSuccess;
       }
     }
   }
+  free(localGpus);
   *gpuIndex = -1;
   return ncclSuccess;
 }
@@ -836,14 +878,3 @@ ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int*
   if (ccMax) *ccMax = max;
   return ncclSuccess;
 }
-
-ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank) {
-  for (int g=0; g<system->nodes[GPU].count; g++) {
-    if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
-      *localRank = g;
-      return ncclSuccess;
-    }
-  }
-  WARN("Could not find local GPU with rank %d", rank);
-  return ncclInternalError;
-}
diff --git a/src/graph/topo.h b/src/graph/topo.h
index 8951505fd6..b067f2f975 100644
--- a/src/graph/topo.h
+++ b/src/graph/topo.h
@@ -18,6 +18,7 @@
 #define SM86_NVLINK_BW 12.0
 #define PCI_BW 12.0           // PCI Gen3 x16
 #define QPI_BW 6.0
+#define AMD_BW 16.0
 #define SKL_QPI_BW 10.0
 #define ZPI_BW 6.0
 #define YONGFENG_ZPI_BW 9.0
diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc
index a43ea2628c..a97ed9a1ad 100644
--- a/src/graph/tuning.cc
+++ b/src/graph/tuning.cc
@@ -5,7 +5,7 @@
  ************************************************************************/
 
 #include "core.h"
-#include "devcomm.h"
+#include "device.h"
 #include "comm.h"
 #include "topo.h"
 
@@ -54,9 +54,9 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li
 // Latencies in us, Bandwidths in GB/s
 // Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
 static const float baseLat  [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {
-       {  6.8, 14.0,    0 }, {  6.6, 14.0,  8.4 }, // Tree, Ring
-       {  6.8, 14.0,    0 }, {  6.8, 14.0,    0 },       // Collnet Direct, Chain
-       {    0,    0, 23.0 }, {    0,    0, 23.0 }};     // NVLS, NVLS Tree
+       {  6.8, 14.0,    0 }, {  6.6, 14.0,  8.4 },  // Tree, Ring
+       {    0,    0,    0 }, {    0,    0,    0 },  // Collnet Direct, Chain
+       {    0,    0,    0 }, {    0,    0,    0 }}; // NVLS, NVLS Tree
 
 // NVLink, PCI, Network
 #define NCCL_HW_NVLINK 0
@@ -64,17 +64,17 @@ static const float baseLat  [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {
 #define NCCL_HW_NET 2
 static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
 { /* NVLINK */
-  { /* Tree (LL/LL128/Simple)*/ { .6, 1.25,  4 }, /* Ring (LL/LL128/Simple)*/ { .6, 1.9, 3.4 },
-    /* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 4.75 },
-    /* NVLS */ { 0, 0, 0 }, /* NVLSTree */ { 0, 0, 0 } },
+  { /* Tree (LL/LL128/Simple)*/ { .6, 1.25, 28 }, /* Ring (LL/LL128/Simple)*/ { .6, 1.9, 3.4 },
+    /* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 },
+    /* NVLS */ { 0, 0, 23 }, /* NVLSTree */ { 0, 0, 23 } },
   /* PCI */
-  { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9,  6 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 },
-    /* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 8.0 },
+  { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 },
+    /* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 },
     /* NVLS */ { 0, 0, 0 }, /* NVLSTree */ { 0, 0, 0 } },
   /* NET */
-  { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 14 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 14.0 },
-    /* CollNetDirect (Simple)*/ { 0, 0, 10.7 }, /* CollNetChain (Simple)*/ { 0, 0, 14 },
-    /* NVLS */ { 0, 0, 18 }, /* NVLSTree */ { 0, 0, 19 } }
+  { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 28 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 14.0 },
+    /* CollNetDirect (Simple)*/ { 0, 0, 31 }, /* CollNetChain (Simple)*/ { 0, 0, 30 },
+    /* NVLS */ { 0, 0, 18 }, /* NVLSTree */ { 0, 0, 14 } }
 };
 
 /* Array indexes used below */
@@ -165,13 +165,15 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
     for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
       if (coll == ncclFuncBroadcast && a != NCCL_ALGO_RING) continue;
       if (coll == ncclFuncReduce && a != NCCL_ALGO_RING) continue;
-      if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING) continue;
-      if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING) continue;
+      if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS) continue;
+      if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS) continue;
 
       for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
         if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && p != NCCL_PROTO_SIMPLE) continue;
         int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
         float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
+        if (a == NCCL_ALGO_NVLS) bw = std::min(graphs[a]->bwIntra, graphs[a]->bwInter);
+        if (a == NCCL_ALGO_NVLS_TREE) bw = std::min(graphs[a]->bwIntra, nNodes <= 2 ? graphs[a]->bwInter : graphs[a]->bwInter/2);
         float busBw = graphs[a]->nChannels * bw;
 
         // Various model refinements
@@ -194,10 +196,12 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
         // Convert bus BW to algorithm BW
         float ratio;
         if (a == NCCL_ALGO_RING) ratio = (1.0 * nRanks) / nsteps;
-        else if (a == NCCL_ALGO_NVLS) ratio = 5.0/6.0;
-        else if (a == NCCL_ALGO_NVLS_TREE) ratio = .70 * nNodes / (2*(nNodes-1));
+        else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio = 5.0/6.0;
         else ratio = .5;
         comm->bandwidths[coll][a][p] = busBw * ratio;
+        /* Ring bandwidth backup */
+        if (a == NCCL_ALGO_RING)
+          comm->ringbdw[coll][p] = comm->bandwidths[coll][NCCL_ALGO_RING][p];
 
         comm->latencies[coll][a][p] = baseLat[a][p];
         float intraLat = hwLat[intraHw[a]][a][p];
@@ -229,13 +233,14 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
             2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat);
         } else if (a == NCCL_ALGO_COLLNET_DIRECT) {
           comm->latencies[coll][a][p] +=
-            2 * (std::min(1, (nRanks/nNodes-1)) * intraLat + (nRanks/nNodes-1) * 0.5) + interLat;  // Add 0.5 arity serialization latency
+            2 * (std::min(1, (nRanks/nNodes-1)) * intraLat + (nRanks/nNodes-1) * 0.4) + interLat;  // Add 0.4 us arity serialization latency
         } else if (a == NCCL_ALGO_COLLNET_CHAIN) {
           comm->latencies[coll][a][p] += 2 * (nRanks/nNodes-1) * intraLat + interLat;
         } else if (a == NCCL_ALGO_NVLS) {
-          if (nNodes > 1) comm->latencies[coll][a][p] += hwLat[NCCL_HW_NET][a][p];
+          comm->latencies[coll][a][p] = intraLat;
+          if (nNodes > 1) comm->latencies[coll][a][p] += interLat;
         } else if (a == NCCL_ALGO_NVLS_TREE) {
-          comm->latencies[coll][a][p] += 2*(nNodes-1)*hwLat[NCCL_HW_NET][a][p];
+          comm->latencies[coll][a][p] += intraLat + 2 * log2i(nNodes) * interLat;
         }
       }
     }
@@ -246,12 +251,12 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
   int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
   int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1 };
 
-  const char *protoStr = getenv("NCCL_PROTO");
+  const char *protoStr = ncclGetEnv("NCCL_PROTO");
   if (protoStr) {
     INFO(NCCL_ENV, "NCCL_PROTO set by environment to %s", protoStr);
     NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable));
   }
-  const char *algoStr = getenv("NCCL_ALGO");
+  const char *algoStr = ncclGetEnv("NCCL_ALGO");
   if (algoStr) {
     INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr);
     NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
@@ -293,11 +298,25 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
       }
     }
     if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
-    // Never disable ring for non-allreduce operations. That allows to run real apps with NCCL_ALGO=TREE.
-    if (a == NCCL_ALGO_RING && c != ncclFuncAllReduce) continue;
     if (algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
   }
 
+  for (int c = 0; c < NCCL_NUM_FUNCTIONS; c++) {
+    bool available = false;
+    for (int a = 0; a < NCCL_NUM_ALGORITHMS; a++)
+      for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++)
+        if (comm->bandwidths[c][a][p] != 0) {
+          available = true;
+          goto check_avail;
+        }
+  check_avail:
+    if (available == false) {
+      /* at least set ring algo available */
+      for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++)
+        comm->bandwidths[c][NCCL_ALGO_RING][p] = comm->ringbdw[c][p];
+    }
+  }
+
   if (comm->rank == 0) {
     char line[1024];
     for (int block=0; block<2; block++) {
@@ -346,7 +365,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
   comm->threadThresholds[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] = 512;
 
   // Override defaults with user env
-  char* str = getenv("NCCL_THREAD_THRESHOLDS");
+  const char* str = ncclGetEnv("NCCL_THREAD_THRESHOLDS");
   if (str) {
     INFO(NCCL_ENV, "NCCL_THREAD_THRESHOLDS set by environment to %s", str);
     ssize_t t[2][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2 }};
@@ -378,9 +397,19 @@ static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = {
   {  .9,  .9,  .9,  .9,  .9,  .9,  .9,  .8,  .7,  .6,  .6,  .5,  .5,  .5,  .5,  .6,  .7,  .8,  .7,  .7,  .8,  .9,  .9 }
 };
 
-ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time) {
-  float bw = info->comm->bandwidths[info->coll][algorithm][protocol];
+ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time, bool* backup) {
+  float bw = info->comm->bandwidths[info->coll][algorithm][protocol]; 
   float lat = info->comm->latencies[info->coll][algorithm][protocol];
+  
+  if (backup) {
+    *backup = false;
+    if (algorithm == NCCL_ALGO_RING && bw == 0.0f) {
+      /* ring has zero bandwidth here: fall back to the backup ring bandwidth (ringbdw) */
+      bw = info->comm->ringbdw[info->coll][protocol];
+      *backup = true;
+    }
+  }
+
   if (bw == 0) {
     *time = -1.0; return ncclSuccess;
   }
diff --git a/src/graph/xml.cc b/src/graph/xml.cc
index ac862a4e06..47fda1f851 100644
--- a/src/graph/xml.cc
+++ b/src/graph/xml.cc
@@ -254,9 +254,13 @@ ncclResult_t ncclTopoXmlLoadNvlink(FILE* file, struct ncclXml* xml, struct ncclX
   return ncclSuccess;
 }
 
+ncclResult_t ncclTopoXmlLoadC2c(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { // Handler registered for "c2c" elements in ncclTopoXmlLoadGpu.
+  NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); // No sub-element handlers are passed (NULL table, count 0).
+  return ncclSuccess;
+}
 ncclResult_t ncclTopoXmlLoadGpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
-  struct xmlHandler handlers[] = { { "nvlink", ncclTopoXmlLoadNvlink } };
-  NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1));
+  struct xmlHandler handlers[] = { { "nvlink", ncclTopoXmlLoadNvlink }, { "c2c", ncclTopoXmlLoadC2c } };
+  NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 2));
   return ncclSuccess;
 }
 
@@ -687,6 +691,41 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
       }
     }
   }
+#if CUDART_VERSION >= 11080
+  struct ncclXmlNode* c2cNode = NULL;
+  NCCLCHECK(xmlGetSub(gpuNode, "c2c", &c2cNode));
+  if (c2cNode == NULL) {
+      if (sm >= 90) {
+        int c2cLinksCount = 0;
+        nvmlFieldValue_t fv;
+        fv.fieldId = NVML_FI_DEV_C2C_LINK_COUNT;
+        if ((ncclNvmlDeviceGetFieldValues(nvmlDev, 1, &fv) == ncclSuccess) && (fv.nvmlReturn == NVML_SUCCESS)) {
+          c2cLinksCount = fv.value.uiVal;
+          int bw = 0;
+	  int count = 0;
+          for (int l=0; l<c2cLinksCount; l++) {
+            nvmlFieldValue_t fvs[2];
+            fvs[0].fieldId = NVML_FI_DEV_C2C_LINK_GET_STATUS;
+            fvs[0].scopeId = l;
+            fvs[1].fieldId = NVML_FI_DEV_C2C_LINK_GET_MAX_BW;
+            fvs[1].scopeId = l;
+            if ((ncclNvmlDeviceGetFieldValues(nvmlDev, 2, fvs) == ncclSuccess) &&
+                (fvs[0].nvmlReturn == NVML_SUCCESS) &&
+                (fvs[0].value.uiVal == 1) &&
+                (fvs[1].nvmlReturn == NVML_SUCCESS)) {
+              bw = fvs[1].value.uiVal;
+	      count++;
+            }
+          }
+          if (count > 0) {
+            NCCLCHECK(xmlAddNode(xml, gpuNode, "c2c", &c2cNode));
+            NCCLCHECK(xmlSetAttrInt(c2cNode, "bw", bw));
+            NCCLCHECK(xmlSetAttrInt(c2cNode, "count", count));
+          }
+        }
+      }
+  }
+#endif
   // Fill target classes
   for (int s=0; s<gpuNode->nSubs; s++) {
     struct ncclXmlNode* sub = gpuNode->subs[s];
diff --git a/src/group.cc b/src/group.cc
index a889c060cb..29400d6bcb 100644
--- a/src/group.cc
+++ b/src/group.cc
@@ -22,7 +22,6 @@ __thread int ncclGroupBlocking = -1; /* default mode */
 __thread bool ncclGroupJobAbortFlag = false;
 
 void* ncclAsyncJobMain(void* arg);
-static ncclResult_t groupJobComplete(struct ncclGroupJob *job);
 
 ncclResult_t ncclAsyncLaunch(
     struct ncclAsyncJob* job,
@@ -181,9 +180,28 @@ failure:
   return result;
 }
 
-static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** groupCommPreconnectHeadPtr, struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next>* asyncJobsPtr, ncclResult_t* groupErrorPtr, ncclResult_t error) {
+static inline void groupResetJobState(struct ncclGroupJob* job) { // Resets the state the job points at to its defaults, then zeroes the job itself; no-op when job is NULL.
+  if (job) {
+    if (job->groupBlockingPtr) *job->groupBlockingPtr = -1; // -1 is the initial ("default mode") value of ncclGroupBlocking
+    if (job->abortFlagPtr) *job->abortFlagPtr = false;
+    if (job->groupErrorPtr) *job->groupErrorPtr = ncclSuccess;
+    if (job->groupCommHeadPtr) *job->groupCommHeadPtr = NULL;
+    if (job->groupCommPreconnectHeadPtr) *job->groupCommPreconnectHeadPtr = NULL;
+    memset(job, 0, sizeof(struct ncclGroupJob)); // also clears `initialized`, so later Complete/Abort calls on this job do nothing
+  }
+  return;
+}
+
+static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** groupCommPreconnectHeadPtr, struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next>* asyncJobsPtr, ncclResult_t* groupErrorPtr, int* groupBlockingPtr, volatile bool* groupJobAbortFlagPtr, ncclResult_t error) {
   struct ncclComm* comm = *groupCommHeadPtr;
 
+  /* reset all thread local variables */
+  *groupCommHeadPtr = NULL;
+  *groupCommPreconnectHeadPtr = NULL;
+  *groupErrorPtr = ncclSuccess;
+  *groupBlockingPtr = -1;
+  *groupJobAbortFlagPtr = false;
+
   while (comm != nullptr) {
     struct ncclComm* next = comm->groupNext;
     (void) ncclGroupCommLeave(comm); // overwrites comm->groupNext
@@ -233,16 +251,12 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
   /* reset everything */
   while (!ncclIntruQueueEmpty(asyncJobsPtr)) {
     struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsPtr);
-    *job->abortFlag = 1;
     if (job->comm && !job->comm->config.blocking)
       (void) ncclCommSetAsyncError(job->comm, error);
     if (job->undo) job->undo(job);
     if (job->destructor) job->destructor((void*)job);
   }
 
-  *groupErrorPtr = ncclSuccess;
-  *groupCommHeadPtr = nullptr;
-  *groupCommPreconnectHeadPtr = nullptr;
   return;
 }
 
@@ -325,9 +339,6 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
     NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail);
   }
 
-  /* this atomic must happen before cleanup and setting state of communicators */
-  __atomic_store_n(&gjob->doneFlag, true, __ATOMIC_RELEASE);
-
   while (!ncclIntruQueueEmpty(asyncJobsMain)) {
     struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain);
     if (job->comm && !job->comm->config.blocking)
@@ -345,16 +356,12 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
     groupCommHeadMain = next;
   }
 
-  *gjob->groupErrorPtr = ncclSuccess;
-  *gjob->groupCommHeadPtr = nullptr;
-  *gjob->groupCommPreconnectHeadPtr = nullptr;
-
   CUDACHECK(cudaSetDevice(savedDev));
 
 exit:
   return ret;
 fail:
-  groupCleanup(gjob->groupCommHeadPtr, gjob->groupCommPreconnectHeadPtr, gjob->asyncJobsPtr, gjob->groupErrorPtr, ret);
+  groupCleanup(gjob->groupCommHeadPtr, gjob->groupCommPreconnectHeadPtr, gjob->asyncJobsPtr, gjob->groupErrorPtr, gjob->groupBlockingPtr, gjob->abortFlagPtr, ret);
   goto exit;
 }
 
@@ -377,7 +384,8 @@ ncclResult_t ncclGroupEndInternal() {
     ncclGroupJobMain.groupErrorPtr = &ncclGroupError;
     ncclGroupJobMain.asyncJobsPtr = &ncclAsyncJobs;
     ncclGroupJobMain.abortFlagPtr = &ncclGroupJobAbortFlag;
-    ncclGroupJobMain.doneFlag = false;
+    ncclGroupJobMain.groupBlockingPtr = &ncclGroupBlocking;
+    ncclGroupJobMain.initialized = true;
     ncclGroupJobMainPtr = &ncclGroupJobMain;
     /* make sure ncclGroupBlocking has been set. */
     assert(ncclGroupBlocking == 0 || ncclGroupBlocking == 1);
@@ -387,6 +395,7 @@ ncclResult_t ncclGroupEndInternal() {
         ncclAsyncJob* job = ncclIntruQueueHead(&ncclAsyncJobs);
         do {
           NCCLCHECKGOTO(ncclCommSetAsyncError(job->comm, ncclInProgress), ret, fail);
+          job->comm->groupJob = ncclGroupJobMainPtr;
           job = job->next;
         } while (job);
       }
@@ -395,30 +404,42 @@ ncclResult_t ncclGroupEndInternal() {
         ncclComm_t comm = ncclGroupCommHead;
         do {
           NCCLCHECKGOTO(ncclCommSetAsyncError(comm, ncclInProgress), ret, fail);
+          /* link group job to communicators. */
+          comm->groupJob = ncclGroupJobMainPtr;
           comm = comm->groupNext;
         } while (comm);
       }
+
       ncclGroupJobMainPtr->base.func = groupLaunch;
       SYSCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), ret, fail);
       ret = ncclInProgress;
     } else {
       /* blocking group */
       NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base), ret, fail);
-      groupResetJobState();
+      groupResetJobState(ncclGroupJobMainPtr);
     }
   }
 
 exit:
   return ret;
 fail:
-  groupCleanup(&ncclGroupCommHead, &ncclGroupCommPreconnectHead, &ncclAsyncJobs, &ncclGroupError, ret);
-  groupResetJobState();
+  groupCleanup(&ncclGroupCommHead, &ncclGroupCommPreconnectHead, &ncclAsyncJobs, &ncclGroupError, &ncclGroupBlocking, &ncclGroupJobAbortFlag, ret);
   goto exit;
 }
 
-void ncclGroupJobAbort() {
-  ncclGroupJobAbortFlag = true;
-  (void) groupJobComplete(ncclGroupJobMainPtr);
-  /* reset group abort flag */
-  ncclGroupJobAbortFlag = false;
+ncclResult_t ncclGroupJobComplete(struct ncclGroupJob* groupJob) { // Completes the job's async base and resets its state; returns ncclSuccess without doing anything when groupJob is NULL or not initialized.
+  ncclResult_t ret = ncclSuccess;
+  if (groupJob && groupJob->initialized) {
+    ret = ncclAsyncJobComplete(&groupJob->base);
+    groupResetJobState(groupJob); // runs whatever ncclAsyncJobComplete returned; ret still carries that result
+  }
+  return ret;
+}
+
+ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob) { // Raises the job's abort flag, then completes the job; no-op for a NULL or uninitialized job.
+  if (groupJob && groupJob->initialized) {
+    *groupJob->abortFlagPtr = true; // not NULL-checked here; ncclGroupEndInternal assigns abortFlagPtr before it sets initialized
+    NCCLCHECK(ncclGroupJobComplete(groupJob)); // the state reset done inside clears the abort flag again
+  }
+  return ncclSuccess;
 }
diff --git a/src/include/alloc.h b/src/include/alloc.h
index caa9da9855..f8d954469e 100644
--- a/src/include/alloc.h
+++ b/src/include/alloc.h
@@ -101,7 +101,7 @@ static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHand
   /* Allocate the physical memory on the device */
   CUCHECK(cuMemCreate(&handle, size, &prop, 0));
   /* Reserve a virtual address range */
-  CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0));
+  CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0));
   /* Map the virtual address range to the physical allocation */
   CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
   /* Now allow RW access to the newly mapped memory */
diff --git a/src/include/collectives.h b/src/include/collectives.h
index b9100a22a0..0f965276a4 100644
--- a/src/include/collectives.h
+++ b/src/include/collectives.h
@@ -7,108 +7,7 @@
 #ifndef NCCL_COLLECTIVES_H_
 #define NCCL_COLLECTIVES_H_
 
-enum ncclDevRedOp_t {
-  ncclDevSum, ncclDevProd, ncclDevMax, ncclDevMin,
-  ncclDevPreMulSum, ncclDevSumPostDiv,
-  ncclNumDevRedOps
-};
-struct ncclDevRedOpFull {
-  ncclDevRedOp_t op;
-  bool scalarArgIsPtr;
-  uint64_t scalarArg;
-};
-
-#define FUNC_INDEX_P2P 0
-#define FUNC_INDEX(func, devredop, ncclType, al, pr) (1+ncclNumTypes+(((((func)*ncclNumDevRedOps + (devredop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr))
-
-#define NCCL_FUNC_NAME(func, algo, proto, devredop, type) \
-  ncclFunction_##func##_##algo##_##proto##_##devredop##_##type
-
-#define NCCL_ONERANK_REDUCE_NAME(devredop, type) \
-  ncclFunction_OneRankReduce_##devredop##_##type
-
-#define NCCL_KERN_NAME(func, algo, proto, devredop, type) \
-  ncclKernel_##func##_##algo##_##proto##_##devredop##_##type
-
-#define NCCL_IMPL_NAME(func, algo, proto) \
-  nccl##func##algo##proto
-
-/* Declare all collective operations */
-#define DECL5(func, algo, proto, devredop, type) \
-  extern __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \
-  extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
-
-#define SINGLE_ARG(...) __VA_ARGS__
-#define CONCAT(a,b) a##b
-#define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(SINGLE_ARG(t), SINGLE_ARG(f))
-#define MACRO_IF_0(t, f) f
-#define MACRO_IF_1(t, f) t
-
-#define DECL4(func, algo, devredop, type, undef) \
-  MACRO_IF(undef, /*undefined*/, DECL5(func, algo, SIMPLE, devredop, type)) \
-  MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL,     devredop, type)) \
-  MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL128,  devredop, type))
-
-#define DECL3(func, devredop, type, undef) \
-  DECL4(func, RING,           devredop, type, undef) \
-  DECL4(func, TREE,           devredop, type, undef) \
-  DECL4(func, COLLNET_DIRECT, devredop, type, undef) \
-  DECL4(func, COLLNET_CHAIN,  devredop, type, undef) \
-  DECL4(func, NVLS,           devredop, type, undef) \
-  DECL4(func, NVLS_TREE,      devredop, type, undef)
-
-#if defined(__CUDA_BF16_TYPES_EXIST__)
-#define DECL2(func, devredop, undefForFloat) \
-  DECL3(func, devredop, int8_t, /*undef=*/0) \
-  DECL3(func, devredop, uint8_t, /*undef=*/0) \
-  DECL3(func, devredop, int32_t, /*undef=*/0) \
-  DECL3(func, devredop, uint32_t, /*undef=*/0) \
-  DECL3(func, devredop, int64_t, /*undef=*/0) \
-  DECL3(func, devredop, uint64_t, /*undef=*/0) \
-  DECL3(func, devredop, half, /*undef=*/undefForFloat) \
-  DECL3(func, devredop, float, /*undef=*/undefForFloat) \
-  DECL3(func, devredop, double, /*undef=*/undefForFloat) \
-  DECL3(func, devredop, __nv_bfloat16, /*undef=*/undefForFloat)
-#else
-#define DECL2(func, devredop, undefForFloat) \
-  DECL3(func, devredop, int8_t, /*undef=*/0) \
-  DECL3(func, devredop, uint8_t, /*undef=*/0) \
-  DECL3(func, devredop, int32_t, /*undef=*/0) \
-  DECL3(func, devredop, uint32_t, /*undef=*/0) \
-  DECL3(func, devredop, int64_t, /*undef=*/0) \
-  DECL3(func, devredop, uint64_t, /*undef=*/0) \
-  DECL3(func, devredop, half, /*undef=*/undefForFloat) \
-  DECL3(func, devredop, float, /*undef=*/undefForFloat) \
-  DECL3(func, devredop, double, /*undef=*/undefForFloat)
-#endif
-
-#define DECL(func) \
-  DECL2(func, Sum, /*undefForFloat=*/0) \
-  DECL2(func, Prod, /*undefForFloat=*/0) \
-  DECL2(func, Min, /*undefForFloat=*/0) \
-  DECL2(func, Max, /*undefForFloat=*/0) \
-  DECL2(func, PreMulSum, /*undefForFloat=*/0) \
-  DECL2(func, SumPostDiv, /*undefForFloat=*/1)
-
-DECL2(Broadcast, Sum, /*undefForFloat=*/0)
-DECL(Reduce)
-DECL2(AllGather, Sum, /*undefForFloat=*/0)
-DECL(ReduceScatter)
-DECL(AllReduce)
-DECL5(SendRecv, RING, SIMPLE, Sum, int8_t)
-
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t)();
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t)();
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t)();
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint32_t)();
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int64_t)();
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint64_t)();
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, half)();
-#if defined(__CUDA_BF16_TYPES_EXIST__)
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, __nv_bfloat16)();
-#endif
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, float)();
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)();
+#include "nccl.h"
 
 // CHUNKSIZE must be a multiple of SLICESIZE
 #define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
@@ -123,13 +22,27 @@ extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)();
 #define REDUCE_CHUNKSTEPS 1
 #define NCCL_MAX_SLICE_PER_CHUNK 2  // max value for CHUNKSTEPS/SLICESTEPS, must accord with above
 
-// We can't use the enum identifiers like ncclSum, ncclFloat, etc since this
-// macro will be used in preprocessor conditionals where enums have no meaning.
-#define NCCL_NVLS_SUPPORTS(/*ncclDataType_t*/ type, /*ncclDevRedOp_t*/ red) \
-  (((type==2 || type==3) && (red==0 || red==2 || red==3)) || \
-   ((type==4 || type==5) && (red==0 || red==2 || red==3)) || \
-   ((type==6 || type==9) && (red==0 || red==2 || red==3)) || \
-   (type==7 && red==0) || \
-   (type==8 && red==0))
+inline int ncclTypeSize(ncclDataType_t type) { // Per-element size in bytes for `type`, or -1 when the type is not listed below.
+  switch (type) {
+  case ncclInt8:
+  case ncclUint8:
+    return 1;
+  case ncclFloat16:
+  #if defined(__CUDA_BF16_TYPES_EXIST__)
+  case ncclBfloat16:
+  #endif
+    return 2;
+  case ncclInt32:
+  case ncclUint32:
+  case ncclFloat32:
+    return 4;
+  case ncclInt64:
+  case ncclUint64:
+  case ncclFloat64:
+    return 8;
+  default:
+    return -1; // callers must treat a negative size as "unknown type"
+  }
+}
 
 #endif
diff --git a/src/include/comm.h b/src/include/comm.h
index 8986f9349c..bc5a9c5683 100644
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -10,8 +10,10 @@
 #include "transport.h"
 #include "p2p.h"
 #include "collectives.h"
+#include "nccl_tuner.h"
 #include "proxy.h"
 #include "strongstream.h"
+#include "nccl_net.h"
 
 #if CUDART_VERSION < 9000
 struct cudaLaunchParams {
@@ -125,7 +127,7 @@ struct ncclChannel {
   struct ncclChannelPeer** peers;
   struct ncclDevChannelPeer** devPeers;
   /* devPeer pointer array used for host side access */
-  struct ncclDevChannelPeer** devPeersHostPtr; 
+  struct ncclDevChannelPeer** devPeersHostPtr;
   struct ncclRing ring;
   int* devRingUserRanks;
   struct ncclTree tree;
@@ -155,6 +157,14 @@ struct ncclPointerList {
   void *ptr;
 };
 
+struct ncclNvlsMcHandleList { // Node type of ncclKernelPlan::nvlsMcHandleQueue.
+  struct ncclNvlsMcHandleList *next; // intrusive link used by that queue
+  CUmemGenericAllocationHandle mcHandle; // NOTE(review): presumably the NVLS multicast allocation handle — confirm
+  CUdeviceptr ptr;
+  int dev;
+  size_t size;
+};
+
 struct ncclKernelPlan {
   // A kernel plan is also a callback that reclaims itself. Hence this must
   // be the first member.
@@ -178,6 +188,7 @@ struct ncclKernelPlan {
   int collOpCount; // zero based for this plan
 
   struct ncclIntruQueue<struct ncclPointerList, &ncclPointerList::next> ipcMemQueue;
+  struct ncclIntruQueue<struct ncclNvlsMcHandleList, &ncclNvlsMcHandleList::next> nvlsMcHandleQueue;
 
   struct Channel {
     int nWork;
@@ -191,6 +202,23 @@ struct ncclKernelPlan {
   } channels[MAXCHANNELS];
 };
 
+struct ncclRegRequest { // Node type of ncclComm::regRequestQueue.
+  uintptr_t buff; // buffer address stored as an integer
+  size_t size;
+  struct ncclRegRequest *next; // intrusive link used by that queue
+};
+
+struct ncclRegRecord { // Node type of ncclComm::regRecordQueue.
+  uintptr_t buff; // buffer address stored as an integer
+  size_t size;
+  CUdeviceptr regAddr; // NOTE(review): presumably the address actually registered, which may differ from buff — confirm
+  size_t regSize;
+  int dev;
+  CUmemGenericAllocationHandle mcHandle;
+  uintptr_t *addrs; /* used to check whether NVLS buffers match among intra-node ranks */
+  struct ncclRegRecord *next; // intrusive link used by that queue
+};
+
 struct ncclComm {
   struct ncclMemoryStack memPermanent, memScoped;
   // List of destructors to run when comm is destructed
@@ -261,6 +289,7 @@ struct ncclComm {
   ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
   float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
   float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float ringbdw[NCCL_NUM_FUNCTIONS][NCCL_NUM_PROTOCOLS];
   int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
 
   /* This attribute can indicate the states of communicators and return code of
@@ -270,7 +299,7 @@ struct ncclComm {
   // Flag to ask NCCL kernels to abort
   volatile uint32_t *abortFlag;
   volatile uint32_t *childAbortFlag;
-  uint32_t *abortFlagRefCount;
+  volatile uint32_t *abortFlagRefCount;
 
   // Device side of the communicator (for cudaFree's)
   struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm
@@ -310,15 +339,19 @@ struct ncclComm {
 
   // NVLink SHARP (NVLS) support
   int nvlsSupport;
+  int nvlsRegSupport;
   /* sharable NVLS resource. */
   struct ncclNvlsSharedRes* nvlsResources;
+  struct ncclShmemCollBuff nvlsShmem;
+  void *nvlsShmemHandle;
 
-  size_t channelSize; // User requested work size (bytes) for channel partitions
+  ssize_t channelSize; // User requested work size (bytes) for channel partitions
 
   // pools backed by comm->memPermanent
   struct ncclMemoryPool memPool_ncclProxyOp;
   struct ncclMemoryPool memPool_ncclKernelPlan;
   struct ncclMemoryPool memPool_ncclPointerList;
+  struct ncclMemoryPool memPool_ncclNvlsHandleList;
   // Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when
   // this comm is not yet in a group.
   struct ncclComm* groupNext;
@@ -346,6 +379,16 @@ struct ncclComm {
   bool finalizeCalled;
   // shared structures for finalization
   int finalizeRankCnt;
+  // group job to support multi-thread FT
+  struct ncclGroupJob *groupJob;
+
+  /* queue of buffer registration requests */
+  struct ncclIntruQueue<struct ncclRegRequest, &ncclRegRequest::next> regRequestQueue;
+  /* queue of registered buffer records */
+  struct ncclIntruQueue<struct ncclRegRecord, &ncclRegRecord::next> regRecordQueue;
+
+  // Tuning plugin
+  ncclTuner_t* tuner;
 };
 
 enum ncclLaunchMode {
diff --git a/src/include/core.h b/src/include/core.h
index ac6ea77f2d..a1754beeb1 100644
--- a/src/include/core.h
+++ b/src/include/core.h
@@ -30,29 +30,6 @@
     ret func(args)
 #endif // end PROFAPI
 
-static __inline__ int ncclTypeSize(ncclDataType_t type) {
-  switch (type) {
-    case ncclInt8:
-    case ncclUint8:
-      return 1;
-    case ncclFloat16:
-#if defined(__CUDA_BF16_TYPES_EXIST__)
-    case ncclBfloat16:
-#endif
-      return 2;
-    case ncclInt32:
-    case ncclUint32:
-    case ncclFloat32:
-      return 4;
-    case ncclInt64:
-    case ncclUint64:
-    case ncclFloat64:
-      return 8;
-    default:
-      return -1;
-  }
-}
-
 #include "debug.h"
 #include "checks.h"
 #include "cudawrap.h"
diff --git a/src/include/cudawrap.h b/src/include/cudawrap.h
index da9ce45a4f..cc363c1ac7 100644
--- a/src/include/cudawrap.h
+++ b/src/include/cudawrap.h
@@ -30,7 +30,7 @@ typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void
     if( err != CUDA_SUCCESS ) {				      \
       const char *errStr;				      \
       (void) pfn_cuGetErrorString(err, &errStr);	      \
-      WARN("Cuda failure '%s'", errStr);		      \
+      WARN("Cuda failure %d '%s'", err, errStr);	      \
       return ncclUnhandledCudaError;			      \
     }							      \
 } while(false)
@@ -40,7 +40,7 @@ typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void
     if( err != CUDA_SUCCESS ) {				      \
       const char *errStr;				      \
       (void) pfn_cuGetErrorString(err, &errStr);	      \
-      WARN("Cuda failure '%s'", errStr);		      \
+      WARN("Cuda failure %d '%s'", err, errStr);	      \
       res = ncclUnhandledCudaError;			      \
       goto label;					      \
     }							      \
@@ -52,7 +52,7 @@ typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void
     if( err != CUDA_SUCCESS ) {						\
       const char *errStr;						\
       (void) pfn_cuGetErrorString(err, &errStr);			\
-      INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, errStr);	\
+      INFO(NCCL_ALL,"%s:%d Cuda failure %d '%s'", __FILE__, __LINE__, err, errStr); \
     }									\
 } while(false)
 
@@ -79,6 +79,7 @@ DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000);
 DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent, 4000);
 DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000);
 DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice, 2000);
+DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute, 4000);
 // cuMem API support
 DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve, 10020);
 DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree, 10020);
diff --git a/src/include/debug.h b/src/include/debug.h
index cd6e53b92b..eb5189058f 100644
--- a/src/include/debug.h
+++ b/src/include/debug.h
@@ -4,10 +4,11 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#ifndef NCCL_DEBUG_H_
-#define NCCL_DEBUG_H_
+#ifndef NCCL_INT_DEBUG_H_
+#define NCCL_INT_DEBUG_H_
 
-#include "nccl_net.h"
+#include "nccl.h"
+#include "nccl_common.h"
 #include <stdio.h>
 #include <chrono>
 #include <type_traits>
diff --git a/src/include/devcomm.h b/src/include/device.h
similarity index 78%
rename from src/include/devcomm.h
rename to src/include/device.h
index d4762b8b04..56f8039f30 100644
--- a/src/include/devcomm.h
+++ b/src/include/device.h
@@ -8,31 +8,33 @@
 #define NCCL_DEVICE_H_
 
 #include "nccl.h"
+#include "nccl_common.h"
 #include "align.h"
 #include <stdint.h>
 
-#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
-typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t;
 extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];
 
-#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
-#define NCCL_ALGO_TREE 0
-#define NCCL_ALGO_RING 1
-#define NCCL_ALGO_COLLNET_DIRECT 2
-#define NCCL_ALGO_COLLNET_CHAIN 3
-#define NCCL_ALGO_NVLS 4
-#define NCCL_ALGO_NVLS_TREE 5
 extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];
 
-#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
-#define NCCL_PROTO_LL 0
-#define NCCL_PROTO_LL128 1
-#define NCCL_PROTO_SIMPLE 2
 extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS];
 
 #define NCCL_MAX_OPS 2048
 #define NCCL_STEPS 8
 
+#include "net_device.h"
+
+enum ncclDevRedOp_t { // Device-side reduction operators.
+  ncclDevSum, ncclDevProd, ncclDevMinMax, // min and max share the single ncclDevMinMax entry
+  ncclDevPreMulSum, ncclDevSumPostDiv,
+  ncclNumDevRedOps // number of operators above; used as a dimension in ncclDevFuncId()
+};
+struct ncclDevRedOpFull { // A device reduction operator together with its scalar argument.
+  ncclDevRedOp_t op;
+  ncclRedOp_t proxyOp; // NOTE(review): presumably the public-API operator handed to proxies — confirm
+  bool scalarArgIsPtr; // NOTE(review): presumably true when scalarArg holds an address rather than the value — confirm
+  uint64_t scalarArg;
+};
+
 union ncclLLFifoLine {
   /* Flags have to be *after* data, because otherwise, an incomplete receive
      from the network may receive the flag but not the data.
@@ -85,6 +87,7 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
 struct ncclConnInfo {
   // Regular comm mechanism
   char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
+  void* mhandles[NCCL_NUM_PROTOCOLS];
   uint64_t *tail;     // Local for recv, remote for send
   uint64_t *head;     // Local for send, remote for recv
 
@@ -98,6 +101,7 @@ struct ncclConnInfo {
 
   uint64_t step;      // Keep where we are
   uint64_t llLastCleaning;
+  ncclNetDeviceHandle_t netDeviceHandle;
 };
 
 struct ncclProxyConnector {
@@ -105,6 +109,7 @@ struct ncclProxyConnector {
   int tpLocalRank;
   int sameProcess;
   struct ncclProxyConnection* connection;
+  ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*); // Copied from transport if necessary
 };
 
 struct ncclConnector {
@@ -292,6 +297,7 @@ struct ncclDevComm {
   int rank;
   int nRanks;
   int buffSizes[NCCL_NUM_PROTOCOLS];
+  int p2pChunkSize;
 
   // Operation list for aggregation
   int workFifoDepth;
@@ -370,4 +376,88 @@ __host__ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_
   return cudaArch < 700 ? 0 : ncclShmemScratchWarpSize(cudaArch)*(NCCL_MAX_NTHREADS/WARP_SIZE);
 }
 
+// Host-side table of kernel function pointers.
+extern int const ncclDevKernelCount;
+extern void* const ncclDevKernelList[/*ncclDevKernelCount*/];
+
+// Table of most specialized kernel function to run given func index.
+extern int const ncclDevFuncRowToId[];
+extern void* const ncclDevKernelForFunc[/*funcIndex*/];
+extern bool const ncclDevKernelForFuncIsSpecialized[/*funcIndex*/];
+
+// Launch a one-rank reduction on stream.
+ncclResult_t ncclLaunchOneRank(void* dst, void const* src, size_t nElts, struct ncclDevRedOpFull redOp, ncclDataType_t type, cudaStream_t stream);
+
+// `ncclNvlsSupported()` needs to be in sync with "func_valid" in "src/device/generate.py"
+inline bool ncclNvlsSupported(int devRedOp, int type) { // True when the (reduction op, data type) pair is accepted for NVLS.
+  switch (type) {
+  case ncclInt32:
+  case ncclUint32:
+  case ncclInt64:
+  case ncclUint64:
+  case ncclFloat16:
+  #if defined(__CUDA_BF16_TYPES_EXIST__)
+  case ncclBfloat16:
+  #endif
+    return devRedOp == ncclDevSum || devRedOp == ncclDevMinMax; // 32/64-bit integers and 16-bit floats: sum or min/max
+  case ncclFloat:
+  case ncclDouble:
+    return devRedOp == ncclDevSum; // float and double: sum only
+  default:
+    return false; // every other type is rejected for all ops
+  }
+}
+
+// `ncclDevFuncId()` needs to be in sync with "all_functions()" in "src/device/generate.py"
+inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto) { // Computes a row index from the arguments and returns ncclDevFuncRowToId[row].
+  #if defined(__CUDA_BF16_TYPES_EXIST__)
+  constexpr int NumTypes = ncclNumTypes;
+  #else
+  constexpr int NumTypes = ncclNumTypes + 1; // NOTE(review): presumably keeps the row stride equal to the bf16-enabled layout — confirm against generate.py
+  #endif
+
+  int row = 0; // ncclDevFuncIndex_P2p
+  if (coll == ncclFuncSendRecv) goto have_row;
+  row += 1;
+
+  if (coll == ncclFuncAllGather) { // AllGather rows: 2 algos x protocols, no op/type dimension
+    int algo1 = algo == NCCL_ALGO_RING ? 0 :
+              /*algo == NCCL_ALGO_NVLS*/ 1;
+    row += algo1*NCCL_NUM_PROTOCOLS + proto; // any algo other than RING maps to slot 1
+    goto have_row;
+  }
+  row += (/*NumAlgos=*/2)*NCCL_NUM_PROTOCOLS; // skip past the AllGather rows
+
+  if (coll == ncclFuncBroadcast) { // Broadcast rows: one per protocol; algo is ignored
+    row += proto;
+    goto have_row;
+  }
+  row += (/*NumAlgos=*/1)*NCCL_NUM_PROTOCOLS; // skip past the Broadcast rows
+
+  if (coll == ncclFuncAllReduce) { // AllReduce rows: indexed by (devRedOp, type, algo, proto) over all algorithms
+    row += ((devRedOp*NumTypes + type)*NCCL_NUM_ALGORITHMS + algo)*NCCL_NUM_PROTOCOLS + proto;
+    goto have_row;
+  }
+  row += ncclNumDevRedOps*NumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS; // skip past the AllReduce rows
+
+  if (coll == ncclFuncReduce) { // Reduce rows: indexed by (devRedOp, type, proto); algo is ignored
+    row += (devRedOp*NumTypes + type)*NCCL_NUM_PROTOCOLS + proto;
+    goto have_row;
+  }
+  row += ncclNumDevRedOps*NumTypes*(/*NumAlgos=*/1)*NCCL_NUM_PROTOCOLS; // skip past the Reduce rows
+
+  if (coll == ncclFuncReduceScatter) { // ReduceScatter rows: indexed by (devRedOp, type, 2 algos, proto)
+    int algo1 = algo == NCCL_ALGO_RING ? 0 :
+              /*algo == NCCL_ALGO_NVLS*/ 1;
+    row += ((devRedOp*NumTypes + type)*2 + algo1)*NCCL_NUM_PROTOCOLS + proto; // any algo other than RING maps to slot 1
+    goto have_row;
+  }
+  row += ncclNumDevRedOps*NumTypes*(/*NumAlgos=*/2)*NCCL_NUM_PROTOCOLS; // any other coll reaches have_row with row one past the ReduceScatter rows
+
+have_row:
+  return ncclDevFuncRowToId[row];
+}
+
+inline int ncclDevFuncId_P2p() { return ncclDevFuncRowToId[0]; } // row 0 is the row ncclDevFuncId() uses for ncclFuncSendRecv
+
 #endif
diff --git a/src/include/graph.h b/src/include/graph.h
index ae524397da..fdd634894d 100644
--- a/src/include/graph.h
+++ b/src/include/graph.h
@@ -8,7 +8,7 @@
 #define NCCL_GRAPH_H_
 
 #include "nccl.h"
-#include "devcomm.h"
+#include "device.h"
 #include <limits.h>
 #include <stdlib.h>
 #include <ctype.h>
@@ -38,7 +38,6 @@ ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int
 ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net);
 int ncclPxnDisable(struct ncclComm* comm);
 ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
-ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank);
 
 // Find CPU affinity
 ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity);
@@ -112,6 +111,6 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
 
 ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs);
 #include "info.h"
-ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time);
+ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time, bool* backup = NULL);
 
 #endif
diff --git a/src/include/group.h b/src/include/group.h
index 9b5ea9c475..72251147f5 100644
--- a/src/include/group.h
+++ b/src/include/group.h
@@ -14,7 +14,8 @@ ncclResult_t ncclGroupErrCheck(ncclResult_t ret);
 void ncclGroupCommJoin(struct ncclComm* comm);
 void ncclGroupCommPreconnect(struct ncclComm* comm);
 ncclResult_t ncclGroupCommLeave(struct ncclComm* comm);
-void ncclGroupJobAbort();
+ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob);
+ncclResult_t ncclGroupJobComplete(struct ncclGroupJob *groupJob);
 
 typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);
 
@@ -52,8 +53,9 @@ struct ncclGroupJob {
   struct ncclComm **groupCommPreconnectHeadPtr;
   ncclResult_t *groupErrorPtr;
   volatile bool *abortFlagPtr;
+  int *groupBlockingPtr;
   struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsPtr;
-  bool doneFlag;
+  bool initialized;
 };
 
 ncclResult_t ncclGroupStartInternal();
@@ -87,14 +89,6 @@ static inline ncclResult_t groupJobComplete(struct ncclGroupJob* job) {
 }
 
 inline ncclResult_t ncclGroupStartInternal() {
-  /* if previous group launch does not complete, don't launch this one. */
-  if (ncclGroupJobMainPtr != NULL) {
-    if (__atomic_load_n(&ncclGroupJobMainPtr->doneFlag, __ATOMIC_ACQUIRE) == false) {
-      return ncclInvalidUsage;
-    } else {
-      NCCLCHECK(groupJobComplete(ncclGroupJobMainPtr));
-    }
-  }
   ncclGroupDepth++;
   return ncclSuccess;
 }
diff --git a/src/include/ibvcore.h b/src/include/ibvcore.h
index 00a6b6f60b..8d8ecf1ec8 100644
--- a/src/include/ibvcore.h
+++ b/src/include/ibvcore.h
@@ -1040,4 +1040,19 @@ static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struc
   return qp->context->ops.post_send(qp, wr, bad_wr);
 }
 
+struct ibv_ece {
+	/*
+	 * Unique identifier of the provider vendor on the network.
+	 * The providers will set IEEE OUI here to distinguish
+	 * themselves in a non-homogeneous network.
+	 */
+	uint32_t vendor_id;
+	/*
+	 * Provider specific attributes which are supported or
+	 * needed to be enabled by ECE users.
+	 */
+	uint32_t options;
+	uint32_t comp_mask;
+};
+
 #endif  // NCCL_IBV_CORE_H_
diff --git a/src/include/ibvsymbols.h b/src/include/ibvsymbols.h
index 7cf1e08d8c..906b0df747 100644
--- a/src/include/ibvsymbols.h
+++ b/src/include/ibvsymbols.h
@@ -36,6 +36,8 @@ struct ncclIbvSymbols {
   int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
   int (*ibv_internal_destroy_qp)(struct ibv_qp *qp);
   const char * (*ibv_internal_event_type_str)(enum ibv_event_type event);
+  int (*ibv_internal_query_ece)(struct ibv_qp *qp, struct ibv_ece *ece);
+  int (*ibv_internal_set_ece)(struct ibv_qp *qp, struct ibv_ece *ece);
 };
 
 /* Constructs IB verbs symbols per rdma-core linking or dynamic loading mode */
diff --git a/src/include/ibvwrap.h b/src/include/ibvwrap.h
index d1c7d08e71..c3709584c3 100644
--- a/src/include/ibvwrap.h
+++ b/src/include/ibvwrap.h
@@ -66,6 +66,8 @@ static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries,
 ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
 ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
 ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp);
+ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported);
+ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported);
 
 static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) {
   int ret = qp->context->ops.post_send(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
diff --git a/src/include/info.h b/src/include/info.h
index 5802f3e58d..f65ed2e698 100644
--- a/src/include/info.h
+++ b/src/include/info.h
@@ -8,7 +8,7 @@
 #define NCCL_INFO_H_
 
 #include "nccl.h"
-#include "devcomm.h"
+#include "device.h"
 #include "collectives.h"
 #include "core.h"
 #include "utils.h"
@@ -54,6 +54,8 @@ struct ncclInfo {
   int nChannels;
   int nThreads;
   size_t nBytes;
+  size_t sendbuffSize;
+  size_t recvbuffSize;
   int nstepsPerLoop;
   int nchunksPerLoop;
   int chunkSize;
@@ -67,6 +69,17 @@ inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) {
     info->datatype = ncclInt8;
   }
   if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= nRanks; // count is per rank
+
+  /* compute buffer size for NVLS buffer registration */
+  if (info->coll == ncclFuncAllGather) {
+    info->sendbuffSize = info->count * ncclTypeSize(info->datatype);
+    info->recvbuffSize = info->sendbuffSize * nRanks;
+  } else if (info->coll == ncclFuncReduceScatter) {
+    info->recvbuffSize = info->count * ncclTypeSize(info->datatype);
+    info->sendbuffSize = info->recvbuffSize * nRanks;
+  } else {
+    info->sendbuffSize = info->recvbuffSize = info->count * ncclTypeSize(info->datatype);
+  }
   return ncclSuccess;
 }
 
diff --git a/src/include/ipcsocket.h b/src/include/ipcsocket.h
index 700f0bcdeb..ccecde84c7 100644
--- a/src/include/ipcsocket.h
+++ b/src/include/ipcsocket.h
@@ -30,6 +30,7 @@ struct ncclIpcSocket {
 
 ncclResult_t ncclIpcSocketInit(struct ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag);
 ncclResult_t ncclIpcSocketClose(struct ncclIpcSocket *handle);
+ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd);
 
 ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd);
 ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash);
diff --git a/src/include/nccl_common.h b/src/include/nccl_common.h
new file mode 100644
index 0000000000..a37ac203ea
--- /dev/null
+++ b/src/include/nccl_common.h
@@ -0,0 +1,33 @@
+/*************************************************************************
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_DEBUG_H_
+#define NCCL_DEBUG_H_
+
+typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
+typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;
+
+typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
+
+#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
+typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t;
+
+#define NCCL_NUM_ALGORITHMS 6 // Tree, Ring, CollNet*, NVLS*
+#define NCCL_ALGO_UNDEF -1
+#define NCCL_ALGO_TREE 0
+#define NCCL_ALGO_RING 1
+#define NCCL_ALGO_COLLNET_DIRECT 2
+#define NCCL_ALGO_COLLNET_CHAIN 3
+#define NCCL_ALGO_NVLS 4
+#define NCCL_ALGO_NVLS_TREE 5
+
+#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
+#define NCCL_PROTO_UNDEF -1
+#define NCCL_PROTO_LL 0
+#define NCCL_PROTO_LL128 1
+#define NCCL_PROTO_SIMPLE 2
+
+#endif
diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h
index a387e66d7a..9b3e6719fc 100644
--- a/src/include/nccl_net.h
+++ b/src/include/nccl_net.h
@@ -8,6 +8,8 @@
 #define NCCL_NET_H_
 
 #include "nccl.h"
+#include "nccl_common.h"
+#include "net_device.h"
 #include <stdint.h>
 
 #define NCCL_NET_HANDLE_MAXSIZE 128
@@ -17,13 +19,89 @@
 #define NCCL_PTR_DMABUF 0x4
 
 // Maximum number of requests per comm object
-#define NCCL_NET_MAX_REQUESTS 8
+#define NCCL_NET_MAX_REQUESTS 32
 
-typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
-typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;
+typedef struct {
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+} ncclNetProperties_v7_t;
 
-typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
+typedef ncclNetProperties_v7_t ncclNetProperties_t;
 
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has been completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+} ncclNet_v7_t;
+
+typedef ncclNet_v7_t ncclNet_t;
+
+#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v7
+
+#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v7
+
+#define NCCL_NET_MAX_REQUESTS_V6 8
+
+// v6 struct for backwards compatibility
 typedef struct {
   char* name;     // Used mostly for logging.
   char* pciPath;  // Path to the PCI device in /sys.
@@ -35,9 +113,7 @@ typedef struct {
   float latency;  // Network latency
   int maxComms;   // Maximum number of comms we can create
   int maxRecvs;   // Maximum number of grouped receives.
-}ncclNetProperties_v6_t;
-
-typedef ncclNetProperties_v6_t ncclNetProperties_t;
+} ncclNetProperties_v6_t;
 
 typedef struct {
   // Name of the network (mainly for logs)
@@ -86,10 +162,49 @@ typedef struct {
   ncclResult_t (*closeListen)(void* listenComm);
 } ncclNet_v6_t;
 
-typedef ncclNet_v6_t ncclNet_t;
+typedef struct {
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. handles have been created
+  // using listen() above. rank indicates caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Returns whether a reduction operation on a data type is supported.
+  // 1 for supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Performs an asynchronous allreduce operation on the collective group.
+  // May return request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free collective comm objects
+  ncclResult_t (*closeColl)(void* collComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclCollNet_v7_t;
 
-#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v6
+typedef ncclCollNet_v7_t ncclCollNet_t;
 
+// v6 struct for backwards compatibility
 typedef struct {
   // Name of the collective network (mainly for logs)
   const char* name;
@@ -130,10 +245,6 @@ typedef struct {
   ncclResult_t (*closeListen)(void* listenComm);
 } ncclCollNet_v6_t;
 
-typedef ncclCollNet_v6_t ncclCollNet_t;
-
-#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v6
-
 // v5 struct for backwards compatibility
 typedef struct {
   // Name of the network (mainly for logs)
@@ -219,95 +330,4 @@ typedef struct {
   ncclResult_t (*closeListen)(void* listenComm);
 } ncclCollNet_v5_t;
 
-// v4 struct for backwards compatibility
-typedef struct {
-  char* name;     // Used mostly for logging.
-  char* pciPath;  // Path to the PCI device in /sys.
-  uint64_t guid;  // Unique identifier for the NIC chip. Important for
-                  // cards with multiple PCI functions (Physical or virtual).
-  int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA
-  int speed;      // Port speed in Mbps.
-  int port;       // Port number.
-  int maxComms;   // Maximum number of comms we can create
-} ncclNetProperties_v4_t;
-
-// v4 struct for backwards compatibility
-typedef struct {
-  // Name of the network (mainly for logs)
-  const char* name;
-  // Initialize the network.
-  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
-  // Return the number of adapters.
-  ncclResult_t (*devices)(int* ndev);
-  // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
-  // Create a receiving object and provide a handle to connect to it. The
-  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
-  // between ranks to create a connection.
-  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
-  // Connect to a handle and return a sending comm object for that peer.
-  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
-  // Finalize connection establishment after remote peer has called connectHandle
-  ncclResult_t (*accept)(void* listenComm, void** recvComm);
-  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
-  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
-  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
-  ncclResult_t (*deregMr)(void* comm, void* mhandle);
-  // Asynchronous send to a peer.
-  // May return request == NULL if the call cannot be performed (or would block)
-  ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
-  // Asynchronous recv from a peer.
-  // May return request == NULL if the call cannot be performed (or would block)
-  ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
-  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
-  // visible to the GPU
-  ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request);
-  // Test whether a request is complete. If size is not NULL, it returns the
-  // number of bytes sent/received.
-  ncclResult_t (*test)(void* request, int* done, int* size);
-  // Close and free send/recv comm objects
-  ncclResult_t (*closeSend)(void* sendComm);
-  ncclResult_t (*closeRecv)(void* recvComm);
-  ncclResult_t (*closeListen)(void* listenComm);
-} ncclNet_v4_t;
-
-// v4 struct for backwards compatibility
-typedef struct {
-  // Name of the collective network (mainly for logs)
-  const char* name;
-  // Initialize the collective network.
-  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
-  // Return the number of adapters capable of doing collective operations.
-  // If ndev returns 0, all other functions might be set to NULL.
-  ncclResult_t (*devices)(int* ndev);
-  // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
-  // Create a receiving object and provide a handle to connect to it. The
-  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
-  // between ranks to create connections.
-  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
-  // Create a group for collective operations. handles have been created
-  // using listen() above. rank indicates caller's rank in the collective network.
-  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
-  // Returns whether a reduction operation on a data type is supported.
-  // 1 for supported, 0 otherwise.
-  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
-  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
-  ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
-  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
-  // Performs an asynchronous allreduce operation on the collective group.
-  // May return request == NULL if the call cannot be performed (or would block).
-  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
-      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
-  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
-  // visible to the GPU
-  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
-  // Test whether a request is complete. If size is not NULL, it returns the
-  // number of bytes sent/received.
-  ncclResult_t (*test)(void* request, int* done, int* size);
-  // Close and free collective comm objects
-  ncclResult_t (*closeColl)(void* collComm);
-  ncclResult_t (*closeListen)(void* listenComm);
-} ncclCollNet_v4_t;
-
 #endif // end include guard
diff --git a/src/include/nccl_tuner.h b/src/include/nccl_tuner.h
new file mode 100644
index 0000000000..b4a696e385
--- /dev/null
+++ b/src/include/nccl_tuner.h
@@ -0,0 +1,55 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_TUNER_H_
+#define NCCL_TUNER_H_
+
+#include "nccl.h"
+#include "nccl_common.h"
+
+// API to be implemented by external tuner
+typedef struct {
+  // Name of the tuner
+  const char* name;
+
+  // Initializes tuner states.
+  // nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
+  // nNodes: number of nodes in current communicator.
+  // logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction);
+
+  // Gets info (algo, protocol, number of channels) for a given collective.
+  // Inputs:
+  //   - collType: collective type, e.g., allreduce, allgather...
+  //   - nBytes: collective size in bytes
+  //   - collNetSupport: whether collnet supports this type
+  //   - nvlsSupport: whether nvlink sharp supports this type
+  //   - numPipeOps: number of operations in the group
+  //
+  // Outputs:
+  //   - algorithm: selected algorithm to be used for the given collective
+  //   - protocol: selected protocol to be used for the given collective
+  //   - nChannels: number of channels (hence SMs) to be used.
+  //
+  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+  // default tuning for the given collective.
+  // Also, the plugin is allowed to not set any output, or set only the
+  // algorithm and protocol, but not only the algorithm or only the protocol.
+  // Unset fields will be set automatically by NCCL.
+  ncclResult_t (*getCollInfo)(ncclFunc_t collType, size_t nBytes,
+                              int collNetSupport, int nvlsSupport, int numPipeOps,
+                              int *algorithm, int *protocol, int* nChannels);
+
+  // Terminates the plugin and cleans up any resources that the plugin allocated.
+  ncclResult_t (*destroy)();
+} ncclTuner_v1_t;
+
+typedef ncclTuner_v1_t ncclTuner_t;
+
+#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v1"
+
+#endif
diff --git a/src/include/net_device.h b/src/include/net_device.h
new file mode 100644
index 0000000000..8f7c0d6e1e
--- /dev/null
+++ b/src/include/net_device.h
@@ -0,0 +1,29 @@
+/*************************************************************************
+ * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_NET_DEVICE_H_
+#define NCCL_NET_DEVICE_H_
+
+#define NCCL_NET_DEVICE_INVALID_VERSION      0x0
+#define NCCL_NET_MTU_SIZE                    4096
+
+// Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin
+// version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
+#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7  
+
+typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType;
+
+typedef struct {
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+  void* handle;
+  size_t size;
+  int needsProxyProgress;
+} ncclNetDeviceHandle_v7_t;
+
+typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t;
+
+#endif
diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h
index fa1f5cf835..2ab8e3a2b0 100644
--- a/src/include/nvmlwrap.h
+++ b/src/include/nvmlwrap.h
@@ -160,7 +160,12 @@ typedef union nvmlValue_st
 #define NVML_FI_DEV_NVLINK_GET_SPEED                  164
 #define NVML_FI_DEV_NVLINK_GET_STATE                  165
 #define NVML_FI_DEV_NVLINK_GET_VERSION                166
-#define NVML_FI_MAX 167 //!< One greater than the largest field ID defined above
+
+#define NVML_FI_DEV_C2C_LINK_COUNT                    170 //!< Number of C2C Links present on the device
+#define NVML_FI_DEV_C2C_LINK_GET_STATUS               171 //!< C2C Link Status 0=INACTIVE 1=ACTIVE
+#define NVML_FI_DEV_C2C_LINK_GET_MAX_BW               172 //!< C2C Link Speed in MBps for active links
+
+#define NVML_FI_MAX 173 //!< One greater than the largest field ID defined above
 
 /**
  * Information for a Field Value Sample
diff --git a/src/include/p2p.h b/src/include/p2p.h
index 426a15017a..6ffba4b0e1 100644
--- a/src/include/p2p.h
+++ b/src/include/p2p.h
@@ -12,7 +12,7 @@
 #define NCCL_P2P_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
 
 typedef struct {
-  int data; // Currently only support an fd based descriptor
+  uint64_t data; // Needs to hold a CUmemGenericAllocationHandle for UDS fd support
 } ncclCuDesc;
 
 typedef union {
diff --git a/src/include/param.h b/src/include/param.h
index c95b67c36b..963da9d175 100644
--- a/src/include/param.h
+++ b/src/include/param.h
@@ -12,6 +12,7 @@
 const char* userHomeDir();
 void setEnvFile(const char* fileName);
 void initEnv();
+const char *ncclGetEnv(const char *name);
 
 void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache);
 
diff --git a/src/include/proxy.h b/src/include/proxy.h
index ed6c59eebc..daf3885829 100644
--- a/src/include/proxy.h
+++ b/src/include/proxy.h
@@ -7,10 +7,11 @@
 #ifndef NCCL_PROXY_H_
 #define NCCL_PROXY_H_
 
-#include "devcomm.h"
+#include "device.h"
 #include "info.h"
 #include "socket.h"
 #include "ipcsocket.h"
+#include "nccl_net.h"
 #include <pthread.h>
 #include "shm.h"
 #include "p2p.h"
@@ -65,6 +66,8 @@ struct ncclProxySubArgs {
   uint64_t end;
   void* requests[NCCL_STEPS];
   void* profilingEvents[NCCL_STEPS];
+  void* recvRequestsCache[NCCL_STEPS];
+  int recvRequestsSubCount;
 };
 
 struct ncclProxyArgs {
@@ -146,7 +149,7 @@ struct ncclProxyProgressState {
   char opsPoolShmSuffix[6];
 
   pthread_t thread;
-  bool stop;
+  volatile int stop;
   struct ncclProxyPeer** localPeers;
   struct ncclSharedNetComms* netComms[NCCL_MAX_NETDEVS];
   struct ncclProxyArgs* active;
@@ -157,11 +160,12 @@ struct ncclProxyProgressState {
 
 // Expected proxy response fifo
 struct ncclExpectedProxyResponse {
-  void*    opId;
-  int      respSize;
-  bool     done;
-  void*    respBuff;
-  struct   ncclExpectedProxyResponse* next;
+  void*                             opId;
+  int                               respSize;
+  bool                              done;
+  void*                             respBuff;
+  ncclResult_t                      res;
+  struct ncclExpectedProxyResponse* next;
 };
 
 struct ncclProxyAsyncOp {
@@ -181,7 +185,16 @@ struct ncclProxyLocalPeer {
   int asyncOpCounter;
 };
 
+// Common response header for all proxyOps
+// We pack this into a struct to reduce the number of blocking send and recv calls
+struct ncclProxyRpcResponseHeader {
+  void* opId;
+  ncclResult_t res;
+  int respSize;
+};
+
 struct ncclProxyState {
+  int internalRefCount;
   int refCount;
   int tpRank;
   int tpnRanks;
@@ -196,11 +209,13 @@ struct ncclProxyState {
   ncclNet_t* ncclNet;
   ncclCollNet_t* ncclCollNet;
   volatile uint32_t* abortFlag;
+  volatile uint32_t* abortFlagRefCount;
   // Service thread
   pthread_t thread;
   struct ncclSocket* listenSock;
-  int stop;
+  volatile int stop;
   CUcontext cudaCtx;
+  ncclResult_t asyncResult;
 
   // Used by main thread
   union ncclSocketAddress* peerAddresses;
@@ -233,8 +248,11 @@ struct ncclProxyConnection {
   struct ncclProxyArgs *proxyAppend;
   struct ncclProxyArgs **proxyAppendPtr;
   void* transportResources;
+  ncclNetDeviceHandle_t* netDeviceHandle;
+  void* mhandles[NCCL_NUM_PROTOCOLS];
   proxyConnectState state;
   struct ncclCollNetSharedRes* collNet;
+  int needsProxyProgress;
 };
 
 typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
@@ -260,7 +278,7 @@ enum ncclProxyMsgType {
   ncclProxyMsgClose = 6,
   ncclProxyMsgAbort = 7,
   ncclProxyMsgStop = 8,
-  ncclProxyMsgConvertFd = 9, // cuMem API support (UDS)
+  ncclProxyMsgGetFd = 9, // cuMem API support (UDS)
 };
 
 // This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types
@@ -272,9 +290,10 @@ ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector
 ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
 ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId);
 
-ncclResult_t ncclProxyClientConvertFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int fd, int* convertedFd);
+ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void *handle, int* convertedFd);
 
 ncclResult_t ncclProxyStop(struct ncclComm* comm);
 ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm);
-ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
+ncclResult_t ncclProxyDestroy(struct ncclProxyState *proxyState);
+ncclResult_t ncclProxyTryDetach(struct ncclProxyState *proxyState);
 #endif
diff --git a/src/include/shm.h b/src/include/shm.h
index 61b0b4d8f5..e75caa6a6e 100644
--- a/src/include/shm.h
+++ b/src/include/shm.h
@@ -14,4 +14,12 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
 ncclResult_t ncclShmClose(ncclShmHandle_t handle);
 ncclResult_t ncclShmUnlink(ncclShmHandle_t handle);
 
+struct ncclShmemCollBuff {
+  volatile size_t *cnt[2];
+  volatile void *ptr[2];
+  int round;
+};
+
+ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize);
+
 #endif
diff --git a/src/include/transport.h b/src/include/transport.h
index 3884a1152a..d0cd9747e2 100644
--- a/src/include/transport.h
+++ b/src/include/transport.h
@@ -7,7 +7,7 @@
 #ifndef NCCL_TRANSPORT_H_
 #define NCCL_TRANSPORT_H_
 
-#include "devcomm.h"
+#include "device.h"
 #include "graph.h"
 #include "nvmlwrap.h"
 #include "core.h"
@@ -65,6 +65,7 @@ struct ncclNvlsSharedRes {
   CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer
   char* ucBuff; // Unicast NVLS buffer address
   char shareableHandle[NVLS_HANDLE_SIZE];
+  size_t ucGran;
   int nChannels;
 };
 
@@ -102,8 +103,20 @@ struct ncclTransport {
 ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
 ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL);
 
+// Currently we only support POSIX_FILE_DESCRIPTOR handle exchange
+#define USE_POSIX_FD 1
+
+#if USE_POSIX_FD
+#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
+#else
+#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_NONE
+#endif
+
 ncclResult_t ncclNvlsInit(struct ncclComm* comm);
 ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent);
+ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv);
+ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv);
+ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size);
 ncclResult_t ncclNvlsFree(struct ncclComm* comm);
 
 enum { collNetRecv=0, collNetSend=1 };
diff --git a/src/include/tuner.h b/src/include/tuner.h
new file mode 100644
index 0000000000..d8b275017e
--- /dev/null
+++ b/src/include/tuner.h
@@ -0,0 +1,22 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_INT_TUNER_H_
+#define NCCL_INT_TUNER_H_
+
+#include "nccl_tuner.h"
+
+// Tuning plugin to override NCCL's default algorithm/protocol tuning.
+
+// Attempts to load the NCCL tuner from an environment variable.
+// Returns ncclSuccess if the correct tuner symbol has been found and
+// successfully loaded.  Otherwise returns an error and also logs the error.
+ncclResult_t ncclLoadTunerPlugin(ncclTuner_t** tuner);
+
+// Cleans up NCCL tuner plugin.
+ncclResult_t ncclCloseTunerPlugin(ncclTuner_t** tuner);
+#endif
diff --git a/src/include/utils.h b/src/include/utils.h
index 1c300b0cda..60f6efb5f8 100644
--- a/src/include/utils.h
+++ b/src/include/utils.h
@@ -13,6 +13,7 @@
 #include <stdint.h>
 #include <time.h>
 #include <sched.h>
+#include <algorithm>
 #include <new>
 
 int ncclCudaCompCap();
@@ -259,11 +260,6 @@ struct ncclMemoryPool {
   struct Cell {
     Cell *next;
   };
-  template<int Size, int Align>
-  union CellSized {
-    Cell cell;
-    alignas(Align) char space[Size];
-  };
   struct Cell* head;
   struct Cell* tail; // meaningful only when head != nullptr
 };
@@ -275,14 +271,15 @@ inline void ncclMemoryPoolConstruct(struct ncclMemoryPool* me) {
 template<typename T>
 inline T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing) {
   using Cell = ncclMemoryPool::Cell;
-  using CellSized = ncclMemoryPool::CellSized<sizeof(T), alignof(T)>;
   Cell* cell;
   if (__builtin_expect(me->head != nullptr, true)) {
     cell = me->head;
     me->head = cell->next;
   } else {
     // Use the internal allocate() since it doesn't memset to 0 yet.
-    cell = (Cell*)ncclMemoryStack::allocate(backing, sizeof(CellSized), alignof(CellSized));
+    size_t cellSize = std::max(sizeof(Cell), sizeof(T));
+    size_t cellAlign = std::max(alignof(Cell), alignof(T));
+    cell = (Cell*)ncclMemoryStack::allocate(backing, cellSize, cellAlign);
   }
   memset(cell, 0, sizeof(T));
   return reinterpret_cast<T*>(cell);
@@ -349,6 +346,32 @@ inline T* ncclIntruQueueDequeue(ncclIntruQueue<T,next> *me) {
   return ans;
 }
 
+// Removes element x from the intrusive queue, if it is linked in it.
+// Walks the list from head, so the cost is linear in the queue length.
+// Returns true when x was found and unlinked, false when x is not in the queue.
+// The unlinked element's own next pointer is not modified.
+template<typename T, T *T::*next>
+inline bool ncclIntruQueueDelete(ncclIntruQueue<T,next> *me, T *x) {
+  T *pred = nullptr;   // element preceding node; nullptr while node is the head
+  T *node = me->head;
+
+  // Advance until we hit x or fall off the end of the list.
+  while (node != nullptr && node != x) {
+    pred = node;
+    node = node->*next;
+  }
+  if (node == nullptr) return false;
+
+  // Splice node out: either the predecessor skips over it or the head moves forward.
+  if (pred != nullptr)
+    pred->*next = node->*next;
+  else
+    me->head = node->*next;
+  // When the last element is removed, tail falls back to the predecessor.
+  if (node == me->tail) me->tail = pred;
+  return true;
+}
+
 template<typename T, T *T::*next>
 inline T* ncclIntruQueueTryDequeue(ncclIntruQueue<T,next> *me) {
   T *ans = me->head;
diff --git a/src/init.cc b/src/init.cc
index 309ce10bb9..c681f2afa8 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -16,6 +16,7 @@
 #include "enqueue.h"
 #include "graph.h"
 #include "argcheck.h"
+#include "tuner.h"
 #include <fcntl.h>
 #include <string.h>
 #include <errno.h>
@@ -24,6 +25,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <unistd.h>
+#include "param.h"
 
 #define STR2(v) #v
 #define STR(v) STR2(v)
@@ -177,7 +179,13 @@ static ncclResult_t commFree(ncclComm_t comm) {
    * free all intra-process communicators; therefore, we only need to focus on local
    * resource cleanup in commFree(). */
   if (comm->proxyState && comm->proxyRefCountOld == 0 && comm->proxyState->thread) {
-    pthread_join(comm->proxyState->thread, nullptr);
+    if (*comm->abortFlag == 0) {
+      /* regular thread join */
+      pthread_join(comm->proxyState->thread, nullptr);
+    } else {
+      /* try to detach thread due to abort */
+      ncclProxyTryDetach(comm->proxyState);
+    }
   }
 
   delete[] comm->userRedOps;
@@ -211,7 +219,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
       free(comm->sharedRes->tpRankToLocalRank);
       NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->hostStream));
       NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->deviceStream));
-      NCCLCHECK(ncclProxyDestroy(comm));
+      NCCLCHECK(ncclProxyDestroy(comm->sharedRes->proxyState));
       free(comm->sharedRes);
     }
   }
@@ -229,13 +237,25 @@ static ncclResult_t commFree(ncclComm_t comm) {
 
   if (ncclAtomicRefCountDecrement(comm->abortFlagRefCount) == 0) {
     NCCLCHECK(ncclCudaHostFree((void *)comm->abortFlag));
-    free(comm->abortFlagRefCount);
+    free((void*)comm->abortFlagRefCount);
   }
   free((void*)comm->config.netName);
 
   free(comm->topParentRanks);
   free(comm->topParentLocalRanks);
 
+  while (!ncclIntruQueueEmpty(&comm->regRecordQueue)) {
+    struct ncclRegRecord* rec = ncclIntruQueueDequeue(&comm->regRecordQueue);
+    NCCLCHECK(ncclNvlsDeregBuffer(&rec->mcHandle, rec->regAddr, rec->dev, rec->regSize));
+    free(rec->addrs);
+    free(rec);
+  }
+
+  while (!ncclIntruQueueEmpty(&comm->regRequestQueue)) {
+    struct ncclRegRequest* req = ncclIntruQueueDequeue(&comm->regRequestQueue);
+    free(req);
+  }
+
   commPoison(comm); // poison comm before free to avoid comm reuse.
   free(comm);
 
@@ -275,7 +295,7 @@ ncclResult_t ncclCommEnsureReady(ncclComm_t comm) {
   ncclResult_t ret = ncclSuccess;
 
   if (*comm->abortFlag) {
-    ncclGroupJobAbort();
+    ncclGroupJobAbort(comm->groupJob);
   } else {
     NCCLCHECK(ncclCommGetAsyncError(comm, &ret));
     if (ret != ncclSuccess) {
@@ -284,6 +304,11 @@ ncclResult_t ncclCommEnsureReady(ncclComm_t comm) {
       if (ret == ncclInProgress) ret = ncclInvalidArgument;
       goto exit;
     }
+    /* if there is linked group job, we should complete it. */
+    if (comm->groupJob) {
+      NCCLCHECK(ncclGroupJobComplete(comm->groupJob));
+      comm->groupJob = NULL;
+    }
   }
 
 exit:
@@ -338,6 +363,7 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
   ncclMemoryPoolConstruct(&comm->memPool_ncclKernelPlan);
   ncclMemoryPoolConstruct(&comm->memPool_ncclProxyOp);
   ncclMemoryPoolConstruct(&comm->memPool_ncclPointerList);
+  ncclMemoryPoolConstruct(&comm->memPool_ncclNvlsHandleList);
 
   comm->groupNext = reinterpret_cast<struct ncclComm*>(0x1);
   comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
@@ -373,6 +399,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
       comm->topParentRanks[i] = i;
   }
 
+  ncclIntruQueueConstruct(&comm->regRequestQueue);
+  ncclIntruQueueConstruct(&comm->regRecordQueue);
   ncclIntruQueueMpscConstruct(&comm->callbackQueue);
   return ncclSuccess;
 }
@@ -393,6 +421,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
   for (int p=0; p < NCCL_NUM_PROTOCOLS; p++) {
     tmpCommAndChans.comm.buffSizes[p] = comm->buffSizes[p];
   }
+  tmpCommAndChans.comm.p2pChunkSize = comm->p2pChunkSize;
   tmpCommAndChans.comm.channels = &devCommAndChans->channels[0];
 
   comm->workFifoDepth = ncclParamWorkFifoDepth();
@@ -500,7 +529,6 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank,
 #define DEFAULT_LL_BUFFSIZE (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_STEPS*sizeof(union ncclLLFifoLine))
 #define DEFAULT_LL128_BUFFSIZE (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS*NCCL_STEPS*sizeof(uint64_t))
 #define DEFAULT_BUFFSIZE (1 << 22) /* 4MiB */
-#define DEFAULT_BUFFSIZE_ARM (1 << 20) /* 1MiB */
 NCCL_PARAM(BuffSize, "BUFFSIZE", -2);
 NCCL_PARAM(LlBuffSize, "LL_BUFFSIZE", -2);
 NCCL_PARAM(Ll128BuffSize, "LL128_BUFFSIZE", -2);
@@ -516,8 +544,6 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
   int64_t envs[NCCL_NUM_PROTOCOLS] = { ncclParamLlBuffSize(), ncclParamLl128BuffSize(), ncclParamBuffSize() };
   int defaults[NCCL_NUM_PROTOCOLS] = { DEFAULT_LL_BUFFSIZE, DEFAULT_LL128_BUFFSIZE, DEFAULT_BUFFSIZE };
 
-  if (cpuArch == NCCL_TOPO_CPU_ARCH_ARM) defaults[NCCL_PROTO_SIMPLE] = DEFAULT_BUFFSIZE_ARM;
-
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
     comm->buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p];
   }
@@ -525,6 +551,10 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
   if (comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize();
   else if (ncclTopoPathAllNVLink(comm->topo)) comm->p2pChunkSize = ncclParamP2pNvlChunkSize();
   else comm->p2pChunkSize = ncclParamP2pPciChunkSize();
+
+  // Make sure P2P chunksize is not larger than coll chunksize.
+  if (comm->p2pChunkSize * NCCL_STEPS > comm->buffSizes[NCCL_PROTO_SIMPLE]) comm->p2pChunkSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
+
   if (comm->sharedRes->owner != comm) {
     /* make sure split comm p2pChunkSize won't exceed shared p2pChunkSize. */
     comm->p2pChunkSize = std::min(comm->p2pChunkSize, comm->sharedRes->tpP2pChunkSize);
@@ -606,7 +636,7 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n
       if (share) {
         if (myinfo->isMaster) {
           comm->collNetSharedRes = parent->collNetSharedRes;
-          comm->collNetChannels = std::min(std::max(comm->nChannels, comm->nvlsChannels), parent->collNetSharedRes->nChannels);
+          comm->collNetChannels = std::min(comm->nChannels, parent->collNetSharedRes->nChannels);
           for (int c = 0; c < comm->collNetChannels; ++c)
             NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, true), ret, fail);
         }
@@ -625,8 +655,7 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n
   } else {
     /* this allocated buffer will be freed on proxy side */
     NCCLCHECK(ncclCalloc(&comm->collNetSharedRes, 1));
-    /* TODO: min or max? */
-    comm->collNetChannels = comm->collNetSharedRes->nChannels = std::max(comm->nChannels, comm->nvlsChannels);
+    comm->collNetChannels = comm->collNetSharedRes->nChannels = comm->nChannels;
     comm->collNetSharedRes->buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE];
     for (int c = 0; c < comm->collNetChannels; c++) {
       struct ncclChannel* channel = comm->channels + c;
@@ -804,6 +833,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
     int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
     for (int i = 0; i < nranks; i++) comm->minCompCap = std::min(comm->minCompCap, comm->peerInfo[rank].cudaCompCap);
     for (int i = 0; i < nranks; i++) comm->maxCompCap = std::max(comm->maxCompCap, comm->peerInfo[rank].cudaCompCap);
+
+    comm->nvlsRegSupport = 1;
     for (int i = 0; i < nranks; i++) {
       if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash)
           && (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) {
@@ -816,6 +847,16 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
           comm->intraNext = comm->peerInfo[i].comm;
         }
       }
+
+      if (comm->nvlsRegSupport) {
+        for (int j = i + 1; j < nranks; j++) {
+          if (comm->peerInfo[i].hostHash == comm->peerInfo[j].hostHash &&
+            comm->peerInfo[i].pidHash == comm->peerInfo[j].pidHash) {
+            comm->nvlsRegSupport = 0;
+            break;
+          }
+        }
+      }
     }
     TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
         rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0);
@@ -859,7 +900,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
 
   // Determine local CollNet support
   if (collNetSupport(comm)) {
-    char *collNetEnable = getenv("NCCL_COLLNET_ENABLE");
+    const char *collNetEnable = ncclGetEnv("NCCL_COLLNET_ENABLE");
     if (collNetEnable != NULL) {
       INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable);
       if (strcmp(collNetEnable, "1") == 0) {
@@ -872,22 +913,23 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
   NCCLCHECK(ncclNvlsInit(comm));
 
   // Get rings and trees
+  memset(&ringGraph, 0, sizeof(struct ncclTopoGraph));
   ringGraph.id = 0;
   ringGraph.pattern = NCCL_TOPO_PATTERN_RING;
-  ringGraph.collNet = 0;
   ringGraph.minChannels = 1;
   ringGraph.maxChannels = MAXCHANNELS/2;
   NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &ringGraph), ret, fail);
   NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &ringGraph), ret, fail);
 
+  memset(&treeGraph, 0, sizeof(struct ncclTopoGraph));
   treeGraph.id = 1;
   treeGraph.pattern = NCCL_TOPO_PATTERN_BALANCED_TREE;
-  treeGraph.collNet = 0;
   treeGraph.minChannels = ringGraph.nChannels;
   treeGraph.maxChannels = ringGraph.nChannels;
   NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &treeGraph), ret, fail);
   NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &treeGraph), ret, fail);
 
+  memset(&collNetGraph, 0, sizeof(struct ncclTopoGraph));
   collNetGraph.id = 2;
   collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE;
   collNetGraph.collNet = 1;
@@ -895,20 +937,16 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
   if (comm->collNetSupport) {
     NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &collNetGraph), ret, fail);
     NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &collNetGraph), ret, fail);
-  } else {
-    collNetGraph.nChannels = 0;
   }
 
+  memset(&nvlsGraph, 0, sizeof(struct ncclTopoGraph));
   nvlsGraph.id = 3;
   nvlsGraph.pattern = NCCL_TOPO_PATTERN_NVLS;
-  nvlsGraph.collNet = 0;
   nvlsGraph.minChannels = 1;
   nvlsGraph.maxChannels = MAXCHANNELS;
   if (comm->nvlsSupport) {
     NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &nvlsGraph), ret, fail);
     NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &nvlsGraph), ret, fail);
-  } else {
-    nvlsGraph.nChannels = 0;
   }
 
   // Initialize num P2P LL buffers for this communicator
@@ -1136,7 +1174,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
       int sendNode = (node+delta)%nNodes;
       for (int step=0; step < steps; step++) {
         int recvIndex = (localRank-step+steps)%steps;
-	int recvRank = recvIndex < nodeRanks[recvNode].localRanks ? nodeRanks[recvNode].localRankToRank[recvIndex] : -1;
+        int recvRank = recvIndex < nodeRanks[recvNode].localRanks ? nodeRanks[recvNode].localRankToRank[recvIndex] : -1;
         tasks->p2pRecvOrder[i] = recvRank;
         int sendIndex = (localRank+step)%steps;
         int sendRank = sendIndex < nodeRanks[sendNode].localRanks ? nodeRanks[sendNode].localRankToRank[sendIndex] : -1;
@@ -1197,7 +1235,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
   }
 
   if (comm->intraRank == 0) { // Load ncclParamLaunchMode
-    char* str = getenv("NCCL_LAUNCH_MODE");
+    const char* str = ncclGetEnv("NCCL_LAUNCH_MODE");
     enum ncclLaunchMode mode, modeOld;
     if (str && strcasecmp(str, "GROUP") == 0) {
       mode = ncclLaunchModeGroup;
@@ -1357,6 +1395,11 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
 
   NCCLCHECKGOTO(initTransportsRank(comm, job->parent), res, fail);
 
+  NCCLCHECKGOTO(ncclLoadTunerPlugin(&comm->tuner), res, fail);
+  if (comm->tuner) { // only initialize when a tuner was loaded; failures take the fail path like the calls around it
+    NCCLCHECKGOTO(comm->tuner->init(comm->nRanks, comm->nNodes, ncclDebugLog), res, fail);
+  }
+
   // update communicator state
   comm->initState = ncclSuccess;
 
@@ -1425,7 +1468,7 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) {
     comm->config.maxCTAs = maxCTAsEnv;
   }
 
-  envNetName = getenv("NCCL_NET");
+  envNetName = ncclGetEnv("NCCL_NET");
   if (envNetName)
     tmpNetName = envNetName;
   if (tmpNetName != NULL) {
@@ -1560,7 +1603,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni
   ncclResult_t res = ncclSuccess;
   ncclComm_t comm = NULL;
   struct ncclCommInitRankAsyncJob *job = NULL;
-  char* env = getenv("NCCL_COMM_ID");
+  const char* env = ncclGetEnv("NCCL_COMM_ID");
   if (env && myrank == 0) {
     INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
     NCCLCHECKGOTO(bootstrapCreateRoot((struct ncclBootstrapHandle*)&commId, true), res, fail);
@@ -1602,7 +1645,7 @@ exit:
 fail:
   if (comm) {
     if (comm->abortFlag) ncclCudaHostFree((void *)comm->abortFlag);
-    if (comm->abortFlagRefCount) free(comm->abortFlagRefCount);
+    if (comm->abortFlagRefCount) free((void*)comm->abortFlagRefCount);
     free(comm);
   }
   if (newcomm) *newcomm = NULL;
@@ -1777,6 +1820,11 @@ static ncclResult_t commCleanup(ncclComm_t comm) {
     CUDACHECK(cudaSetDevice(commDevice));
   }
 
+  if (comm->tuner != NULL) {
+    NCCLCHECK(comm->tuner->destroy());
+    NCCLCHECK(ncclCloseTunerPlugin(&comm->tuner));
+  }
+
   NCCLCHECK(commFree(comm));
 
   if (savedDevice != commDevice) {
@@ -1991,6 +2039,7 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc
   NCCLCHECK(ncclGroupStartInternal());
   NCCLCHECKGOTO(PtrCheck(comm, "CommSplit", "comm"), res, fail);
   NCCLCHECKGOTO(PtrCheck(newcomm, "CommSplit", "newcomm"), res, fail);
+  NCCLCHECKGOTO(ncclCommEnsureReady(comm), res, fail);
 
   /* *newcomm should be NCCL_COMM_NULL until comm split fully complete. */
   *newcomm = NCCL_COMM_NULL;
@@ -2037,7 +2086,7 @@ fail:
   if (childComm) {
     if (comm && !comm->config.splitShare) {
       if (childComm->abortFlag) ncclCudaHostFree((void*)childComm->abortFlag);
-      if (childComm->abortFlagRefCount) free(childComm->abortFlagRefCount);
+      if (childComm->abortFlagRefCount) free((void*)childComm->abortFlagRefCount);
     }
     free(childComm);
   }
@@ -2074,6 +2123,7 @@ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
   NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError"));
 
   *asyncError = __atomic_load_n(&comm->asyncResult, __ATOMIC_ACQUIRE);
+  if (*asyncError == ncclSuccess && comm->proxyState) *asyncError = __atomic_load_n(&comm->proxyState->asyncResult, __ATOMIC_ACQUIRE);
   return ncclSuccess;
 }
 
@@ -2116,3 +2166,208 @@ ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
   *rank = comm->rank;
   return ncclSuccess;
 }
+
+NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1);
+
+NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle);
+ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) { // queues a registration request for buff; *handle is written only when a request is queued
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  ncclResult_t ret = ncclSuccess;
+
+#if CUDART_VERSION >= 12010
+  size_t granularity;
+  if (ncclParamLocalRegister()) { // LOCAL_REGISTER param (default 1); when 0 the call does nothing and returns ncclSuccess
+    if (comm == NCCL_COMM_NULL || buff == NULL || handle == NULL || size == 0) {
+      WARN("Invalid arguments comm %p, buff %p, size %ld, handle %p", comm, buff, size, handle);
+      ret = ncclInvalidArgument;
+    } else if (comm->nvlsSupport) { // without NVLS support nothing is queued and *handle is left untouched
+      CUmulticastObjectProp prop = comm->nvlsResources->properties;
+
+      prop.size = size;
+      CUCHECK(cuMulticastGetGranularity(&granularity, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
+
+      if ((uintptr_t)buff % comm->nvlsResources->ucGran == 0 && size % granularity == 0) {
+        /* address and size are already aligned, so register exactly what the user provided */
+        struct ncclRegRequest* req;
+        NCCLCHECK(ncclCalloc(&req, 1));
+        req->buff = (uintptr_t)buff;
+        req->size = size;
+        ncclIntruQueueEnqueue(&comm->regRequestQueue, req);
+        *handle = (void*)req;
+      } else {
+        void* base;
+        size_t baseSize;
+        /* ncclMemAlloc does not report the actually allocated buffer size to users,
+         * so query the full range of the allocation with cuMemGetAddressRange and
+         * try to register that range instead. */
+        CUCHECK(cuMemGetAddressRange((CUdeviceptr*)&base, &baseSize, (CUdeviceptr)buff));
+        if ((uintptr_t)base % comm->nvlsResources->ucGran == 0 && baseSize % granularity == 0) {
+          struct ncclRegRequest* req;
+          NCCLCHECK(ncclCalloc(&req, 1));
+          req->buff = (uintptr_t)base;
+          req->size = baseSize;
+          ncclIntruQueueEnqueue(&comm->regRequestQueue, req);
+          *handle = (void*)req;
+        } else {
+          WARN("register fails, buffer %p (aligned %s, granularity %ld) and size %ld (aligned %s, granularity %ld) for registration", buff, (uintptr_t)buff % comm->nvlsResources->ucGran == 0 ? "TRUE" : "FALSE", comm->nvlsResources->ucGran, size, size % granularity == 0 ? "TRUE" : "FALSE", granularity); // NOTE(review): reports buff/size although the check that failed was on base/baseSize
+          ret = ncclInvalidArgument;
+        }
+      }
+    }
+  }
+#endif
+
+  return ret;
+}
+
+NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle);
+/* Undo a registration made by ncclCommRegister(); handle is the value it stored in *handle.
+ * When local registration is enabled, returns ncclInvalidArgument for a NULL comm/handle or a
+ * handle not found in comm->regRequestQueue, else the NVLS deregistration result (if any). */
+ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) {
+  ncclResult_t ret = ncclSuccess;
+
+#if CUDART_VERSION >= 12010
+  struct ncclRegRequest* dreq = (struct ncclRegRequest*)handle;
+  if (ncclParamLocalRegister()) {
+    if (comm == NCCL_COMM_NULL || handle == NULL) {
+      WARN("Invalid arguments comm %p, handle %p", comm, handle);
+      ret = ncclInvalidArgument;
+    } else if (ncclIntruQueueDelete(&comm->regRequestQueue, dreq) == false) {
+      /* unlinking only compares pointers, so an unknown handle is rejected before it is read */
+      WARN("Invalid handle %p", handle);
+      ret = ncclInvalidArgument;
+    } else {
+      /* release the register record matching this request, if there is one */
+      struct ncclRegRecord* rec = ncclIntruQueueHead(&comm->regRecordQueue);
+      while (rec) {
+        if (rec->buff == dreq->buff && rec->size == dreq->size) {
+          ncclIntruQueueDelete(&comm->regRecordQueue, rec);
+          ret = ncclNvlsDeregBuffer(&rec->mcHandle, rec->regAddr, rec->dev, rec->regSize);
+          free(rec->addrs);
+          free(rec);
+          break;
+        }
+        rec = rec->next;
+      }
+      /* the request was allocated by ncclCommRegister and is no longer queued: free it */
+      free(dreq);
+    }
+  }
+#endif
+
+  return ret;
+}
+
+NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size);
+ncclResult_t  ncclMemAlloc(void **ptr, size_t size) { // allocates size bytes on the current CUDA device: cuMem path when multicast is supported, cudaMalloc otherwise
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  ncclResult_t ret = ncclSuccess;
+
+#if CUDART_VERSION >= 12010
+  size_t memGran = 0;
+  size_t mcGran = 0;
+  CUdevice currentDev;
+  CUmemAllocationProp memprop = {};
+  CUmulticastObjectProp mcprop = {};
+  CUmemAccessDesc accessDesc = {};
+  CUmemGenericAllocationHandle handle;
+  int cudaDev;
+  int flag = 0;
+  int dcnt;
+  int mcSupport = 0;
+
+  if (ptr == NULL || size == 0) goto fallback; // degenerate arguments are passed straight to cudaMalloc
+
+  if (ncclCudaLibraryInit() != ncclSuccess) goto fallback; // CUDA library init failed: use the runtime allocator
+
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  CUCHECK(cuDeviceGet(&currentDev, cudaDev));
+  if (CUPFN(cuMulticastCreate) != NULL) // only query the attribute when the multicast entry point is available
+    CUCHECK(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, currentDev));
+
+  if (mcSupport) {
+    memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+    memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    memprop.requestedHandleTypes = NVLS_CU_MEM_HANDLE_TYPE;
+    memprop.location.id = currentDev;
+    // Query device to see if RDMA support is available
+    CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev));
+    if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1;
+    CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
+
+    /* multicast properties, used only to obtain the multicast granularity */
+    CUDACHECK(cudaGetDeviceCount(&dcnt));
+    mcprop.size = size;
+    /* device cnt is a dummy value right now, it might affect mc granularity in the future. */
+    mcprop.numDevices = dcnt;
+    mcprop.handleTypes = NVLS_CU_MEM_HANDLE_TYPE;
+    mcprop.flags = 0;
+    CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
+
+    /* only size needs to be aligned to mcGran */
+    ALIGN_SIZE(size, mcGran);
+    /* Allocate the physical memory on the device */
+    CUCHECK(cuMemCreate(&handle, size, &memprop, 0)); // NOTE(review): the CUCHECKs below return on failure without releasing handle — confirm this is acceptable
+    /* Reserve a virtual address range */
+    CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, size, memGran, 0, 0));
+    /* Map the virtual address range to the physical allocation */
+    CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
+    /* Now allow RW access to the newly mapped memory */
+    for (int i = 0; i < dcnt; ++i) {
+      int p2p = 0;
+      if (i == cudaDev || ((cudaDeviceCanAccessPeer(&p2p, cudaDev, i) == cudaSuccess) && p2p)) { // the owning device plus every device it can access as a peer
+        accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+        accessDesc.location.id = i;
+        accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+        CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
+      }
+    }
+    goto exit; // cuMem path succeeded; skip the cudaMalloc fallback
+  }
+
+fallback:
+#endif
+  CUDACHECKGOTO(cudaMalloc(ptr, size), ret, fail);
+
+exit:
+  return ret;
+fail:
+  goto exit;
+}
+
+NCCL_API(ncclResult_t, ncclMemFree, void *ptr);
+ncclResult_t  ncclMemFree(void *ptr) { // frees ptr via ncclCuMemFree or cudaFree and restores the caller's current device before returning
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  ncclResult_t ret = ncclSuccess;
+  int saveDevice;
+
+  CUDACHECK(cudaGetDevice(&saveDevice)); // remembered so that exit: can restore it
+#if CUDART_VERSION >= 12010
+  CUdevice ptrDev = 0;
+  int mcSupport = 0;
+
+  if (ptr == NULL) goto fallback; // nothing to query; NULL is handed to cudaFree
+
+  if (ncclCudaLibraryInit() != ncclSuccess) goto fallback;
+
+  CUCHECKGOTO(cuPointerGetAttribute((void*)&ptrDev, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)ptr), ret, fail); // device ordinal that owns ptr
+  if (CUPFN(cuMulticastCreate) != NULL)
+    CUCHECKGOTO(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, ptrDev), ret, fail);
+
+  CUDACHECKGOTO(cudaSetDevice((int)ptrDev), ret, fail); // the free below runs with the owning device current
+  if (mcSupport) { // same condition ncclMemAlloc uses to pick the cuMem allocation path
+    NCCLCHECKGOTO(ncclCuMemFree(ptr), ret, fail);
+    goto exit;
+  }
+
+fallback:
+#endif
+  CUDACHECKGOTO(cudaFree(ptr), ret, fail);
+
+exit:
+  cudaSetDevice(saveDevice); // return value is not checked
+  return ret;
+fail:
+  goto exit;
+}
diff --git a/src/misc/cudawrap.cc b/src/misc/cudawrap.cc
index 334ee10f69..f2260a1c0e 100644
--- a/src/misc/cudawrap.cc
+++ b/src/misc/cudawrap.cc
@@ -12,7 +12,7 @@
 #include <dlfcn.h>
 
 // This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage
-NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", 0);
+NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", -2);
 
 static int ncclCuMemSupported = 0;
 
@@ -43,7 +43,9 @@ error:
 }
 
 int ncclCuMemEnable() {
-  return ((ncclParamCuMemEnable() == -2 && ncclCuMemSupported) || ncclParamCuMemEnable());
+  // NCCL_CUMEM_ENABLE >= 0 is returned as-is; -2 (the default) means auto-detect CUMEM support
+  int param = ncclParamCuMemEnable();
+  return  param >= 0 ? param : (param == -2 && ncclCuMemSupported); // any other negative value yields 0
 }
 
 #define DECLARE_CUDA_PFN(symbol,version) PFN_##symbol##_v##version pfn_##symbol = nullptr
@@ -74,6 +76,8 @@ DECLARE_CUDA_PFN(cuMemRelease, 10020);
 DECLARE_CUDA_PFN(cuMemRetainAllocationHandle, 11000);
 DECLARE_CUDA_PFN(cuMemSetAccess, 10020);
 DECLARE_CUDA_PFN(cuMemUnmap, 10020);
+/* ncclMemAlloc/Free */
+DECLARE_CUDA_PFN(cuPointerGetAttribute, 4000);
 #if CUDA_VERSION >= 11070
 /* transport/collNet.cc/net.cc*/
 DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support
@@ -137,6 +141,8 @@ static ncclResult_t cudaPfnFuncLoader(void) {
   LOAD_SYM(cuMemRetainAllocationHandle, 11000, 1);
   LOAD_SYM(cuMemSetAccess, 10020, 1);
   LOAD_SYM(cuMemUnmap, 10020, 1);
+/* ncclMemAlloc/Free */
+  LOAD_SYM(cuPointerGetAttribute, 4000, 1);
 #if CUDA_VERSION >= 11070
   LOAD_SYM(cuMemGetHandleForAddressRange, 11070, 1); // DMA-BUF support
 #endif
@@ -158,7 +164,7 @@ static ncclResult_t initResult;
 
 static void initOnceFunc() {
   do {
-    char* val = getenv("CUDA_LAUNCH_BLOCKING");
+    const char* val = ncclGetEnv("CUDA_LAUNCH_BLOCKING");
     ncclCudaLaunchBlocking = val!=nullptr && val[0]!=0 && !(val[0]=='0' && val[1]==0);
   } while (0);
 
@@ -167,7 +173,7 @@ static void initOnceFunc() {
    * Load CUDA driver library
    */
   char path[1024];
-  char *ncclCudaPath = getenv("NCCL_CUDA_PATH");
+  const char *ncclCudaPath = ncclGetEnv("NCCL_CUDA_PATH");
   if (ncclCudaPath == NULL)
     snprintf(path, 1024, "%s", "libcuda.so");
   else
diff --git a/src/misc/ibvsymbols.cc b/src/misc/ibvsymbols.cc
index c41a457c8f..bd5f33390f 100644
--- a/src/misc/ibvsymbols.cc
+++ b/src/misc/ibvsymbols.cc
@@ -50,6 +50,9 @@ ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols) {
   ASSIGN_SYM(ibvSymbols, ibv_destroy_qp, ibv_internal_destroy_qp);
   ASSIGN_SYM(ibvSymbols, ibv_fork_init, ibv_internal_fork_init);
   ASSIGN_SYM(ibvSymbols, ibv_event_type_str, ibv_internal_event_type_str);
+  
+  ASSIGN_SYM(ibvSymbols, ibv_query_ece, ibv_internal_query_ece);
+  ASSIGN_SYM(ibvSymbols, ibv_set_ece, ibv_internal_set_ece);
 
   ibvSymbols->ibv_internal_reg_mr = &ibv_internal_reg_mr;
   ibvSymbols->ibv_internal_query_port = &ibv_internal_query_port;
@@ -123,6 +126,9 @@ ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols) {
   LOAD_SYM(ibvhandle, "ibv_fork_init", ibvSymbols->ibv_internal_fork_init);
   LOAD_SYM(ibvhandle, "ibv_event_type_str", ibvSymbols->ibv_internal_event_type_str);
 
+  LOAD_SYM_VERSION(ibvhandle, "ibv_query_ece", ibvSymbols->ibv_internal_query_ece, "IBVERBS_1.10");
+  LOAD_SYM_VERSION(ibvhandle, "ibv_set_ece",   ibvSymbols->ibv_internal_set_ece, "IBVERBS_1.10");
+
   return ncclSuccess;
 
 teardown:
@@ -150,6 +156,8 @@ teardown:
   ibvSymbols->ibv_internal_destroy_qp = NULL;
   ibvSymbols->ibv_internal_fork_init = NULL;
   ibvSymbols->ibv_internal_event_type_str = NULL;
+  ibvSymbols->ibv_internal_query_ece = NULL;
+  ibvSymbols->ibv_internal_set_ece = NULL;
 
   if (ibvhandle != NULL) dlclose(ibvhandle);
   return ncclSystemError;
diff --git a/src/misc/ibvwrap.cc b/src/misc/ibvwrap.cc
index bc896e10eb..eb4e52b606 100644
--- a/src/misc/ibvwrap.cc
+++ b/src/misc/ibvwrap.cc
@@ -45,11 +45,30 @@ ncclResult_t wrap_ibv_symbols(void) {
   } \
   return ncclSuccess;
 
+#define IBV_INT_CHECK_RET_ERRNO_OPTIONAL(container, internal_name, call, success_retval, name, supported) \
+  if (container.internal_name == NULL) { /* symbol pointer is NULL: report "unsupported", not an error */ \
+    INFO(NCCL_NET, "Call to " name " skipped, " #internal_name " doesn't exist"); \
+    *supported = 0; \
+    return ncclSuccess; \
+  } \
+  int ret = container.call; \
+  if (ret == ENOTSUP || ret == EOPNOTSUPP) { /* the call exists but reports the operation as unsupported */ \
+    INFO(NCCL_NET, "Call to " name " failed with error %s errno %d", strerror(ret), ret); \
+    *supported = 0; \
+    return ncclSuccess; \
+  } else if (ret != success_retval) { /* any other non-success value is a real failure */ \
+    WARN("Call to " name " failed with error %s errno %d", strerror(ret), ret); \
+    *supported = 1; \
+    return ncclSystemError; \
+  } \
+  *supported = 1; \
+  return ncclSuccess;
+
 #define IBV_INT_CHECK_RET_ERRNO(container, internal_name, call, success_retval, name) \
   CHECK_NOT_NULL(container, internal_name); \
   int ret = container.call; \
   if (ret != success_retval) { \
-    WARN("Call to " name " failed with error %s", strerror(ret)); \
+    WARN("Call to " name " failed with error %s errno %d", strerror(ret), ret); \
     return ncclSystemError; \
   } \
   return ncclSuccess;
@@ -187,6 +206,14 @@ ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int
   IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_modify_qp, ibv_internal_modify_qp(qp, attr, attr_mask), 0, "ibv_modify_qp");
 }
 
+ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) { /* ibv_query_ece returns 0 on success, or the value of errno on failure (which indicates the failure reason); *supported is set to 0 when the symbol is missing or the call reports ENOTSUP/EOPNOTSUPP, and to 1 otherwise */
+  IBV_INT_CHECK_RET_ERRNO_OPTIONAL(ibvSymbols, ibv_internal_query_ece, ibv_internal_query_ece(qp, ece), 0, "ibv_query_ece", supported);
+}
+
+ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) { /* ibv_set_ece returns 0 on success, or the value of errno on failure (which indicates the failure reason); *supported is set to 0 when the symbol is missing or the call reports ENOTSUP/EOPNOTSUPP, and to 1 otherwise */
+  IBV_INT_CHECK_RET_ERRNO_OPTIONAL(ibvSymbols, ibv_internal_set_ece, ibv_internal_set_ece(qp, ece), 0, "ibv_set_ece", supported);
+}
+
 ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event) {
   *ret = (char *) ibvSymbols.ibv_internal_event_type_str(event);
   return ncclSuccess;
diff --git a/src/misc/ipcsocket.cc b/src/misc/ipcsocket.cc
index b2dee4852d..9d66ac7197 100644
--- a/src/misc/ipcsocket.cc
+++ b/src/misc/ipcsocket.cc
@@ -30,7 +30,7 @@ ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, v
   handle->fd = -1;
   handle->socketName[0] = '\0';
   if ((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0) {
-    WARN("UDS: Socket creation error : %d", errno);
+    WARN("UDS: Socket creation error : %s (%d)", strerror(errno), errno);
     return ncclSystemError;
   }
 
@@ -54,7 +54,7 @@ ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, v
   cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
 #endif
   if (bind(fd, (struct sockaddr *)&cliaddr, sizeof(cliaddr)) < 0) {
-    WARN("UDS: Binding to socket %s failed : %d", temp, errno);
+    WARN("UDS: Binding to socket %s failed : %s (%d)", temp, strerror(errno), errno);
     close(fd);
     return ncclSystemError;
   }
@@ -73,6 +73,15 @@ ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, v
   return ncclSuccess;
 }
 
+ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd) { // copies handle->fd into *fd; fails only on a NULL handle
+  if (handle == NULL) {
+    WARN("ncclIpcSocketGetFd: pass NULL socket");
+    return ncclInvalidArgument;
+  }
+  if (fd) *fd = handle->fd; // a NULL fd is tolerated: nothing is written and success is still returned
+  return ncclSuccess;
+}
+
 ncclResult_t ncclIpcSocketClose(ncclIpcSocket *handle) {
   if (handle == NULL) {
     return ncclInternalError;
@@ -90,7 +99,7 @@ ncclResult_t ncclIpcSocketClose(ncclIpcSocket *handle) {
   return ncclSuccess;
 }
 
-ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) {
+ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, int *recvFd) {
   struct msghdr msg = {0, 0, 0, 0, 0, 0, 0};
   struct iovec iov[1];
 
@@ -107,8 +116,13 @@ ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) {
   msg.msg_control = control_un.control;
   msg.msg_controllen = sizeof(control_un.control);
 
-  iov[0].iov_base = (void *)dummy_buffer;
-  iov[0].iov_len = sizeof(dummy_buffer);
+  if (hdr == NULL) {
+    iov[0].iov_base = (void *)dummy_buffer;
+    iov[0].iov_len = sizeof(dummy_buffer);
+  } else {
+    iov[0].iov_base = hdr;
+    iov[0].iov_len = hdrLen;
+  }
 
   msg.msg_iov = iov;
   msg.msg_iovlen = 1;
@@ -121,25 +135,30 @@ ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) {
     if (handle->abortFlag && *handle->abortFlag) return ncclInternalError;
   }
 
-  if (((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) && (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) {
-    if ((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) {
-      WARN("UDS: Receiving data over socket failed");
+  if (recvFd != NULL) {
+    if (((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) && (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) {
+      if ((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) {
+        WARN("UDS: Receiving data over socket failed");
+      return ncclSystemError;
+      }
+
+      memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd));
+    } else {
+      WARN("UDS: Receiving data over socket %s failed", handle->socketName);
       return ncclSystemError;
     }
-
-    memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd));
-  } else {
-    WARN("UDS: Receiving data over socket %s failed", handle->socketName);
-    return ncclSystemError;
+    TRACE(NCCL_INIT|NCCL_P2P, "UDS: Got recvFd %d from socket %s", *recvFd, handle->socketName);
   }
 
-  TRACE(NCCL_INIT|NCCL_P2P, "UDS: Got recvFd %d from socket %s", *recvFd, handle->socketName);
-
   return ncclSuccess;
 }
 
-ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int rank, uint64_t hash) {
-  struct msghdr msg;
+ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) {
+  return ncclIpcSocketRecvMsg(handle, NULL, 0, recvFd); // fd-only receive: with hdr == NULL, RecvMsg reads the payload into its own dummy buffer
+}
+
+ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, const int sendFd, int rank, uint64_t hash) {
+  struct msghdr msg = {0, 0, 0, 0, 0, 0, 0};
   struct iovec iov[1];
   char temp[NCCL_IPC_SOCKNAME_LEN];
 
@@ -149,6 +168,7 @@ ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int ra
   } control_un;
 
   struct cmsghdr *cmptr;
+  char dummy_buffer[1];
   struct sockaddr_un cliaddr;
 
   // Construct client address to send this shareable handle to
@@ -162,35 +182,43 @@ ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int ra
   }
   (void) strncpy(cliaddr.sun_path, temp, len);
 
-  TRACE(NCCL_INIT, "UDS: Sending fd %d to UDS socket %s", sendFd, temp);
-
 #ifdef USE_ABSTRACT_SOCKET
   cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
 #endif
 
-  msg.msg_control = control_un.control;
-  msg.msg_controllen = sizeof(control_un.control);
+  TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d to UDS socket %s", hdr, hdrLen, temp);
 
-  cmptr = CMSG_FIRSTHDR(&msg);
-  cmptr->cmsg_len = CMSG_LEN(sizeof(int));
-  cmptr->cmsg_level = SOL_SOCKET;
-  cmptr->cmsg_type = SCM_RIGHTS;
+  if (sendFd != -1) {
+    TRACE(NCCL_INIT, "UDS: Sending fd %d to UDS socket %s", sendFd, temp);
 
-  memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
+    msg.msg_control = control_un.control;
+    msg.msg_controllen = sizeof(control_un.control);
+
+    cmptr = CMSG_FIRSTHDR(&msg);
+    cmptr->cmsg_len = CMSG_LEN(sizeof(int));
+    cmptr->cmsg_level = SOL_SOCKET;
+    cmptr->cmsg_type = SCM_RIGHTS;
+    memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
+  }
 
   msg.msg_name = (void *)&cliaddr;
   msg.msg_namelen = sizeof(struct sockaddr_un);
 
-  iov[0].iov_base = (void *)"";
-  iov[0].iov_len = 1;
+  if (hdr == NULL) {
+    iov[0].iov_base = (void *)dummy_buffer;
+    iov[0].iov_len = sizeof(dummy_buffer);
+  } else {
+    iov[0].iov_base = hdr;
+    iov[0].iov_len = hdrLen;
+  }
   msg.msg_iov = iov;
   msg.msg_iovlen = 1;
   msg.msg_flags = 0;
 
   ssize_t sendResult;
-  while ((sendResult = sendmsg(handle->fd, &msg, 0)) <= 0) {
+  while ((sendResult = sendmsg(handle->fd, &msg, 0)) < 0) {
     if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
-      WARN("UDS: Sending data over socket %s failed : %d", temp, errno);
+      WARN("UDS: Sending data over socket %s failed : %s (%d)", temp, strerror(errno), errno);
       return ncclSystemError;
     }
     if (handle->abortFlag && *handle->abortFlag) return ncclInternalError;
@@ -198,3 +226,7 @@ ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int ra
 
   return ncclSuccess;
 }
+
+ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int rank, uint64_t hash) {
+  return ncclIpcSocketSendMsg(handle, NULL, 0, sendFd, rank, hash); // fd-only send: with hdr == NULL, SendMsg uses its own dummy buffer as the payload
+}
diff --git a/src/misc/param.cc b/src/misc/param.cc
index bf7aa00871..e0b6ab821b 100644
--- a/src/misc/param.cc
+++ b/src/misc/param.cc
@@ -63,7 +63,7 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6
   static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
   pthread_mutex_lock(&mutex);
   if (__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) {
-    char* str = getenv(env);
+    const char* str = ncclGetEnv(env);
     int64_t value = deftVal;
     if (str && strlen(str) > 0) {
       errno = 0;
@@ -79,3 +79,9 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6
   }
   pthread_mutex_unlock(&mutex);
 }
+
+const char *ncclGetEnv(const char *name) { // getenv() wrapper that first makes sure initEnv has run
+  static pthread_once_t once = PTHREAD_ONCE_INIT; // ensures initEnv is invoked a single time across all callers
+  pthread_once(&once, initEnv); // initEnv is defined outside this hunk; presumably it prepares the process environment — confirm
+  return getenv(name); // result is whatever getenv returns, including NULL for an unset variable
+}
\ No newline at end of file
diff --git a/src/misc/profiler.cc b/src/misc/profiler.cc
index 145b18fe8c..785d616b8b 100644
--- a/src/misc/profiler.cc
+++ b/src/misc/profiler.cc
@@ -61,7 +61,7 @@ void ncclProfilingDump() {
   static int dumpDone = 0;
   if (dumpDone) return;
   dumpDone = 1;
-  const char* str = getenv("NCCL_PROXY_PROFILE");
+  const char* str = ncclGetEnv("NCCL_PROXY_PROFILE");
   if (!str) { free(profilingEvents); return; }
   FILE* f = fopen(str, "w");
   fprintf(f, "[\n");
diff --git a/src/misc/shmutils.cc b/src/misc/shmutils.cc
index ce05c3ef3e..80ece40c1c 100644
--- a/src/misc/shmutils.cc
+++ b/src/misc/shmutils.cc
@@ -5,6 +5,7 @@
  ************************************************************************/
 
 #include "shm.h"
+#include "comm.h"
 #include "checks.h"
 #include <sys/types.h>
 #include <sys/mman.h>
@@ -67,7 +68,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
       SYSCHECKGOTO(fd = open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), ret, fail);
     }
 
-    if (ftruncate(fd, realShmSize) != 0) {
+    if (fallocate(fd, 0, 0, realShmSize) != 0) {
       WARN("Error: failed to extend %s to %ld bytes", shmPath, realShmSize);
       ret = ncclSystemError;
       goto fail;
@@ -162,3 +163,37 @@ ncclResult_t ncclShmUnlink(ncclShmHandle_t handle) {
   }
   return ret;
 }
+
+ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize) {
+  ncclResult_t ret = ncclSuccess;
+  int curRound = 0; /* loaded from shmem only after the NULL checks below */
+  size_t mycnt;
+  /* Each local rank publishes typeSize bytes, waits on the round counter, then copies out all localRanks slots. */
+  if (comm == NULL || shmem == NULL || sendbuff == NULL || recvbuff == NULL) {
+    ret = ncclInvalidArgument;
+    goto exit;
+  }
+  curRound = shmem->round;
+  memcpy((char*)shmem->ptr[curRound] + comm->localRank * typeSize, sendbuff, typeSize);
+  /* sync among local ranks */
+  mycnt = __atomic_add_fetch(shmem->cnt[curRound], 1, __ATOMIC_ACQ_REL);
+  if (mycnt == comm->localRanks) { /* this rank was the last one to arrive */
+    *shmem->cnt[curRound ^ 1] = 0; /* prepare next round */
+    __atomic_store_n(shmem->cnt[curRound], comm->localRanks + 1, __ATOMIC_RELEASE); /* release everyone */
+  } else {
+    uint64_t t0 = clockNano();
+    while(__atomic_load_n(shmem->cnt[curRound], __ATOMIC_ACQUIRE) != comm->localRanks + 1) {
+      if (clockNano() - t0 >= 5 * 1000) sched_yield(); /* busy-wait briefly, then yield on every further iteration */
+      if (*comm->abortFlag == 1) { /* give up waiting when the communicator is being aborted */
+        ret = ncclInternalError;
+        goto exit;
+      }
+    }
+  }
+
+  memcpy(recvbuff, (const void*)shmem->ptr[curRound], comm->localRanks * typeSize);
+  shmem->round ^= 1; /* the next call uses the other buffer/counter pair */
+
+exit:
+  return ret;
+}
diff --git a/src/misc/socket.cc b/src/misc/socket.cc
index 5700d83cdc..149bd73aa1 100644
--- a/src/misc/socket.cc
+++ b/src/misc/socket.cc
@@ -11,6 +11,7 @@
 #include <unistd.h>
 #include <ifaddrs.h>
 #include <net/if.h>
+#include "param.h"
 
 static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) {
   int bytes = 0;
@@ -84,7 +85,7 @@ static uint16_t socketToPort(union ncclSocketAddress *addr) {
 /* Allow the user to force the IPv4/IPv6 interface selection */
 static int envSocketFamily(void) {
   int family = -1; // Family selection is not forced, will use first one found
-  char* env = getenv("NCCL_SOCKET_FAMILY");
+  const char* env = ncclGetEnv("NCCL_SOCKET_FAMILY");
   if (env == NULL)
     return family;
 
@@ -325,7 +326,7 @@ int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNa
   // Allow user to force the INET socket family selection
   int sock_family = envSocketFamily();
   // User specified interface
-  char* env = getenv("NCCL_SOCKET_IFNAME");
+  const char* env = ncclGetEnv("NCCL_SOCKET_IFNAME");
   if (env && strlen(env) > 1) {
     INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME set by environment to %s", env);
     // Specified by user : find or fail
@@ -337,10 +338,10 @@ int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNa
     nIfs = findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
     // else see if we can get some hint from COMM ID
     if (nIfs == 0) {
-      char* commId = getenv("NCCL_COMM_ID");
+      const char* commId = ncclGetEnv("NCCL_COMM_ID");
       if (commId && strlen(commId) > 1) {
-	INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId);
-	// Try to find interface that is in the same subnet as the IP in comm id
+        INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId);
+        // Try to find interface that is in the same subnet as the IP in comm id
         union ncclSocketAddress idAddr;
         ncclSocketGetAddrFromString(&idAddr, commId);
         nIfs = ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs);
diff --git a/src/misc/tuner.cc b/src/misc/tuner.cc
new file mode 100644
index 0000000000..bfe61e8c1d
--- /dev/null
+++ b/src/misc/tuner.cc
@@ -0,0 +1,82 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <dlfcn.h>
+#include <errno.h>
+#include <stdlib.h>
+
+#include "debug.h"
+#include "nccl_tuner.h"
+
+pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER;
+static int tunerPluginRefCount = -1;
+static void* tunerPluginLib = nullptr;
+ncclTuner_t* tunerSymbol = nullptr;
+
+ncclResult_t ncclLoadTunerPlugin(ncclTuner_t** tuner) {
+  // Hands out the refcounted plugin named by NCCL_TUNER_PLUGIN; *tuner stays nullptr if no plugin can be loaded.
+  *tuner = nullptr;
+  if (tunerPluginRefCount == -2) return ncclSuccess; // -2: an earlier load attempt found no plugin; checked before taking the lock
+
+  pthread_mutex_lock(&tunerPluginLock);
+  if (tunerPluginRefCount == -1) { // -1: no load attempted yet (or the plugin was closed), so try now
+    tunerPluginRefCount = -2; // Default: no plugin, don't try again later
+    const char* name = getenv("NCCL_TUNER_PLUGIN");
+    if (name) {
+      INFO(NCCL_TUNING, "NCCL_TUNER_PLUGIN set to %s", name);
+      tunerPluginLib = dlopen(name, RTLD_LAZY | RTLD_LOCAL);
+    }
+    if (name == nullptr) { // nothing requested: never pass a NULL name (or a NULL dlerror()) to the "%s" formats below
+      INFO(NCCL_TUNING, "Tuner: NCCL_TUNER_PLUGIN not set, using default tuner instead.");
+    } else if (tunerPluginLib == nullptr) {
+      // dlopen does not guarantee to set errno, but dlerror only gives us a
+      // string, so checking errno is a best-effort way to pick a clearer message
+      if (errno == ENOENT) {
+        INFO(NCCL_TUNING, "Tuner: no plugin found '%s', using default tuner instead.", name);
+      } else {
+        INFO(NCCL_TUNING, "Tuner: plugin load '%s' returned error (%d : %s), using default tuner instead.", name, errno, dlerror());
+      }
+    } else {
+      tunerSymbol = (ncclTuner_t*)dlsym(tunerPluginLib, NCCL_TUNER_PLUGIN_SYMBOL);
+      if (tunerSymbol == nullptr) {
+        INFO(NCCL_TUNING, "Tuner: failed to find " NCCL_TUNER_PLUGIN_SYMBOL " in plugin (%s), using default tuner instead.", name);
+        dlclose(tunerPluginLib);
+        tunerPluginLib = nullptr;
+      } else {
+        INFO(NCCL_TUNING, "Opened tuner: '%s'", tunerSymbol->name);
+        tunerPluginRefCount = 0; // loaded successfully; the reference for this caller is added just below
+      }
+    }
+  }
+
+  if (tunerPluginRefCount >= 0) {
+    *tuner = tunerSymbol;
+    INFO(NCCL_INIT, "Using tuner plugin: '%s'", tunerSymbol->name);
+    tunerPluginRefCount++;
+  }
+  pthread_mutex_unlock(&tunerPluginLock);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclCloseTunerPlugin(ncclTuner_t** tuner) { // drops one reference; unloads the library when the count reaches 0
+  if (*tuner == nullptr) return ncclSuccess; // caller holds no plugin: nothing to release
+  pthread_mutex_lock(&tunerPluginLock);
+  if (--tunerPluginRefCount == 0) {
+    if (tunerPluginLib == nullptr) {
+      WARN("Tuner plugin refcount is 0, yet tunerPluginLib ptr is NULL\n");
+    } else {
+      INFO(NCCL_TUNING, "Closing tuner: '%s'", tunerSymbol->name);
+      dlclose(tunerPluginLib);
+    }
+    tunerPluginLib = nullptr;
+    tunerSymbol = nullptr;
+    *tuner = nullptr; // only cleared on the final release; earlier releases leave *tuner untouched
+    tunerPluginRefCount = -1; // back to "not loaded", so a later ncclLoadTunerPlugin attempts the load again
+  }
+  pthread_mutex_unlock(&tunerPluginLock);
+  return ncclSuccess;
+}
diff --git a/src/misc/utils.cc b/src/misc/utils.cc
index 20e8e41a60..b775666799 100644
--- a/src/misc/utils.cc
+++ b/src/misc/utils.cc
@@ -85,13 +85,13 @@ uint64_t getHash(const char* string, int n) {
 #define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
 uint64_t getHostHash(void) {
   char hostHash[1024];
-  char *hostId;
+  const char *hostId;
 
   // Fall back is the full hostname if something fails
   (void) getHostName(hostHash, sizeof(hostHash), '\0');
   int offset = strlen(hostHash);
 
-  if ((hostId = getenv("NCCL_HOSTID")) != NULL) {
+  if ((hostId = ncclGetEnv("NCCL_HOSTID")) != NULL) {
     INFO(NCCL_ENV, "NCCL_HOSTID set by environment to %s", hostId);
     strncpy(hostHash, hostId, sizeof(hostHash));
   } else {
diff --git a/src/nccl.h.in b/src/nccl.h.in
index 0b613eef93..1585d58acb 100644
--- a/src/nccl.h.in
+++ b/src/nccl.h.in
@@ -78,6 +78,15 @@ typedef struct ncclConfig_v21700 {
   NCCL_CONFIG_UNDEF_INT                     /* splitShare */            \
 }
 
+/* NCCL malloc and free function for all types of NCCL optimizations
+ * (e.g. user buffer registration). The actual allocated size might
+ * be larger than requested due to granularity requirement. */
+ncclResult_t  ncclMemAlloc(void** ptr, size_t size);
+ncclResult_t pncclMemAlloc(void** ptr, size_t size);
+
+ncclResult_t  ncclMemFree(void *ptr);
+ncclResult_t pncclMemFree(void *ptr);
+
 /* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
  * This integer is coded with the MAJOR, MINOR and PATCH level of the
  * NCCL library
@@ -417,6 +426,14 @@ ncclResult_t pncclGroupStart();
 ncclResult_t  ncclGroupEnd();
 ncclResult_t pncclGroupEnd();
 
+/* Register CUDA buffer for zero-copy operation */
+ncclResult_t  ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
+ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
+
+/* Deregister CUDA buffer */
+ncclResult_t  ncclCommDeregister(const ncclComm_t comm, void* handle);
+ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
+
 #ifdef __cplusplus
 } // end extern "C"
 #endif
diff --git a/src/net.cc b/src/net.cc
index 3077f8806f..2bfc9a9277 100644
--- a/src/net.cc
+++ b/src/net.cc
@@ -1,3 +1,9 @@
+/*************************************************************************
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
 #include "net.h"
 #include "bootstrap.h"
 #include "checks.h"
@@ -9,148 +15,190 @@
 //#include <sys/stat.h>
 //#include <unistd.h>
 
-static ncclNet_v6_t ncclNet_v4_as_v6;
-static ncclNet_v6_t ncclNet_v5_as_v6;
-static ncclNet_v4_t *ncclNet_v4;
+static ncclNet_v7_t ncclNet_v5_as_v7;
+static ncclNet_v7_t ncclNet_v6_as_v7;
 static ncclNet_v5_t *ncclNet_v5;
-static ncclCollNet_v6_t ncclCollNet_v4_as_v6;
-static ncclCollNet_v6_t ncclCollNet_v5_as_v6;
-static ncclCollNet_v4_t *ncclCollNet_v4;
+static ncclNet_v6_t *ncclNet_v6;
+static ncclCollNet_v7_t ncclCollNet_v5_as_v7;
+static ncclCollNet_v7_t ncclCollNet_v6_as_v7;
 static ncclCollNet_v5_t *ncclCollNet_v5;
+static ncclCollNet_v6_t *ncclCollNet_v6;
 
-static ncclResult_t ncclNet_v4_as_v6_getProperties(int dev, ncclNetProperties_v6_t* props) {
-  ncclNetProperties_v4_t p4;
-  ncclResult_t ans = ncclNet_v4->getProperties(dev, &p4);
+static ncclResult_t ncclNet_v6_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) {
+  ncclNetProperties_v6_t p6;
+  ncclResult_t ans = ncclNet_v6->getProperties(dev, &p6);
   if (ans != ncclSuccess) return ans;
-  props->name = p4.name;
-  props->pciPath = p4.pciPath;
-  props->guid = p4.guid;
-  props->ptrSupport = p4.ptrSupport;
-  props->speed = p4.speed;
-  props->port = p4.port;
-  props->maxComms = p4.maxComms;
-  props->maxRecvs = 1;
-  props->latency = 0;
+  props->name = p6.name;
+  props->pciPath = p6.pciPath;
+  props->guid = p6.guid;
+  props->ptrSupport = p6.ptrSupport;
+  props->speed = p6.speed;
+  props->port = p6.port;
+  props->maxComms = p6.maxComms;
+  props->maxRecvs = p6.maxRecvs;
+  props->latency = p6.latency;
+  props->netDeviceType = NCCL_NET_DEVICE_HOST;
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
   return ncclSuccess;
 }
 
-static ncclResult_t ncclNet_v4_as_v6_isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
-  return ncclNet_v4->isend(sendComm, data, size, mhandle, request);
+static ncclResult_t ncclNet_v6_as_v7_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
+  return ncclNet_v6->connect(dev, handle, sendComm);
 }
 
-static ncclResult_t ncclNet_v4_as_v6_irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
-  if (n == 0) return ncclSuccess;
-  if (n != 1) return ncclInvalidArgument;
-  return ncclNet_v4->irecv(recvComm, data[0], sizes[0], mhandles[0], request);
+static ncclResult_t ncclNet_v6_as_v7_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
+  return ncclNet_v6->accept(listenComm, recvComm);
 }
 
-static ncclResult_t ncclNet_v4_as_v6_iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
-  if (n == 0) return ncclSuccess;
-  if (n != 1) return ncclInvalidArgument;
-  return ncclNet_v4->iflush(recvComm, data[0], sizes[0], mhandles[0], request);
-}
-
-// We use a wrapper around the v4 init to copy over the struct contents
-// post-init since they may not be initialized before hand.
-static ncclResult_t ncclNet_v4_as_v6_init(ncclDebugLogger_t logfn) {
-  NCCLCHECK(ncclNet_v4->init(logfn));
-  ncclNet_v4_as_v6.name = ncclNet_v4->name;
-  ncclNet_v4_as_v6.devices = ncclNet_v4->devices;
-  ncclNet_v4_as_v6.getProperties = ncclNet_v4_as_v6_getProperties;
-  ncclNet_v4_as_v6.listen = ncclNet_v4->listen;
-  ncclNet_v4_as_v6.connect = ncclNet_v4->connect;
-  ncclNet_v4_as_v6.accept = ncclNet_v4->accept;
-  ncclNet_v4_as_v6.regMr = ncclNet_v4->regMr;
-  ncclNet_v4_as_v6.regMrDmaBuf = NULL;
-  ncclNet_v4_as_v6.deregMr = ncclNet_v4->deregMr;
-  ncclNet_v4_as_v6.isend = ncclNet_v4_as_v6_isend;
-  ncclNet_v4_as_v6.irecv = ncclNet_v4_as_v6_irecv;
-  ncclNet_v4_as_v6.iflush = ncclNet_v4_as_v6_iflush;
-  ncclNet_v4_as_v6.test = ncclNet_v4->test;
-  ncclNet_v4_as_v6.closeSend = ncclNet_v4->closeSend;
-  ncclNet_v4_as_v6.closeRecv = ncclNet_v4->closeRecv;
-  ncclNet_v4_as_v6.closeListen = ncclNet_v4->closeListen;
+static ncclResult_t ncclNet_v6_as_v7_init(ncclDebugLogger_t logfn) {
+  NCCLCHECK(ncclNet_v6->init(logfn));
+  ncclNet_v6_as_v7.name = ncclNet_v6->name;
+  ncclNet_v6_as_v7.devices = ncclNet_v6->devices;
+  ncclNet_v6_as_v7.getProperties = ncclNet_v6_as_v7_getProperties; // ncclNet_v5->getProperties;
+  ncclNet_v6_as_v7.listen = ncclNet_v6->listen;
+  ncclNet_v6_as_v7.connect = ncclNet_v6_as_v7_connect;
+  ncclNet_v6_as_v7.accept =  ncclNet_v6_as_v7_accept;
+  ncclNet_v6_as_v7.regMr = ncclNet_v6->regMr;
+  ncclNet_v6_as_v7.regMrDmaBuf = ncclNet_v6->regMrDmaBuf;
+  ncclNet_v6_as_v7.deregMr = ncclNet_v6->deregMr;
+  ncclNet_v6_as_v7.isend = ncclNet_v6->isend;
+  ncclNet_v6_as_v7.irecv = ncclNet_v6->irecv;
+  ncclNet_v6_as_v7.iflush = ncclNet_v6->iflush;
+  ncclNet_v6_as_v7.test = ncclNet_v6->test;
+  ncclNet_v6_as_v7.closeSend = ncclNet_v6->closeSend;
+  ncclNet_v6_as_v7.closeRecv = ncclNet_v6->closeRecv;
+  ncclNet_v6_as_v7.closeListen = ncclNet_v6->closeListen;
+  ncclNet_v6_as_v7.getDeviceMr = NULL;
+  ncclNet_v6_as_v7.irecvConsumed = NULL;
   return ncclSuccess;
 }
 
+static ncclResult_t ncclNet_v5_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) { // fills a v7 properties struct from a v5 plugin's answer
+  ncclNetProperties_v6_t p6; // NOTE(review): a v6 struct is handed to the v5 plugin — presumably the v5 and v6 layouts are identical; confirm
+  ncclResult_t ans = ncclNet_v5->getProperties(dev, &p6);
+  if (ans != ncclSuccess) return ans; // on failure *props is left untouched
+  props->name = p6.name;
+  props->pciPath = p6.pciPath;
+  props->guid = p6.guid;
+  props->ptrSupport = p6.ptrSupport;
+  props->speed = p6.speed;
+  props->port = p6.port;
+  props->maxComms = p6.maxComms;
+  props->maxRecvs = p6.maxRecvs;
+  props->latency = p6.latency;
+  props->netDeviceType    = NCCL_NET_DEVICE_HOST; // not taken from the plugin: always reported as HOST
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; // not taken from the plugin: always the invalid-version marker
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclNet_v5_as_v7_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { // v7 signature adapter: the device-handle argument is ignored
+  return ncclNet_v5->connect(dev, handle, sendComm);
+}
+
+static ncclResult_t ncclNet_v5_as_v7_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { // v7 signature adapter: the device-handle argument is ignored
+  return ncclNet_v5->accept(listenComm, recvComm);
+}
+
 // We use a wrapper around the v5 init to copy over the struct contents
 // post-init since they may not be initialized before hand.
-static ncclResult_t ncclNet_v5_as_v6_init(ncclDebugLogger_t logfn) {
+static ncclResult_t ncclNet_v5_as_v7_init(ncclDebugLogger_t logfn) {
   NCCLCHECK(ncclNet_v5->init(logfn));
-  ncclNet_v5_as_v6.name = ncclNet_v5->name;
-  ncclNet_v5_as_v6.devices = ncclNet_v5->devices;
-  ncclNet_v5_as_v6.getProperties = ncclNet_v5->getProperties;
-  ncclNet_v5_as_v6.listen = ncclNet_v5->listen;
-  ncclNet_v5_as_v6.connect = ncclNet_v5->connect;
-  ncclNet_v5_as_v6.accept = ncclNet_v5->accept;
-  ncclNet_v5_as_v6.regMr = ncclNet_v5->regMr;
-  ncclNet_v5_as_v6.regMrDmaBuf = NULL;
-  ncclNet_v5_as_v6.deregMr = ncclNet_v5->deregMr;
-  ncclNet_v5_as_v6.isend = ncclNet_v5->isend;
-  ncclNet_v5_as_v6.irecv = ncclNet_v5->irecv;
-  ncclNet_v5_as_v6.iflush = ncclNet_v5->iflush;
-  ncclNet_v5_as_v6.test = ncclNet_v5->test;
-  ncclNet_v5_as_v6.closeSend = ncclNet_v5->closeSend;
-  ncclNet_v5_as_v6.closeRecv = ncclNet_v5->closeRecv;
-  ncclNet_v5_as_v6.closeListen = ncclNet_v5->closeListen;
+  ncclNet_v5_as_v7.name = ncclNet_v5->name;
+  ncclNet_v5_as_v7.devices = ncclNet_v5->devices;
+  ncclNet_v5_as_v7.getProperties = ncclNet_v5_as_v7_getProperties;
+  ncclNet_v5_as_v7.listen = ncclNet_v5->listen;
+  ncclNet_v5_as_v7.connect = ncclNet_v5_as_v7_connect;
+  ncclNet_v5_as_v7.accept =  ncclNet_v5_as_v7_accept;
+  ncclNet_v5_as_v7.regMr = ncclNet_v5->regMr;
+  ncclNet_v5_as_v7.regMrDmaBuf = NULL;
+  ncclNet_v5_as_v7.deregMr = ncclNet_v5->deregMr;
+  ncclNet_v5_as_v7.isend = ncclNet_v5->isend;
+  ncclNet_v5_as_v7.irecv = ncclNet_v5->irecv;
+  ncclNet_v5_as_v7.iflush = ncclNet_v5->iflush;
+  ncclNet_v5_as_v7.test = ncclNet_v5->test;
+  ncclNet_v5_as_v7.closeSend = ncclNet_v5->closeSend;
+  ncclNet_v5_as_v7.closeRecv = ncclNet_v5->closeRecv;
+  ncclNet_v5_as_v7.closeListen = ncclNet_v5->closeListen;
+  ncclNet_v5_as_v7.getDeviceMr = NULL;
+  ncclNet_v5_as_v7.irecvConsumed = NULL;
   return ncclSuccess;
 }
 
-static ncclResult_t ncclCollNet_v4_as_v6_getProperties(int dev, ncclNetProperties_v6_t* props) {
-  ncclNetProperties_v4_t p4;
-  ncclResult_t ans = ncclCollNet_v4->getProperties(dev, &p4);
+static ncclResult_t ncclCollNet_v5_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) {
+  ncclNetProperties_v6_t p6;
+  ncclResult_t ans = ncclCollNet_v5->getProperties(dev, &p6);
   if (ans != ncclSuccess) return ans;
-  props->name = p4.name;
-  props->pciPath = p4.pciPath;
-  props->guid = p4.guid;
-  props->ptrSupport = p4.ptrSupport;
-  props->speed = p4.speed;
-  props->port = p4.port;
-  props->maxComms = p4.maxComms;
-  props->maxRecvs = 1;
-  props->latency = 0;
-  return ncclSuccess;
-}
-
-// We use a wrapper around the v4 init to copy over the struct contents
-// post-init since they may not be initialized before hand.
-static ncclResult_t ncclCollNet_v4_as_v6_init(ncclDebugLogger_t logfn) {
-  NCCLCHECK(ncclCollNet_v4->init(logfn));
-  ncclCollNet_v4_as_v6.name = ncclCollNet_v4->name;
-  ncclCollNet_v4_as_v6.devices = ncclCollNet_v4->devices;
-  ncclCollNet_v4_as_v6.getProperties = ncclCollNet_v4_as_v6_getProperties;
-  ncclCollNet_v4_as_v6.listen = ncclCollNet_v4->listen;
-  ncclCollNet_v4_as_v6.connect = ncclCollNet_v4->connect;
-  ncclCollNet_v4_as_v6.reduceSupport = ncclCollNet_v4->reduceSupport;
-  ncclCollNet_v4_as_v6.regMr = ncclCollNet_v4->regMr;
-  ncclCollNet_v4_as_v6.regMrDmaBuf = NULL;
-  ncclCollNet_v4_as_v6.deregMr = ncclCollNet_v4->deregMr;
-  ncclCollNet_v4_as_v6.iallreduce = ncclCollNet_v4->iallreduce;
-  ncclCollNet_v4_as_v6.iflush = ncclCollNet_v4->iflush;
-  ncclCollNet_v4_as_v6.test = ncclCollNet_v4->test;
-  ncclCollNet_v4_as_v6.closeColl = ncclCollNet_v4->closeColl;
-  ncclCollNet_v4_as_v6.closeListen = ncclCollNet_v4->closeListen;
+  props->name = p6.name;
+  props->pciPath = p6.pciPath;
+  props->guid = p6.guid;
+  props->ptrSupport = p6.ptrSupport;
+  props->speed = p6.speed;
+  props->port = p6.port;
+  props->maxComms = p6.maxComms;
+  props->maxRecvs = p6.maxRecvs;
+  props->latency = p6.latency;
+  props->netDeviceType    = NCCL_NET_DEVICE_HOST;
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
   return ncclSuccess;
 }
 
 // We use a wrapper around the v5 init to copy over the struct contents
 // post-init since they may not be initialized before hand.
-static ncclResult_t ncclCollNet_v5_as_v6_init(ncclDebugLogger_t logfn) {
+static ncclResult_t ncclCollNet_v5_as_v7_init(ncclDebugLogger_t logfn) {
   NCCLCHECK(ncclCollNet_v5->init(logfn));
-  ncclCollNet_v5_as_v6.name = ncclCollNet_v5->name;
-  ncclCollNet_v5_as_v6.devices = ncclCollNet_v5->devices;
-  ncclCollNet_v5_as_v6.getProperties = ncclCollNet_v5->getProperties;
-  ncclCollNet_v5_as_v6.listen = ncclCollNet_v5->listen;
-  ncclCollNet_v5_as_v6.connect = ncclCollNet_v5->connect;
-  ncclCollNet_v5_as_v6.reduceSupport = ncclCollNet_v5->reduceSupport;
-  ncclCollNet_v5_as_v6.regMr = ncclCollNet_v5->regMr;
-  ncclCollNet_v5_as_v6.regMrDmaBuf = NULL;
-  ncclCollNet_v5_as_v6.deregMr = ncclCollNet_v5->deregMr;
-  ncclCollNet_v5_as_v6.iallreduce = ncclCollNet_v5->iallreduce;
-  ncclCollNet_v5_as_v6.iflush = ncclCollNet_v5->iflush;
-  ncclCollNet_v5_as_v6.test = ncclCollNet_v5->test;
-  ncclCollNet_v5_as_v6.closeColl = ncclCollNet_v5->closeColl;
-  ncclCollNet_v5_as_v6.closeListen = ncclCollNet_v5->closeListen;
+  ncclCollNet_v5_as_v7.name = ncclCollNet_v5->name;
+  ncclCollNet_v5_as_v7.devices = ncclCollNet_v5->devices;
+  ncclCollNet_v5_as_v7.getProperties = ncclCollNet_v5_as_v7_getProperties;
+  ncclCollNet_v5_as_v7.listen = ncclCollNet_v5->listen;
+  ncclCollNet_v5_as_v7.connect = ncclCollNet_v5->connect;
+  ncclCollNet_v5_as_v7.reduceSupport = ncclCollNet_v5->reduceSupport;
+  ncclCollNet_v5_as_v7.regMr = ncclCollNet_v5->regMr;
+  ncclCollNet_v5_as_v7.regMrDmaBuf = NULL;
+  ncclCollNet_v5_as_v7.deregMr = ncclCollNet_v5->deregMr;
+  ncclCollNet_v5_as_v7.iallreduce = ncclCollNet_v5->iallreduce;
+  ncclCollNet_v5_as_v7.iflush = ncclCollNet_v5->iflush;
+  ncclCollNet_v5_as_v7.test = ncclCollNet_v5->test;
+  ncclCollNet_v5_as_v7.closeColl = ncclCollNet_v5->closeColl;
+  ncclCollNet_v5_as_v7.closeListen = ncclCollNet_v5->closeListen;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclCollNet_v6_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) { // fills a v7 properties struct from a v6 collnet plugin's answer
+  ncclNetProperties_v6_t p6;
+  ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6);
+  if (ans != ncclSuccess) return ans; // on failure *props is left untouched
+  props->name = p6.name;
+  props->pciPath = p6.pciPath;
+  props->guid = p6.guid;
+  props->ptrSupport = p6.ptrSupport;
+  props->speed = p6.speed;
+  props->port = p6.port;
+  props->maxComms = p6.maxComms;
+  props->maxRecvs = p6.maxRecvs;
+  props->latency = p6.latency;
+  props->netDeviceType    = NCCL_NET_DEVICE_HOST; // not taken from the plugin: always reported as HOST
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; // not taken from the plugin: always the invalid-version marker
+  return ncclSuccess;
+}
+
+// We use a wrapper around the v6 init to copy over the struct contents
+// post-init since they may not be initialized beforehand.
+static ncclResult_t ncclCollNet_v6_as_v7_init(ncclDebugLogger_t logfn) {
+  NCCLCHECK(ncclCollNet_v6->init(logfn));
+  ncclCollNet_v6_as_v7.name = ncclCollNet_v6->name;
+  ncclCollNet_v6_as_v7.devices = ncclCollNet_v6->devices;
+  ncclCollNet_v6_as_v7.getProperties = ncclCollNet_v6_as_v7_getProperties;
+  ncclCollNet_v6_as_v7.listen = ncclCollNet_v6->listen;
+  ncclCollNet_v6_as_v7.connect = ncclCollNet_v6->connect;
+  ncclCollNet_v6_as_v7.reduceSupport = ncclCollNet_v6->reduceSupport;
+  ncclCollNet_v6_as_v7.regMr = ncclCollNet_v6->regMr;
+  ncclCollNet_v6_as_v7.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf;
+  ncclCollNet_v6_as_v7.deregMr = ncclCollNet_v6->deregMr;
+  ncclCollNet_v6_as_v7.iallreduce = ncclCollNet_v6->iallreduce;
+  ncclCollNet_v6_as_v7.iflush = ncclCollNet_v6->iflush;
+  ncclCollNet_v6_as_v7.test = ncclCollNet_v6->test;
+  ncclCollNet_v6_as_v7.closeColl = ncclCollNet_v6->closeColl;
+  ncclCollNet_v6_as_v7.closeListen = ncclCollNet_v6->closeListen;
   return ncclSuccess;
 }
 
@@ -167,7 +215,7 @@ enum ncclNetState ncclCollNetStates[3] = { ncclNetStateInit, ncclNetStateInit, n
 
 ncclResult_t ncclNetPluginInit() {
   char ncclNetPluginName[128];
-  const char* envPluginName = getenv("NCCL_NET_PLUGIN");
+  const char* envPluginName = ncclGetEnv("NCCL_NET_PLUGIN");
   if (envPluginName && strlen(envPluginName)) {
     snprintf(ncclNetPluginName, 128, "libnccl-net-%s.so", envPluginName);
     INFO(NCCL_INIT, "Plugin name set by env to %s", ncclNetPluginName);
@@ -176,62 +224,97 @@ ncclResult_t ncclNetPluginInit() {
   }
   void* netPluginLib = dlopen(ncclNetPluginName, RTLD_NOW | RTLD_LOCAL);
   if (netPluginLib == nullptr) {
-    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load (%s) returned %d : %s", ncclNetPluginName, errno, dlerror());
-    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found, using internal implementation");
+    // dlopen does not guarantee to set errno, but dlerror only gives us a
+    // string, so checking errno doesn't hurt to try to provide a better
+    // error message
+    if (errno == ENOENT) {
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : dlerror=%s No plugin found (%s), using internal implementation", dlerror(), ncclNetPluginName);
+      // exit(-1);
+    } else {
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
+    }
     return ncclSuccess;
   }
 
-  ncclNets[0] = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6");
+  ncclNets[0] = (ncclNet_v7_t*)dlsym(netPluginLib, "ncclNetPlugin_v7");
   if (ncclNets[0] == nullptr) {
-    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v6 symbol.");
-    // Try v5 plugin
-    ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
-    if (ncclNet_v5 == nullptr) {
-      ncclNet_v4 = (ncclNet_v4_t*)dlsym(netPluginLib, "ncclNetPlugin_v4");
-      if (ncclNet_v4 == nullptr) {
-        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (v4 or v5).");
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v7 symbol.");
+    // Try v6 plugin
+    ncclNet_v6 = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6");
+    if (ncclNet_v6 == nullptr) {
+      // Try v5 plugin
+      ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
+      if (ncclNet_v5 == nullptr) {
+        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported.");
         if (netPluginLib != nullptr) dlclose(netPluginLib);
         return ncclSuccess;
+      } else {
+        ncclNets[0] = &ncclNet_v5_as_v7;
+        ncclNet_v5_as_v7.init = ncclNet_v5_as_v7_init;
+        // Set the name right away to allow for NCCL_NET=... to work
+        ncclNet_v5_as_v7.name = ncclNet_v5->name;
+        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name);
       }
-      ncclNets[0] = &ncclNet_v4_as_v6;
-      ncclNet_v4_as_v6.init = ncclNet_v4_as_v6_init;
-      // Set the name right away to allow for NCCL_NET=... to work
-      ncclNet_v4_as_v6.name = ncclNet_v4->name;
-      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v4)", ncclNets[0]->name);
     } else {
-      ncclNets[0] = &ncclNet_v5_as_v6;
-      ncclNet_v5_as_v6.init = ncclNet_v5_as_v6_init;
+      ncclNets[0] = &ncclNet_v6_as_v7;
+      ncclNet_v6_as_v7.init = ncclNet_v6_as_v7_init;
       // Set the name right away to allow for NCCL_NET=... to work
-      ncclNet_v5_as_v6.name = ncclNet_v5->name;
-      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name);
+      ncclNet_v6_as_v7.name = ncclNet_v6->name;
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNets[0]->name);
     }
   }
 
   // Check for CollNet
-  ncclCollNets[0] = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6");
+  ncclCollNets[0] = (ncclCollNet_v7_t*) dlsym(netPluginLib, "ncclCollNetPlugin_v7");
   if (ncclCollNets[0] == nullptr) {
-    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol.");
-    ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
-    if (ncclCollNet_v5 == nullptr) {
-      ncclCollNet_v4 = (ncclCollNet_v4_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v4");
-      if (ncclCollNet_v4 == nullptr) {
-        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5).");
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v7 symbol.");
+    ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6");
+    if (ncclCollNet_v6 == nullptr) {
+      ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
+      if (ncclCollNet_v5 == nullptr) {
+        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). ncclCollNetPlugin symbols v4 and lower are not supported.");
       } else {
-        ncclCollNets[0] = &ncclCollNet_v4_as_v6;
-        ncclCollNet_v4_as_v6.init = ncclCollNet_v4_as_v6_init;
-        ncclCollNet_v4_as_v6.name = ncclCollNet_v4->name;
-        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v4)", ncclCollNets[0]->name);
+        ncclCollNets[0] = &ncclCollNet_v5_as_v7;
+        ncclCollNet_v5_as_v7.init = ncclCollNet_v5_as_v7_init;
+        ncclCollNet_v5_as_v7.name = ncclCollNet_v5->name;
+        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v5)", ncclCollNets[0]->name);
       }
     } else {
-      ncclCollNets[0] = &ncclCollNet_v5_as_v6;
-      ncclCollNet_v5_as_v6.init = ncclCollNet_v5_as_v6_init;
-      ncclCollNet_v5_as_v6.name = ncclCollNet_v5->name;
-      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v5)", ncclCollNets[0]->name);
+      ncclCollNets[0] = &ncclCollNet_v6_as_v7;
+      ncclCollNet_v6_as_v7.init = ncclCollNet_v6_as_v7_init;
+      ncclCollNet_v6_as_v7.name = ncclCollNet_v6->name;
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v6)", ncclCollNets[0]->name);
     }
   }
   return ncclSuccess;
 }
 
+ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, int dev) {  // Validates the device-offload type/version that net->getProperties(dev) reports; comm is not used here
+  ncclNetProperties_t props;
+
+  NCCLCHECK(net->getProperties(dev, &props));
+  ncclNetDeviceType type = props.netDeviceType;
+  if (type) switch (type) {  // NCCL_NET_DEVICE_HOST (0) skips the switch and falls through to the final INFO
+    case NCCL_NET_DEVICE_UNPACK:
+      if (props.netDeviceVersion == NCCL_NET_DEVICE_UNPACK_VERSION) {  // only an exact version match is accepted
+        INFO(NCCL_INIT, "Using NCCL_NET_DEVICE_UNPACK net plugin version %d",
+          props.netDeviceVersion);
+        return ncclSuccess;
+      } else {
+        WARN("NCCL_NET_DEVICE_UNPACK plugin has incompatible version %d, this NCCL build is compatible with %d, not using it",
+          props.netDeviceVersion, NCCL_NET_DEVICE_UNPACK_VERSION);
+        return ncclInternalError;
+      }
+    default:  // any non-zero type other than UNPACK is rejected
+      WARN("Unknown device code index");
+      return ncclInternalError;
+  }
+
+  INFO(NCCL_INIT, "Using non-device net plugin version %d",
+    props.netDeviceVersion);
+  return ncclSuccess;  // host (non-offload) plugins are always accepted
+}
+
 static ncclResult_t netGetState(int i, enum ncclNetState* state) {
   pthread_mutex_lock(&netLock);
   if (ncclNetStates[i] == ncclNetStateInit) {
@@ -268,6 +351,10 @@ ncclResult_t ncclNetInit(struct ncclComm* comm) {
     NCCLCHECK(netGetState(i, &state));
     if (state != ncclNetStateEnabled) continue;
     if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue;
+    if (ncclSuccess != ncclNetCheckDeviceVersion(comm, ncclNets[i], 0)) {
+      // Mismatched device plugin version
+      continue;
+    }
 
     comm->ncclNet = ncclNets[i];
     ok = true;
@@ -334,10 +421,10 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
       }
 
       if (sComm == NULL)
-        NCCLCHECKGOTO(comm->ncclNet->connect(dev, &handle, &sComm), ret, cleanup2);
+        NCCLCHECKGOTO(comm->ncclNet->connect(dev, &handle, &sComm, NULL), ret, cleanup2);
 
       if (rComm == NULL)
-        NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm), ret, cleanup2);
+        NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm, NULL), ret, cleanup2);
 
       connected = (rComm != NULL) && (sComm != NULL);
     }
@@ -366,5 +453,11 @@ cleanup1:
 }
 
 int ncclNetVersion(struct ncclComm* comm) {
-  return (comm->ncclNet == &ncclNet_v4_as_v6) ? 4 : ((comm->ncclNet == &ncclNet_v5_as_v6) ? 5 : 6);
+  if (comm->ncclNet == &ncclNet_v5_as_v7) {  // comm uses the v5-to-v7 compatibility table
+    return 5;
+  } else if (comm->ncclNet == &ncclNet_v6_as_v7) {  // comm uses the v6-to-v7 compatibility table
+    return 6;
+  } else {
+    return 7;  // any other net table is reported as version 7
+  }
 }
diff --git a/src/proxy.cc b/src/proxy.cc
index 9756c93dbb..976b1d3ba5 100644
--- a/src/proxy.cc
+++ b/src/proxy.cc
@@ -15,6 +15,16 @@
 
 #include <sys/syscall.h>
 #include <assert.h>
+#include <unistd.h>
+#include <sys/time.h>
+
+#define PROGRESS_RUNNING 0
+#define PROGRESS_REQUEST_STOP 1
+#define PROGRESS_ABORT 2
+#define PROGRESS_COMPLETE 3
+
+#define SERVICE_RUNNING 0
+#define SERVICE_COMPLETE 1
 
 enum { proxyRecv=0, proxySend=1 };
 
@@ -50,7 +60,7 @@ static void expectedProxyResponseFree(struct ncclProxyState* state) {
   }
 }
 
-static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, void* opId, void* respBuff, int respSize) {
+static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, void* opId, void* respBuff, int respSize, ncclResult_t res) {
   struct ncclExpectedProxyResponse* elem = state->expectedResponses;
   while (elem) {
     if (elem->opId == opId) {
@@ -67,6 +77,7 @@ static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, voi
       memcpy(elem->respBuff, respBuff, respSize);
       free(respBuff);
       elem->done = true;
+      elem->res  = res;
       return ncclSuccess;
     }
     elem = elem->next;
@@ -84,6 +95,7 @@ static ncclResult_t expectedProxyResponseEnqueue(struct ncclProxyState* state, v
   // Pre-alloc response buffer
   ex->respBuff = malloc(respSize);
   ex->respSize = respSize;
+  ex->res      = ncclInternalError;
   ex->done     = false;
 
   // Enqueue
@@ -109,10 +121,11 @@ static ncclResult_t expectedProxyResponseDequeue(struct ncclProxyState* state, v
         prev->next = elem->next;
       }
       memcpy(respBuff, elem->respBuff, elem->respSize);
+      ncclResult_t res = elem->res;
       free(elem->respBuff);
       free(elem);
       *found = 1;
-      return ncclSuccess;
+      return res;
     }
     prev = elem;
     elem = elem->next;
@@ -509,7 +522,7 @@ static ncclResult_t SaveProxy(struct ncclComm* comm, struct ncclChannel* channel
         type == proxyRecv ? "recv" : "send", peer, channel->id, connIndex);
     return ncclInternalError;
   }
-  if (connector->transportComm->proxyProgress == NULL) return ncclSuccess;
+  if (connector->proxyConn.proxyProgress == NULL) return ncclSuccess;
 
   if (justInquire) *justInquire = true;
   else {
@@ -707,13 +720,13 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int
 
   if (state->active == NULL) {
     pthread_mutex_lock(&pool->mutex);
-    while (pool->nextOps == -1 && !state->stop) {
+    while (pool->nextOps == -1 && state->stop == PROGRESS_RUNNING) {
       struct ncclProxyArgs profArgs; // Only used for profiling purposes
       ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileSleep);
       pthread_cond_wait(&pool->cond, &pool->mutex);
       ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileWakeup);
     }
-    if (state->stop) { // We might have been woken up to stop.
+    if (state->stop != PROGRESS_RUNNING) { // We might have been woken up to stop.
       pthread_mutex_unlock(&pool->mutex);
       return ncclSuccess;
     }
@@ -851,12 +864,13 @@ void* ncclProxyProgress(void *proxyState_) {
    * frequency of calling ncclProxyGetPostedOps() and reduce the perf impact. */
   int proxyOpAppendCounter = 0;
   struct ncclProxyArgs profArgs; // Only used for profiling purposes
-  while ((state->stop == false || (state->stop == true && state->active)) && *proxyState->abortFlag == 0) {
+  while (state->stop == PROGRESS_RUNNING || (state->stop == PROGRESS_REQUEST_STOP && state->active)) {
     int idle = 1;
     ncclResult_t ret = progressOps(proxyState, state, state->active, &idle);
     if (ret != ncclSuccess) {
-      INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
-      return NULL;
+      __atomic_store_n(&proxyState->asyncResult, ret, __ATOMIC_RELEASE);
+      INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret);
+      continue;
     }
     if (lastIdle == 0 && idle == 1) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileIdle);
     if (lastIdle == 1 && idle == 0) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileActive);
@@ -864,11 +878,12 @@ void* ncclProxyProgress(void *proxyState_) {
       int added = 0;
       proxyOpAppendCounter = 0;
       TIME_START(3);
-      if (state->stop == false)
+      if (state->stop == PROGRESS_RUNNING)
         ret = ncclProxyGetPostedOps(proxyState, &added);
       if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); }
       if (ret != ncclSuccess) {
-        INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
+        __atomic_store_n(&proxyState->asyncResult, ret, __ATOMIC_RELEASE);
+        INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret);
       }
       if (added == 0) {
         sched_yield(); // No request progressed. Let others run.
@@ -876,6 +891,9 @@ void* ncclProxyProgress(void *proxyState_) {
     }
     lastIdle = idle;
   }
+
+  /* progress service thread should be waiting for me, I need to notify it. */
+  __atomic_store_n(&state->stop, PROGRESS_COMPLETE, __ATOMIC_RELEASE);
   return NULL;
 }
 
@@ -898,7 +916,11 @@ ncclResult_t ncclProxyStart(struct ncclComm* comm) {
 static ncclResult_t ncclProxyProgressCreate(struct ncclProxyState* proxyState) {
   struct ncclProxyProgressState* state = &proxyState->progressState;
   if (!state->thread) {
-    pthread_create(&state->thread, NULL, ncclProxyProgress, proxyState);
+    pthread_attr_t attr;
+    SYSCHECK(pthread_attr_init(&attr), "pthread_attr_init");  // NOTE(review): pthread_* calls return an error number rather than -1/errno -- confirm SYSCHECK detects that
+    SYSCHECK(pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED), "pthread_attr_setdetachstate");  // the progress thread is started detached, so it is not joined
+    SYSCHECK(pthread_create(&state->thread, &attr, ncclProxyProgress, proxyState), "pthread_create");  // NOTE(review): if SYSCHECK returns early here, attr is presumably never destroyed -- confirm
+    SYSCHECK(pthread_attr_destroy(&attr), "pthread_attr_destroy");
     ncclSetThreadName(state->thread, "NCCL Progress%2d", proxyState->tpLocalnRanks);
   }
   return ncclSuccess;
 }
@@ -910,10 +932,17 @@ ncclResult_t ncclProxyProgressDestroy(struct ncclProxyState* proxyState) {
   // Request the proxy to stop and then wake it
   if (state->opsPool) {
     pthread_mutex_lock(&state->opsPool->mutex);
-    state->stop = true;
+    if (*proxyState->abortFlag == 0) 
+      state->stop = PROGRESS_REQUEST_STOP;
+    else
+      state->stop = PROGRESS_ABORT;
     pthread_cond_signal(&state->opsPool->cond);
     pthread_mutex_unlock(&state->opsPool->mutex);
-    pthread_join(state->thread, NULL);
+    /* progress thread is always detached, wait for it to exit. */
+    uint64_t t0 = clockNano();
+    while (__atomic_load_n(&state->stop, __ATOMIC_ACQUIRE) != PROGRESS_COMPLETE) {
+      if (clockNano() - t0 >= 1000) sched_yield();
+    }
   }
 
   // Free off any memory allocated for the proxy arg pools
@@ -1005,7 +1034,7 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
   int ready, proxyRank = -1;
   struct ncclProxyState* sharedProxyState = comm->proxyState;
 
-  // Keep one connection per mlocal rank
+  // Keep one connection per local rank
   for (int i = 0; i < comm->localRanks; ++i) {
     /* find the proxy rank in comm. */
     if (comm->topParentRanks[comm->localRankToRank[i]] == tpProxyRank) {
@@ -1058,42 +1087,43 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
       proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1;
     }
   }
-  INFO(NCCL_NET|NCCL_PROXY, "Connection to proxy localRank %d -> connection %p", proxyConn->tpLocalRank, proxyConn->connection);
+  INFO(NCCL_NET|NCCL_PROXY, "Connected to proxy localRank %d -> connection %p", proxyConn->tpLocalRank, proxyConn->connection);
   return ncclSuccess;
 }
 
 // cuMem API support
 // The response is sent out-of-band using ncclIpcSocket for this specific command
-ncclResult_t ncclProxyClientConvertFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int fd, int* convertedFd) {
+ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void *handle, int* convertedFd) {  // Asks the proxy for an fd for *handle, receives it over UDS into *convertedFd, then waits for the proxy's reply
   ncclResult_t ret = ncclSuccess;
   ncclResult_t res = ncclInProgress;
   struct ncclIpcSocket ipcSock = { 0 };
-  void* opId = malloc(1);
+  void *opId = (void*)((((uintptr_t)random()) << 32) | random());  // random tag for this request; also passed to ncclIpcSocketInit below
+
   // Create a UDS socket to receive the converted fd
   NCCLCHECK(ncclIpcSocketInit(&ipcSock, comm->topParentLocalRanks[comm->localRank], (uint64_t)opId, comm->abortFlag));
 
-  // Request the conversion of the fd over sockets
-  NCCLCHECKGOTO(ncclProxyCallAsync(comm, proxyConn, ncclProxyMsgConvertFd, &fd, sizeof(int), 0, opId), ret, error);
+  // Request the allocation of a UDS fd for the handle over sockets
+  NCCLCHECKGOTO(ncclProxyCallAsync(comm, proxyConn, ncclProxyMsgGetFd, handle, sizeof(CUmemGenericAllocationHandle), 0, opId), ret, error);
 
-  // Receive converted fd over UDS
-  NCCLCHECK(ncclIpcSocketRecvFd(&ipcSock, convertedFd));
-  TRACE(NCCL_PROXY, "UDS: ConvertFd rank %d returned %p %d", proxyConn->tpLocalRank, convertedFd, *convertedFd);
-  NCCLCHECK(ncclIpcSocketClose(&ipcSock));
+  // Receive the converted fd over UDS
+  NCCLCHECKGOTO(ncclIpcSocketRecvFd(&ipcSock, convertedFd), ret, error);
+  TRACE(NCCL_PROXY, "UDS: ClientGetFd handle 0x%lx rank %d returned fd %d", *(uint64_t*)handle, proxyConn->tpLocalRank, *convertedFd);
+  NCCLCHECKGOTO(ncclIpcSocketClose(&ipcSock), ret, error);
 
+  // Wait for proxy response (sockets)
   while (res == ncclInProgress) {
     res = ncclPollProxyResponse(comm, proxyConn, NULL, opId);
   }
 
-  free(opId);
-  return res;
+  return res;  // report the proxy's result for this op; ret is always ncclSuccess on this path
 
 error:
   NCCLCHECK(ncclIpcSocketClose(&ipcSock));
-  WARN("ncclProxyClientConvertFd call to top parent rank %d failed", proxyConn->tpRank);
+  WARN("ncclProxyClientGetFd call to rank %d handle 0x%lx failed : %d", proxyConn->tpRank, *(uint64_t*)handle, ret);
   return ret;
 }
 
-const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "ConvertFd" };
+const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "GetFd" };
 ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId) {
   struct ncclSocket* sock;
   ncclResult_t ret = ncclSuccess;
@@ -1132,14 +1162,13 @@ ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnec
 
   // Check response queue
   int found = 0;
-  NCCLCHECK(expectedProxyResponseDequeue(sharedProxyState, opId, respBuff, &found));
+  ncclResult_t res = expectedProxyResponseDequeue(sharedProxyState, opId, respBuff, &found);
   if (found == 0) {
     // Attempt to read in a new response header from the proxy thread
     struct ncclSocket* sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank;
-
-    void* recvOpId;
+    ncclProxyRpcResponseHeader resp = {0};
     int offset = 0;
-    if (ncclSuccess != ncclSocketProgress(NCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset)) {
+    if (ncclSuccess != ncclSocketProgress(NCCL_SOCKET_RECV, sock, &resp, sizeof(resp), &offset)) {
       WARN("Socket recv failed while polling for opId=%p", opId);
       return ncclInternalError;
     }
@@ -1147,42 +1176,38 @@ ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnec
     if (offset == 0) {
       return ncclInProgress;
     // If we've returned a partial response, block to receive the rest of it
-    } else if (offset < sizeof(recvOpId)) {
-      while (offset < sizeof(recvOpId))
-        NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset));
+    } else if (offset < sizeof(resp)) {
+      while (offset < sizeof(resp))
+        NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &resp, sizeof(resp), &offset));
     }
 
-    INFO(NCCL_PROXY, "ncclPollProxyResponse Received new opId=%p", recvOpId);
-
-    // Now do a blocking recv of the response size
-    int respSize = 0;
-    NCCLCHECK(ncclSocketRecv(sock, &respSize, sizeof(respSize)));
+    INFO(NCCL_PROXY, "ncclPollProxyResponse Received new opId=%p", resp.opId);
 
     // If there's a respSize to recv
-    if (respSize > 0) {
-      if (recvOpId != opId) {
+    if (resp.respSize > 0) {
+      if (resp.opId != opId) {
         // Unexpected response, need to buffer the socket data
-        respBuff = malloc(respSize);
+        respBuff = malloc(resp.respSize);
       }
       assert(respBuff != NULL);
-      NCCLCHECK(ncclSocketRecv(sock, respBuff, respSize));
+      NCCLCHECK(ncclSocketRecv(sock, respBuff, resp.respSize));
     }
 
-    if (recvOpId == opId) {
-      INFO(NCCL_PROXY, "recvOpId=%p matches expected opId=%p", recvOpId, opId);
-      NCCLCHECK(expectedProxyResponseRemove(sharedProxyState, recvOpId));
-      return ncclSuccess;
+    if (resp.opId == opId) {
+      INFO(NCCL_PROXY, "resp.opId=%p matches expected opId=%p", resp.opId, opId);
+      NCCLCHECK(expectedProxyResponseRemove(sharedProxyState, resp.opId));
+      return resp.res;
     } else {
-      INFO(NCCL_PROXY, "Queuing opId=%p respBuff=%p respSize=%d", recvOpId, respBuff, respSize);
+      INFO(NCCL_PROXY, "Queuing opId=%p respBuff=%p respSize=%d", resp.opId, respBuff, resp.respSize);
       // Store the result and mark response as completed
-      NCCLCHECK(expectedProxyResponseStore(sharedProxyState, recvOpId, respBuff, respSize));
+      NCCLCHECK(expectedProxyResponseStore(sharedProxyState, resp.opId, respBuff, resp.respSize, resp.res));
       return ncclInProgress;
     }
   } else {
     INFO(NCCL_PROXY, "ncclPollProxyResponse Dequeued cached opId=%p", opId);
   }
 
-  return ncclSuccess;
+  return res;
 }
 
 ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) {
@@ -1284,38 +1309,52 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr
 }
 
 // cuMem API support
-static ncclResult_t proxyConvertFd(struct ncclProxyLocalPeer* peer, void *opId, struct ncclProxyState* proxyState, int fd) {
+static ncclResult_t proxyGetFd(struct ncclProxyLocalPeer* peer, void *opId, struct ncclProxyState* proxyState, uint64_t handle) {  // Exports `handle` as a POSIX fd and sends that fd to `peer` over a UDS socket
+#if CUDART_VERSION >= 11030
+  // cuMem API support
+  ncclResult_t ret = ncclSuccess;
   struct ncclIpcSocket ipcSock = { 0 };
   uint64_t hash = (uint64_t) opId;
+  INFO(NCCL_PROXY, "UDS proxyGetFd received handle 0x%lx peer %d opId %lx", handle, peer->tpLocalRank, hash);
 
-  INFO(NCCL_PROXY, "UDS proxyConvertFd received fd %d peer %d opId %lx", fd, peer->tpLocalRank, hash);
+  CUmemAllocationHandleType type = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
+  int fd = -1;
+
+  CUCHECK(cuMemExportToShareableHandle(&fd, handle, type, 0));  // runs before the socket is initialised, so there is nothing to clean up if it fails
   // Send back the converted fd using UDS
-  NCCLCHECK(ncclIpcSocketInit(&ipcSock, proxyState->tpRank, hash^1, proxyState->abortFlag));
-  NCCLCHECK(ncclIpcSocketSendFd(&ipcSock, fd, peer->tpLocalRank, hash));
+  NCCLCHECKGOTO(ncclIpcSocketInit(&ipcSock, proxyState->tpRank, hash^1, proxyState->abortFlag), ret, error);
+  NCCLCHECKGOTO(ncclIpcSocketSendFd(&ipcSock, fd, peer->tpLocalRank, hash), ret, error);
+error:  // also reached on success: the calls above fall through into this cleanup
   NCCLCHECK(ncclIpcSocketClose(&ipcSock));
-  return ncclSuccess;
+  // We can now safely close the exported fd
+  (void) close(fd);  // NOTE(review): if the socket close above fails, NCCLCHECK presumably returns first and fd stays open -- confirm
+  return ret;
+#else
+  return ncclInternalError;  // the request is rejected when CUDART_VERSION is below 11030
+#endif
 }
 
 static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclProxyState* proxyState, int* asyncOpCount, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool) {
   int done = 1;
+  ncclResult_t res = ncclInternalError;
   if (op->type == ncclProxyMsgSetup) {
     TRACE(NCCL_PROXY, "proxyProgressAsync::proxySetup() opId=%p", op->opId);
-    NCCLCHECK(op->connection->tcomm->proxySetup(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
+    res = op->connection->tcomm->proxySetup(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done);
   } else if (op->type == ncclProxyMsgConnect) {
     TRACE(NCCL_PROXY, "proxyProgressAsync::proxyConnect() opId=%p op.reqBuff=%p", op->opId, op->reqBuff);
-    NCCLCHECK(op->connection->tcomm->proxyConnect(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
+    res = op->connection->tcomm->proxyConnect(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done);
   } else if (op->type == ncclProxyMsgSharedInit) {
     int nChannels = (int) *op->reqBuff;
     TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgSharedInit opId=%p op.reqBuff=%p nChannels=%d", op->opId, op->reqBuff, nChannels);
-    if (op->connection->tcomm->proxySharedInit) NCCLCHECK(op->connection->tcomm->proxySharedInit(op->connection, proxyState, nChannels));
+    if (op->connection->tcomm->proxySharedInit) res = op->connection->tcomm->proxySharedInit(op->connection, proxyState, nChannels);
     __atomic_store_n(&op->connection->state, connSharedInitialized, __ATOMIC_RELEASE);
-  } else if (op->type == ncclProxyMsgConvertFd) {
-    int fd = *(int *)op->reqBuff;
-    TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgConvertFd opId=%p op.reqBuff=%p fd=%d", op->opId, op->reqBuff, fd);
-    NCCLCHECK(proxyConvertFd(peer, op->opId, proxyState, fd)); // cuMem API support
+  } else if (op->type == ncclProxyMsgGetFd) {
+    uint64_t handle = *(uint64_t*)op->reqBuff;
+    TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgGetFd opId=%p op.reqBuff=%p handle=0x%lx", op->opId, op->reqBuff, handle);
+    res = proxyGetFd(peer, op->opId, proxyState, handle); // cuMem API support
   } else if (op->type == ncclProxyMsgInit) {
     TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgInit opId=%p op.reqBuff=%p", op->opId, op->reqBuff);
-    NCCLCHECK(proxyConnInit(peer, connectionPool, proxyState, (ncclProxyInitReq*) op->reqBuff, (ncclProxyInitResp*) op->respBuff, &op->connection));
+    res = proxyConnInit(peer, connectionPool, proxyState, (ncclProxyInitReq*) op->reqBuff, (ncclProxyInitResp*) op->respBuff, &op->connection);
   } else return ncclInternalError;
 
   if (done) {
@@ -1329,11 +1368,10 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclP
      * to abort and close the connection, it can cause segfault if the requester is using
      * the respBuff. */
 
-    // Send the opId for referencing async operation
-    NCCLCHECK(ncclSocketSend(op->connection->sock, &op->opId, sizeof(op->opId)));
+    ncclProxyRpcResponseHeader resp = {op->opId, res, op->respSize};
 
-    // Send the response size
-    NCCLCHECK(ncclSocketSend(op->connection->sock, &op->respSize, sizeof(op->respSize)));
+    // Send the opId for referencing async operation
+    NCCLCHECK(ncclSocketSend(op->connection->sock, &resp, sizeof(resp)));
 
     if (op->respSize) {
       // Send the response
@@ -1386,7 +1424,7 @@ static bool proxyMatchOpType(int type) {
     case ncclProxyMsgSharedInit:
     case ncclProxyMsgSetup:
     case ncclProxyMsgConnect:
-    case ncclProxyMsgConvertFd:
+    case ncclProxyMsgGetFd:
       return true;
     default:
       return false;
@@ -1544,6 +1582,19 @@ void* ncclProxyService(void* _args) {
   ncclSocketClose(proxyState->listenSock);
   free(proxyState->listenSock);
   proxyOpsFree(proxyState);
+
+  if (*proxyState->abortFlag) {
+    /* abort happened, need to notify main thread I am done. */
+    __atomic_store_n(&proxyState->stop, SERVICE_COMPLETE, __ATOMIC_RELEASE);
+  }
+
+  if (ncclAtomicRefCountDecrement(proxyState->abortFlagRefCount) == 0) {
+    ncclCudaHostFree((void *)proxyState->abortFlag);
+    free((void*)proxyState->abortFlagRefCount);
+  }
+
+  /* proxy itself holds one internal ref count, needs to call ncclProxyDestroy */
+  ncclProxyDestroy(proxyState);
   return NULL;
 }
 
@@ -1552,8 +1603,16 @@ ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union
   NCCLCHECK(ncclCalloc(&comm->sharedRes->proxyState, 1));
   comm->proxyState = comm->sharedRes->proxyState;
   comm->proxyState->refCount = 1;
+  /* ref count for communicator and proxy service thread. */
+  comm->proxyState->internalRefCount = 2;
   comm->proxyState->listenSock = sock;
   comm->proxyState->peerAddresses = peerAddresses;
+  // Seed the random number generator for UDS filename generation
+  struct timeval time;
+  gettimeofday(&time,NULL);
+  unsigned int seed = time.tv_sec*time.tv_usec;
+  seed ^= getpid();
+  srandom(seed);
   return ncclSuccess;
 }
 
@@ -1568,6 +1627,8 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
     proxyState->tpLocalnRanks = comm->localRanks;
     proxyState->cudaDev = comm->cudaDev;
     proxyState->abortFlag = comm->abortFlag;
+    proxyState->abortFlagRefCount = comm->abortFlagRefCount;
+    ncclAtomicRefCountIncrement(comm->abortFlagRefCount);
     proxyState->p2pnChannels = comm->p2pnChannels;
     proxyState->p2pChunkSize = comm->p2pChunkSize;
     proxyState->nChannels = comm->nChannels;
@@ -1584,8 +1645,8 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
 }
 
 ncclResult_t ncclProxyStop(struct ncclComm* comm) {
-  if (comm->sharedRes && comm->sharedRes->proxyState) {
-    struct ncclProxyState* sharedProxyState = comm->sharedRes->proxyState;
+  if (comm->proxyState) {
+    struct ncclProxyState* sharedProxyState = comm->proxyState;
 
     if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) {
       if (sharedProxyState->peerAddresses) {
@@ -1625,15 +1686,41 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
   return ncclSuccess;
 }
 
-ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
-  struct ncclProxyState* sharedProxyState = comm->sharedRes->proxyState;
-
-  assert(sharedProxyState->refCount == 0);
-  free(sharedProxyState->peerAddresses);
-  free(sharedProxyState->peerSocks);
-  free(sharedProxyState->proxyOps);
-  free(sharedProxyState->sharedDevMems);
-  expectedProxyResponseFree(sharedProxyState);
-  free(sharedProxyState);
+ncclResult_t ncclProxyDestroy(struct ncclProxyState *proxyState) {  // Drops one internal reference and frees the proxy state when the count reaches zero
+  if (__atomic_sub_fetch(&proxyState->internalRefCount, 1, __ATOMIC_ACQ_REL) == 0) {  // only the caller that takes the count to zero frees
+    free(proxyState->peerAddresses);
+    free(proxyState->peerSocks);
+    free(proxyState->proxyOps);
+    free(proxyState->sharedDevMems);
+    expectedProxyResponseFree(proxyState);
+    free(proxyState);  // proxyState must not be touched after this point
+  }
+  return ncclSuccess;  // always succeeds, whether or not the state was freed
+}
+
+/* Join the proxy service thread if it finishes within about 5 seconds, otherwise detach it (used in case of abort). */
+ncclResult_t ncclProxyTryDetach(struct ncclProxyState *proxyState) {
+  if (proxyState && proxyState->thread) {
+    /* The proxy service thread can call cudaFreeHost to free pinned host mem, but
+     * that can hang if the main thread is issuing other CUDA calls. The solution
+     * should be to allocate/free pinned host mem using the cuMem* driver API; this
+     * 5 sec wait is just a workaround for now. */
+    bool join = false;
+    struct timespec start, now;
+    clock_gettime(CLOCK_MONOTONIC, &start);
+    do {  // NOTE(review): this loop polls without sleeping or yielding until the deadline
+      clock_gettime(CLOCK_MONOTONIC, &now);
+      if (__atomic_load_n(&proxyState->stop, __ATOMIC_ACQUIRE) == SERVICE_COMPLETE) {
+        /* proxy thread is done, join it. */
+        pthread_join(proxyState->thread, NULL);
+        join = true;
+        break;
+      }
+    } while(now.tv_sec - start.tv_sec < 5);  // compares whole seconds only, so the timeout is approximate
+    
+    if (join == false) {  // timed out: stop waiting and let the thread finish on its own
+      pthread_detach(proxyState->thread);
+    }
+  }
   return ncclSuccess;
 }
diff --git a/src/transport.cc b/src/transport.cc
index 9817beb183..c66a81ed7f 100644
--- a/src/transport.cc
+++ b/src/transport.cc
@@ -178,10 +178,32 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
     }
   }
 
-  // Clear all connect masks and free each connectInfo array
-  for (int i=1; i<comm->nRanks; i++) {
+  /* We need to sync ranks here since some ranks might run too fast after connection setup
+   * and start to destroy the connection after returning from this function; however, the
+   * others might still be trying to connect and import the buffer. Without this sync, a peer
+   * can end up with an invalid shmem/cuda buffer. In addition, we also clear all connect masks and free each connectInfo array */
+  for (int i = 1; i < comm->nRanks; i++) {
+    int bootstrapTag = (i << 8) + (graph ? graph->id + 1 : 0);
     int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
     int sendPeer = (comm->rank + i) % comm->nRanks;
+    int flag = 0;
+
+    if (recvPeer != sendPeer) {
+      if (comm->connectSend[sendPeer] != 0UL)
+        NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
+      if (comm->connectRecv[recvPeer] != 0UL)
+        NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
+
+      if (comm->connectSend[sendPeer] != 0UL)
+        NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
+      if (comm->connectRecv[recvPeer] != 0UL)
+        NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
+    } else {
+      if (comm->connectSend[sendPeer] != 0UL || comm->connectRecv[recvPeer] != 0UL) {
+        NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
+        NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
+      }
+    }
     comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0UL;
     free(data[i]);
   }
diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc
index f66abe8b41..04bab8b4f2 100644
--- a/src/transport/coll_net.cc
+++ b/src/transport/coll_net.cc
@@ -155,7 +155,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
   send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
 
-  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.tpLocalRank));
+  send->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
   tpProxyRank = comm->topParentRanks[myInfo->rank];
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, tpProxyRank, &send->proxyConn));
   ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount);
@@ -177,7 +177,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   // Determine whether we need to flush the GDR buffer on recv or not
   if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
 
-  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.tpLocalRank));
+  recv->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
   tpProxyRank = comm->topParentRanks[myInfo->rank];
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, tpProxyRank, &recv->proxyConn));
   struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
@@ -224,6 +224,8 @@ struct collNetConnectArgs {
   struct ncclConnect* connectInfos;
 };
 
+static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args);
+
 static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) {
   // We're on the same process as the proxy. We can pass a pointer to a struct.
   struct collNetConnectArgs args = { rank, nranks, connectInfos };
@@ -247,9 +249,14 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
 
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
     send->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
+
+  send->proxyConn.proxyProgress = sendProxyProgress;
+
   return ncclSuccess;
 }
 
+static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args);
+
 static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) {
   // We're on the same process as the proxy. We can pass a pointer to a struct.
   struct collNetConnectArgs args = { rank, nranks, connectInfos };
@@ -272,6 +279,9 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
     recv->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
   }
+
+  recv->proxyConn.proxyProgress = recvProxyProgress;
+
   return ncclSuccess;
 }
 
diff --git a/src/transport/net.cc b/src/transport/net.cc
index 273d5d5e60..0998172f59 100644
--- a/src/transport/net.cc
+++ b/src/transport/net.cc
@@ -80,7 +80,7 @@ struct connectMap {
   } offsets;
 };
 
-struct sendResources {
+struct sendNetResources {
   struct connectMap map;
   void* netSendComm;
   struct ncclSendMem* sendMem;
@@ -103,9 +103,12 @@ struct sendResources {
   void* mhandles[NCCL_NUM_PROTOCOLS];
   uint64_t step;
   uint64_t llLastCleaning;
+  int netDeviceVersion;
+  ncclNetDeviceType netDeviceType;
+  ncclNetDeviceHandle_t* netDeviceHandle;
 };
 
-struct recvResources {
+struct recvNetResources {
   struct connectMap map;
   void* netListenComm;
   void* netRecvComm;
@@ -132,6 +135,9 @@ struct recvResources {
   void* mhandles[NCCL_NUM_PROTOCOLS];
   uint64_t step;
   uint64_t llLastCleaning;
+  int netDeviceVersion;
+  ncclNetDeviceType netDeviceType;
+  ncclNetDeviceHandle_t* netDeviceHandle;
 };
 
 /* Determine if two peers can communicate with NET */
@@ -159,11 +165,14 @@ struct setupReq {
   int connIndex;
 };
 
+// Forward declaration
+static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args);
+
 /* Determine if we will use this transport for this peer and return connect
  * information for this peer */
 static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
   struct setupReq req = { 0 };
-  int localRank, tpProxyRank;
+  int tpProxyRank;
 
   send->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
   req.channelId = channelId;
@@ -176,8 +185,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
 
   tpProxyRank = comm->topParentRanks[proxyRank];
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &send->proxyConn));
-  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &localRank));
-  req.tpLocalRank = comm->topParentLocalRanks[localRank];
+  req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
   req.tpRank = comm->topParentRanks[myInfo->rank];
   req.tpRemoteRank = comm->topParentRanks[peerInfo->rank];
   NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
@@ -201,7 +209,6 @@ NCCL_PARAM(GdrCopyFlushEnable, "GDRCOPY_FLUSH_ENABLE", 0);
 /* Setup recv connector */
 static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
   struct setupReq req = { 0 };
-  int localRank;
 
   recv->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
   req.channelId = channelId;
@@ -219,8 +226,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   tpProxyRank = comm->topParentRanks[myInfo->rank];
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, tpProxyRank, &recv->proxyConn));
 
-  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &localRank));
-  req.tpLocalRank = comm->topParentLocalRanks[localRank];
+  req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
   req.tpRank = comm->topParentRanks[myInfo->rank];
   req.tpRemoteRank = comm->topParentRanks[peerInfo->rank];
   NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
@@ -267,6 +273,14 @@ static ncclResult_t netDumpMap(struct connectMap* map) {
   return ncclSuccess;
 }
 
+struct netSendConnectArgs { // Request payload that sendConnect passes to the proxy with ncclProxyMsgConnect
+  ncclNetHandle_t handle; // copied from connectInfo; sendProxyConnect hands it to ncclNet->connect
+};
+
+struct netRecvConnectArgs { // Request payload that recvConnect passes to the proxy with ncclProxyMsgConnect
+  int proxyRank; // read from connectInfo; recvProxyConnect stores it as tpRemoteProxyRank
+};
+
 static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
   struct connectMap* map = (connectMap*) send->transportResources;
 
@@ -279,7 +293,9 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
     send->transportResources = map;
     opId = send;
     INFO(NCCL_PROXY, "sendConnect ncclProxyCallAsync opId=%p", opId);
-    NCCLCHECK(ncclProxyCallAsync(comm, &send->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(ncclNetHandle_t), sizeof(struct connectMap), opId));
+    netSendConnectArgs args = {0};
+    memcpy(&args.handle, connectInfo, sizeof(ncclNetHandle_t));
+    NCCLCHECK(ncclProxyCallAsync(comm, &send->proxyConn, ncclProxyMsgConnect, &args, sizeof(netSendConnectArgs), sizeof(struct connectMap), opId));
   } else {
     opId =  send;
   }
@@ -293,15 +309,13 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
 
   if (map->sameProcess && !ncclCuMemEnable()) {
     if (map->cudaDev != comm->cudaDev) {
-      if (!ncclCuMemEnable()) {
-        // Enable P2P access for Legacy IPC
-        cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0);
-        if (err == cudaErrorPeerAccessAlreadyEnabled) {
-          cudaGetLastError();
-        } else if (err != cudaSuccess) {
-          WARN("failed to peer with device %d: %d %s", map->cudaDev, err, cudaGetErrorString(err));
-          return ncclInternalError;
-        }
+      // Enable P2P access for Legacy IPC
+      cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0);
+      if (err == cudaErrorPeerAccessAlreadyEnabled) {
+        cudaGetLastError();
+      } else if (err != cudaSuccess) {
+        WARN("failed to peer with device %d: %d %s", map->cudaDev, err, cudaGetErrorString(err));
+        return ncclInternalError;
       }
     }
   } else if (!(map->sameProcess && map->cudaDev == comm->cudaDev)) {
@@ -339,9 +353,30 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
 
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
     send->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
+
+  if (send->proxyConn.sameProcess) {
+    if (send->proxyConn.connection->netDeviceHandle) {
+      send->conn.netDeviceHandle = *send->proxyConn.connection->netDeviceHandle;
+
+      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
+        send->conn.mhandles[p] = send->proxyConn.connection->mhandles[p];
+    }
+
+    if (send->proxyConn.connection->needsProxyProgress) {
+      send->proxyConn.proxyProgress = sendProxyProgress;
+    } else {
+      send->proxyConn.proxyProgress = NULL;
+    }
+  } else {
+    send->proxyConn.proxyProgress = sendProxyProgress;
+  }
+
   return ncclSuccess;
 }
 
+// Forward declare
+static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args);
+
 /* Connect to this peer */
 static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
   struct connectMap* map = (connectMap*) recv->transportResources;
@@ -353,7 +388,9 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
     opId = recv;
     INFO(NCCL_PROXY, "recvConnect ncclProxyCallAsync opId=%p &recv->proxyConn=%p connectInfo=%p",
        opId, &recv->proxyConn, connectInfo);
-    NCCLCHECK(ncclProxyCallAsync(comm, &recv->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(int), sizeof(struct connectMap), opId));
+    netRecvConnectArgs args = {0};
+    args.proxyRank = *((int*)connectInfo);
+    NCCLCHECK(ncclProxyCallAsync(comm, &recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(netRecvConnectArgs), sizeof(struct connectMap), opId));
   } else {
     opId = recv;
   }
@@ -378,6 +415,24 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
 
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
     recv->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
+
+  if (recv->proxyConn.sameProcess) {
+    if (recv->proxyConn.connection->netDeviceHandle) {
+      recv->conn.netDeviceHandle = *recv->proxyConn.connection->netDeviceHandle;
+
+      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
+        recv->conn.mhandles[p] = recv->proxyConn.connection->mhandles[p];
+    }
+
+    if (recv->proxyConn.connection->needsProxyProgress) {
+      recv->proxyConn.proxyProgress = recvProxyProgress;
+    } else {
+      recv->proxyConn.proxyProgress = NULL;
+    }
+  } else {
+    recv->proxyConn.proxyProgress = recvProxyProgress;
+  }
+
   return ncclSuccess;
 }
 
@@ -416,7 +471,7 @@ static ncclResult_t recvFree(struct ncclConnector* recv) {
 }
 
 #define NCCL_SHARED_STEPS 16
-static ncclResult_t sharedBuffersInit(struct ncclProxyState* proxyState, int cuda, int tpLocalRank, int type, int sameProcess,
+static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int cuda, int tpLocalRank, int type, int sameProcess,
     int nChannels, char** gpuPtr, char** cpuPtr, int* size, ncclIpcDesc *ipcDesc) {
   if (cuda == 0 && sameProcess == 0) {
       WARN("PXN should not use host buffers for data");
@@ -462,7 +517,7 @@ static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int chan
   return ncclSuccess;
 }
 
-static ncclResult_t sharedBuffersDestroy(struct ncclProxyState* proxyState, int tpLocalRank, int type, struct ncclProxyConnection* connection) {
+static ncclResult_t sharedNetBuffersDestroy(struct ncclProxyState* proxyState, int tpLocalRank, int type, struct ncclProxyConnection* connection) {
   if (proxyState->progressState.localPeers == NULL) NCCLCHECK(ncclInternalError);
   struct ncclProxyPeer* peer = proxyState->progressState.localPeers[tpLocalRank];
   if (peer == NULL) NCCLCHECK(ncclInternalError;)
@@ -492,7 +547,7 @@ static ncclResult_t sharedBuffersDestroy(struct ncclProxyState* proxyState, int
 }
 
 static ncclResult_t proxySharedInit(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, int nChannels) {
-  NCCLCHECK(sharedBuffersInit(proxyState, 1, connection->tpLocalRank, 0, connection->sameProcess, nChannels, NULL, NULL, NULL, NULL));
+  NCCLCHECK(sharedNetBuffersInit(proxyState, 1, connection->tpLocalRank, 0, connection->sameProcess, nChannels, NULL, NULL, NULL, NULL));
   return ncclSuccess;
 }
 
@@ -500,7 +555,7 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
   struct setupReq* req = (struct setupReq*) reqBuff;
   if (reqSize != sizeof(struct setupReq)) return ncclInternalError;
 
-  struct sendResources* resources;
+  struct sendNetResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
   connection->transportResources = resources;
 
@@ -517,6 +572,11 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
   /* DMA-BUF support */
   resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
   resources->maxRecvs = props.maxRecvs;
+  resources->netDeviceVersion = props.netDeviceVersion;
+  resources->netDeviceType = props.netDeviceType;
+
+  resources->netDeviceVersion = props.netDeviceVersion;
+  resources->netDeviceType = props.netDeviceType;
 
   // We don't return any data
   if (respSize != 0) return ncclInternalError;
@@ -528,7 +588,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
   struct setupReq* req = (struct setupReq*) reqBuff;
   if (reqSize != sizeof(struct setupReq)) return ncclInternalError;
 
-  struct recvResources* resources;
+  struct recvNetResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
   connection->transportResources = resources;
 
@@ -546,6 +606,8 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
   /* DMA-BUF support */
   resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF);
   resources->maxRecvs = props.maxRecvs;
+  resources->netDeviceVersion = props.netDeviceVersion;
+  resources->netDeviceType = props.netDeviceType;
 
   if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
   NCCLCHECK(proxyState->ncclNet->listen(req->netDev, respBuff, &resources->netListenComm));
@@ -554,11 +616,34 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
   return ncclSuccess;
 }
 
-static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
-  struct sendResources* resources = (struct sendResources*)(connection->transportResources);
-  if (reqSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
-  ncclResult_t ret = ncclSuccess;
+// Embeds plugin-specific rules for the current versions: decides whether a device handle is needed and allocates or clears *handle.
+static ncclResult_t ncclNetGetDeviceHandle(ncclNetDeviceType type, int version, bool isRecv, ncclNetDeviceHandle_t** handle) {
+  bool needsDeviceHandle  = false;
 
+  if (type == NCCL_NET_DEVICE_UNPACK) { // only the UNPACK device type can require a handle here
+    if (version == NCCL_NET_DEVICE_UNPACK_VERSION && isRecv) { // and only on the recv side at the matching version
+      needsDeviceHandle  = true;
+    }
+  }
+
+  // Don't re-alloc netDeviceHandles: an already non-NULL *handle is left untouched when one is needed
+  if (needsDeviceHandle && (*handle == NULL)) {
+    NCCLCHECK(ncclCalloc(handle, 1));
+    (*handle)->netDeviceType = type;
+    (*handle)->netDeviceVersion = version;
+  } else if (!needsDeviceHandle) { // not needed: clear the pointer (an existing allocation is not freed here)
+    *handle = NULL;
+  }
+
+  return ncclSuccess;
+}
+
+static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+  struct sendNetResources* resources = (struct sendNetResources*)(connection->transportResources);
+  if (reqSize != sizeof(netSendConnectArgs)) return ncclInternalError;
+  ncclResult_t ret = ncclSuccess;
+  netSendConnectArgs* req = (netSendConnectArgs*) reqBuff;
+  NCCLCHECK(ncclNetGetDeviceHandle(resources->netDeviceType, resources->netDeviceVersion, false /*isRecv*/, &resources->netDeviceHandle));
   if (resources->shared) {
     // Shared buffers
     struct ncclProxyProgressState* progressState = &proxyState->progressState;
@@ -577,15 +662,15 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
         NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks));
       }
       struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteRank;
-      if (comms->sendComm[resources->channelId] == NULL) ret = proxyState->ncclNet->connect(resources->netDev, reqBuff, comms->sendComm + resources->channelId);
+      if (comms->sendComm[resources->channelId] == NULL) ret = proxyState->ncclNet->connect(resources->netDev, req->handle, comms->sendComm + resources->channelId, &resources->netDeviceHandle);
       resources->netSendComm = comms->sendComm[resources->channelId];
       if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++;
     } else {
-      ret = proxyState->ncclNet->connect(resources->netDev, reqBuff, &resources->netSendComm);
+      ret = proxyState->ncclNet->connect(resources->netDev, req->handle, &resources->netSendComm, &resources->netDeviceHandle);
     }
   } else {
     // Connect to remote peer
-    ret = proxyState->ncclNet->connect(resources->netDev, reqBuff, &resources->netSendComm);
+    ret = proxyState->ncclNet->connect(resources->netDev, req->handle, &resources->netSendComm, &resources->netDeviceHandle);
     connection->proxyAppendPtr = &connection->proxyAppend;
   }
 
@@ -596,6 +681,13 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
   }
   *done = 1;
 
+  if (resources->netDeviceHandle) {
+    connection->netDeviceHandle = resources->netDeviceHandle;
+    connection->needsProxyProgress = connection->netDeviceHandle->needsProxyProgress;
+  } else {
+    connection->needsProxyProgress = 1;
+  }
+
   // Create structures
   struct connectMap* map = &resources->map;
   map->sameProcess = connection->sameProcess;
@@ -611,7 +703,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
     // Get shared buffers
     int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
     struct connectMapMem* mapMem = map->mems+bank;
-    NCCLCHECK(sharedBuffersInit(
+    NCCLCHECK(sharedNetBuffersInit(
           proxyState, resources->useGdr, resources->tpLocalRank, 0, map->sameProcess, proxyState->p2pnChannels,
           &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, &mapMem->ipcDesc));
     resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size;
@@ -679,6 +771,10 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
       {
         NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
       }
+
+      // Copy the mhandle dptr, if implemented
+      if (resources->netDeviceHandle && proxyState->ncclNet->getDeviceMr)
+        NCCLCHECK(proxyState->ncclNet->getDeviceMr(resources->netSendComm, resources->mhandles[p], &connection->mhandles[p]));
     }
   }
 
@@ -689,11 +785,13 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
 }
 
 static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
-  if (reqSize != sizeof(int)) return ncclInternalError;
-  struct recvResources* resources = (struct recvResources*)(connection->transportResources);
-  resources->tpRemoteProxyRank = *(int*)reqBuff;
+  if (reqSize != sizeof(netRecvConnectArgs)) return ncclInternalError;
+  struct recvNetResources* resources = (struct recvNetResources*)(connection->transportResources);
+  netRecvConnectArgs* req = (netRecvConnectArgs*) reqBuff;
+  resources->tpRemoteProxyRank = req->proxyRank;
   ncclResult_t ret = ncclSuccess;
 
+  NCCLCHECK(ncclNetGetDeviceHandle(resources->netDeviceType, resources->netDeviceVersion, true /*isRecv*/, &resources->netDeviceHandle));
   // Finish connection establishment from remote peer
   if (resources->shared) {
     // Shared buffers
@@ -713,15 +811,15 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
         NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks));
       }
       struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteProxyRank;
-      if (comms->recvComm[resources->channelId] == NULL) ret = proxyState->ncclNet->accept(resources->netListenComm, comms->recvComm+resources->channelId);
+      if (comms->recvComm[resources->channelId] == NULL) ret = proxyState->ncclNet->accept(resources->netListenComm, comms->recvComm+resources->channelId, &resources->netDeviceHandle);
       resources->netRecvComm = comms->recvComm[resources->channelId];
       if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++;
     } else {
-      ret = proxyState->ncclNet->accept(resources->netListenComm, &resources->netRecvComm);
+      ret = proxyState->ncclNet->accept(resources->netListenComm, &resources->netRecvComm, &resources->netDeviceHandle);
     }
   } else {
     // Connect to remote peer
-    ret = proxyState->ncclNet->accept(resources->netListenComm, &resources->netRecvComm);
+    ret = proxyState->ncclNet->accept(resources->netListenComm, &resources->netRecvComm, &resources->netDeviceHandle);
     connection->proxyAppendPtr = &connection->proxyAppend;
   }
 
@@ -732,6 +830,13 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
   }
   *done = 1;
 
+  if (resources->netDeviceHandle) {
+    connection->netDeviceHandle = resources->netDeviceHandle;
+    connection->needsProxyProgress = connection->netDeviceHandle->needsProxyProgress;
+  } else {
+    connection->needsProxyProgress = 1;
+  }
+
   NCCLCHECK(proxyState->ncclNet->closeListen(resources->netListenComm));
 
   // Create structures
@@ -749,7 +854,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
     // Get shared buffers
     int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
     struct connectMapMem* mapMem = map->mems+bank;
-    NCCLCHECK(sharedBuffersInit(
+    NCCLCHECK(sharedNetBuffersInit(
           proxyState, resources->useGdr, resources->tpLocalRank, 1, 1, proxyState->p2pnChannels,
           &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, NULL));
     resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size;
@@ -809,6 +914,10 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
       {
         NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
       }
+
+      // Copy the mhandle dptr
+      if (resources->netDeviceType != NCCL_NET_DEVICE_HOST && proxyState->ncclNet->getDeviceMr)
+        NCCLCHECK(proxyState->ncclNet->getDeviceMr(resources->netRecvComm, resources->mhandles[p], &connection->mhandles[p]));
     }
   }
 
@@ -819,9 +928,9 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
 }
 
 static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
-  struct sendResources* resources = (struct sendResources*)(connection->transportResources);
+  struct sendNetResources* resources = (struct sendNetResources*)(connection->transportResources);
   if (connection->state == connSharedInitialized) { // NVB Preconnect
-    NCCLCHECK(sharedBuffersDestroy(proxyState, connection->tpLocalRank, 0, connection));
+    NCCLCHECK(sharedNetBuffersDestroy(proxyState, connection->tpLocalRank, 0, connection));
     return ncclSuccess;
   }
 
@@ -846,7 +955,7 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
     }
     if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
     if (resources->shared) {
-      NCCLCHECK(sharedBuffersDestroy(proxyState, resources->tpLocalRank, 0, connection));
+      NCCLCHECK(sharedNetBuffersDestroy(proxyState, resources->tpLocalRank, 0, connection));
       if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
         struct ncclSharedNetComms* comms = proxyState->progressState.netComms[resources->netDev]+resources->tpRemoteRank;
         comms->sendRefCount[resources->channelId]--;
@@ -864,9 +973,9 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
 }
 
 static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
-  struct recvResources* resources = (struct recvResources*)(connection->transportResources);
+  struct recvNetResources* resources = (struct recvNetResources*)(connection->transportResources);
   if (connection->state == connSharedInitialized) { // NVB Preconnect
-    NCCLCHECK(sharedBuffersDestroy(proxyState, connection->tpLocalRank, 1, connection));
+    NCCLCHECK(sharedNetBuffersDestroy(proxyState, connection->tpLocalRank, 1, connection));
     return ncclSuccess;
   }
 
@@ -887,7 +996,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
     }
     if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
     if (resources->shared) {
-      NCCLCHECK(sharedBuffersDestroy(proxyState, resources->tpLocalRank, 1, connection));
+      NCCLCHECK(sharedNetBuffersDestroy(proxyState, resources->tpLocalRank, 1, connection));
       if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
         struct ncclSharedNetComms* comms = proxyState->progressState.netComms[resources->netDev] + resources->tpRemoteProxyRank;
         comms->recvRefCount[resources->channelId]--;
@@ -910,7 +1019,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
   if (args->state == ncclProxyOpReady) {
     for (int s=0; s<args->nsubs; s++) {
       struct ncclProxySubArgs* sub = args->subs+s;
-      struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources);
+      struct sendNetResources* resources = (struct sendNetResources*) (sub->connection->transportResources);
       // Round to next multiple of sliceSteps
       sub->base = ROUNDUP(resources->step, args->chunkSteps);
       sub->posted = sub->transmitted = sub->done = 0;
@@ -925,7 +1034,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
     for (int s=0; s<args->nsubs; s++) {
       struct ncclProxySubArgs* sub = args->subs+s;
       if (sub->done == sub->nsteps) continue;
-      struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources);
+      struct sendNetResources* resources = (struct sendNetResources*) (sub->connection->transportResources);
       void* mhandle = resources->mhandles[p];
       int stepSize = resources->buffSizes[p] / NCCL_STEPS;
       char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
@@ -1044,7 +1153,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
       } else if (s>0) { // Find next sub with the same recvComm
         int next;
         for (next=s; next<args->nsubs; next++) {
-          struct recvResources* nextRes = (struct recvResources*) (args->subs[next].connection->transportResources);
+          struct recvNetResources* nextRes = (struct recvNetResources*) (args->subs[next].connection->transportResources);
           if (nextRes->netRecvComm == recvComm) break;
         }
         if (next == args->nsubs) { // Not found
@@ -1057,7 +1166,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
         }
       }
       groupSize++;
-      struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+      struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
       maxRecvs = resources->maxRecvs;
       recvComm = resources->netRecvComm;
       // Round to next multiple of sliceSteps
@@ -1084,7 +1193,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
         struct ncclProxySubArgs* sub = subGroup + i;
         if (sub->posted < sub->nsteps) {
           if (sub->posted >= sub->done + maxDepth) { subCount = 0; break; }
-          struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+          struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
           int stepSize = resources->buffSizes[p] / NCCL_STEPS;
           char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
           int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
@@ -1107,10 +1216,12 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
       }
       if (subCount) {
         uint64_t step = subGroup->posted;
-        struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
+        struct recvNetResources* resources = (struct recvNetResources*) (subGroup->connection->transportResources);
         void** requestPtr = subGroup->requests+(step%NCCL_STEPS);
         NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr));
         if (*requestPtr) {
+          subGroup->recvRequestsCache[step%NCCL_STEPS] = *requestPtr;
+          subGroup->recvRequestsSubCount = subCount;
           for (int i=0; i<subGroup->groupSize; i++) {
             struct ncclProxySubArgs* sub = subGroup+i;
             sub->posted += args->sliceSteps;
@@ -1141,14 +1252,14 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
             sub->received += args->sliceSteps;
             for (uint64_t step=sub->received-args->sliceSteps; step<sub->received; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvFlushWait);
             if (step < sub->nsteps) {
-              struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+              struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
               if (resources->useGdr) needFlush |= resources->needFlush;
             }
           }
           subGroup->requests[step%NCCL_STEPS] = NULL;
           if (totalSize > 0 && p == NCCL_PROTO_SIMPLE && needFlush) {
             // GDRCOPY support
-            struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
+            struct recvNetResources* resources = (struct recvNetResources*) (subGroup->connection->transportResources);
             if (resources->gdcFlush) {
 #if defined (__x86_64__)
               // Force a PCI-E read from GPU memory
@@ -1162,7 +1273,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
               for (int i=0; i<subGroup->groupSize; i++) {
                 struct ncclProxySubArgs* sub = subGroup + i;
                 if (step < sub->nsteps) {
-                  struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+                  struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
                   int stepSize = resources->buffSizes[p] / NCCL_STEPS;
                   char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
                   int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
@@ -1171,7 +1282,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
                   subCount++;
                 }
               }
-              struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
+              struct recvNetResources* resources = (struct recvNetResources*) (subGroup->connection->transportResources);
               NCCLCHECK(proxyState->ncclNet->iflush(resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS)));
             }
           }
@@ -1195,7 +1306,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
             for (uint64_t step=sub->transmitted-args->sliceSteps; step<sub->transmitted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvGPUWait);
             if (step < sub->nsteps) {
               __sync_synchronize();
-              struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+              struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
               volatile uint64_t* recvTail = resources->gdcSync ? resources->gdcSync : &resources->recvMem->tail;
               *recvTail = sub->base + sub->transmitted;
               if (resources->gdcSync) wc_store_fence(); // Flush out WC write
@@ -1213,17 +1324,23 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
         struct ncclProxySubArgs* sub = subGroup + i;
         if (sub->done == sub->nsteps) continue;
         if (sub->transmitted > sub->done) {
-          struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+          struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
           volatile uint64_t* sendHead = &resources->sendMem->head;
           uint64_t done = *sendHead;
           while (done > sub->base + sub->done &&
               // LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted.
               sub->transmitted > sub->done) {
+            if (subGroup->recvRequestsCache[sub->done%NCCL_STEPS]) {
+              // the multirecv requests are only cached in the first sub.
+              if (proxyState->ncclNet->irecvConsumed)
+                NCCLCHECK(proxyState->ncclNet->irecvConsumed(resources->netRecvComm, subGroup->recvRequestsSubCount, subGroup->recvRequestsCache[sub->done%NCCL_STEPS]));
+              subGroup->recvRequestsCache[sub->done%NCCL_STEPS] = NULL;
+            }
             sub->done += args->sliceSteps;
             for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileEnd);
             args->idle = 0;
             if (sub->done == sub->nsteps) {
-              struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+              struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
               resources->step = sub->base + sub->nsteps;
               args->done++;
               break;
diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc
index 421f0a13a1..8d4313dddc 100644
--- a/src/transport/net_ib.cc
+++ b/src/transport/net_ib.cc
@@ -333,6 +333,8 @@ ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
   props->port = ncclIbDevs[dev].port + ncclIbDevs[dev].realPort;
   props->maxComms = ncclIbDevs[dev].maxQp;
   props->maxRecvs = NCCL_NET_IB_MAX_RECVS;
+  props->netDeviceType    = NCCL_NET_DEVICE_HOST;
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
   return ncclSuccess;
 }
 
@@ -348,6 +350,10 @@ struct ncclIbQpInfo {
   uint8_t link_layer;
   uint32_t qpn[NCCL_IB_MAX_QPS];
 
+  // Fields needed for ece (enhanced connection establishment)
+  struct ibv_ece ece[NCCL_IB_MAX_QPS];
+  int ece_supported[NCCL_IB_MAX_QPS];
+
   // For RoCE
   uint64_t spn;
   uint64_t iid;
@@ -608,7 +614,7 @@ ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) {
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
+ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
   struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
   struct ncclIbCommStage* stage = &handle->stage;
   struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm;
@@ -652,7 +658,13 @@ ib_connect_check:
   NCCLCHECK(wrap_ibv_query_port(ctx, ib_port, &portAttr));
   struct ncclIbQpInfo qpInfo;
   qpInfo.ib_port = ib_port;
-  for (int q=0; q<comm->nqps; q++) qpInfo.qpn[q] = comm->qps[q]->qp_num;
+  for (int q=0; q<comm->nqps; q++) {
+    qpInfo.qpn[q] = comm->qps[q]->qp_num;
+
+    // Query ece capabilities (enhanced connection establishment)
+    NCCLCHECK(wrap_ibv_query_ece(comm->qps[q], &qpInfo.ece[q], &qpInfo.ece_supported[q]));
+  }
+
   qpInfo.mtu = portAttr.active_mtu;
 
   // Prepare my fifo
@@ -663,15 +675,20 @@ ib_connect_check:
   // RoCE support
   qpInfo.lid = portAttr.lid;
   qpInfo.link_layer = comm->gidInfo.link_layer = portAttr.link_layer;
-  if (qpInfo.link_layer == IBV_LINK_LAYER_INFINIBAND) { // IB
-    for (int q=0; q<comm->nqps; q++)
-      INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, qpInfo.lid);
-  } else { // RoCE
-    NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &comm->gidInfo.localGid));
+  if (qpInfo.link_layer == IBV_LINK_LAYER_ETHERNET) {
+    NCCLCHECK(wrap_ibv_query_gid(ncclIbDevs[dev].context, ncclIbDevs[dev].port, ncclParamIbGidIndex(), &comm->gidInfo.localGid));
     qpInfo.spn = comm->gidInfo.localGid.global.subnet_prefix;
     qpInfo.iid = comm->gidInfo.localGid.global.interface_id;
+  }
+
+  if (qpInfo.link_layer == IBV_LINK_LAYER_INFINIBAND) { // IB
     for (int q=0; q<comm->nqps; q++)
-      INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
+      INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ncclIbDevs[dev].port, qpInfo.qpn[q], qpInfo.mtu, qpInfo.lid);
+  } else { // RoCE
+    for (int q=0; q<comm->nqps; q++)
+      INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d query_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x} GID %ld (%lX/%lX)",
+        dev, ncclIbDevs[dev].port, qpInfo.qpn[q], qpInfo.mtu, qpInfo.ece_supported[q], qpInfo.ece[q].vendor_id, qpInfo.ece[q].options, qpInfo.ece[q].comp_mask, ncclParamIbGidIndex(),
+        qpInfo.spn, qpInfo.iid);
   }
 
   stage->state = ncclIbCommStateSend;
@@ -699,10 +716,19 @@ ib_connect:
   comm->gidInfo.remoteGid.global.interface_id = remQpInfo.iid;
   for (int q=0; q<comm->nqps; q++) {
     struct ibv_qp* qp = comm->qps[q];
+    if (remQpInfo.ece_supported[q] && qpInfo.ece_supported[q])
+      NCCLCHECK(wrap_ibv_set_ece(qp, &remQpInfo.ece[q], &qpInfo.ece_supported[q]));
+
     NCCLCHECK(ncclIbRtrQp(qp, remQpInfo.qpn[q], &remQpInfo));
     NCCLCHECK(ncclIbRtsQp(qp));
   }
 
+  if (qpInfo.link_layer == IBV_LINK_LAYER_ETHERNET ) { // RoCE
+    for (int q=0; q<comm->nqps; q++)
+      INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d set_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x}",
+        dev, ncclIbDevs[dev].port, qpInfo.qpn[q], remQpInfo.ece_supported[q], remQpInfo.ece[q].vendor_id, remQpInfo.ece[q].options, remQpInfo.ece[q].comp_mask);
+  }
+
   comm->ready = 1;
   stage->state = ncclIbCommStateConnected;
   stage->offset = 0;
@@ -720,7 +746,7 @@ ib_send_ready:
 
 NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0);
 
-ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
+ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
   struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm;
   struct ncclIbCommStage* stage = &lComm->stage;
   struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm;
@@ -781,8 +807,21 @@ ib_recv:
   remQpInfo.mtu = (enum ibv_mtu)std::min(remQpInfo.mtu, portAttr.active_mtu);
 
   // Setup QP
+  struct ncclIbQpInfo qpInfo;
   for (int q=0; q<rComm->nqps; q++) {
     struct ibv_qp* qp = rComm->qps[q];
+
+    // Set the ece (enhanced connection establishment) on this QP before RTR
+    if (remQpInfo.ece_supported[q]) {
+      NCCLCHECK(wrap_ibv_set_ece(qp, &remQpInfo.ece[q], &qpInfo.ece_supported[q]));
+  
+      // Query the reduced ece for this QP (matching enhancements between the requestor and the responder)
+      // Store this in our own qpInfo for returning to the requestor
+      if (qpInfo.ece_supported[q]) {
+        NCCLCHECK(wrap_ibv_query_ece(qp, &qpInfo.ece[q], &qpInfo.ece_supported[q]));
+      }
+    }
+
     NCCLCHECK(ncclIbRtrQp(qp, remQpInfo.qpn[q], &remQpInfo));
     NCCLCHECK(ncclIbRtsQp(qp));
   }
@@ -815,7 +854,6 @@ ib_recv:
   }
 
   // Fill Handle
-  struct ncclIbQpInfo qpInfo;
   qpInfo.lid=portAttr.lid;
   qpInfo.link_layer= rComm->gidInfo.link_layer = portAttr.link_layer;
   qpInfo.ib_port=ib_port;
@@ -1380,6 +1418,8 @@ ncclNet_t ncclNetIb = {
   ncclIbTest,
   ncclIbCloseSend,
   ncclIbCloseRecv,
-  ncclIbCloseListen
+  ncclIbCloseListen,
+  NULL /* getDeviceMr */,
+  NULL /* irecvConsumed */
 };
 
diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc
index 08a8c3a293..502179a217 100644
--- a/src/transport/net_socket.cc
+++ b/src/transport/net_socket.cc
@@ -101,6 +101,8 @@ ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) {
   props->port = 0;
   props->maxComms = 65536;
   props->maxRecvs = 1;
+  props->netDeviceType    = NCCL_NET_DEVICE_HOST;
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
   return ncclSuccess;
 }
 
@@ -301,7 +303,7 @@ ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm)
   return ncclSuccess;
 }
 
-ncclResult_t ncclNetSocketConnect(int dev, void* opaqueHandle, void** sendComm) {
+ncclResult_t ncclNetSocketConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
   if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev
     return ncclInternalError;
   }
@@ -346,7 +348,7 @@ socket_send:
   return ncclSuccess;
 }
 
-ncclResult_t ncclNetSocketAccept(void* listenComm, void** recvComm) {
+ncclResult_t ncclNetSocketAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
   struct ncclNetSocketListenComm* lComm = (struct ncclNetSocketListenComm*)listenComm;
   struct ncclNetSocketCommStage* stage = &lComm->stage;
   struct ncclNetSocketComm* rComm = stage->comm;
@@ -609,5 +611,7 @@ ncclNet_t ncclNetSocket = {
   ncclNetSocketTest,
   ncclNetSocketClose,
   ncclNetSocketClose,
-  ncclNetSocketCloseListen
+  ncclNetSocketCloseListen,
+  NULL /* getDeviceMr */,
+  NULL /* irecvConsumed */
 };
diff --git a/src/transport/nvls.cc b/src/transport/nvls.cc
index 07be99d9fe..c9a3bbc289 100644
--- a/src/transport/nvls.cc
+++ b/src/transport/nvls.cc
@@ -10,17 +10,30 @@
 #include "graph.h"
 #include "utils.h"
 #include "proxy.h"
+#include "enqueue.h"
 
 #if CUDART_VERSION >= 12010
 
-// Currently we only support POSIX_FILE_DESCRIPTOR handle exchange
-#define USE_POSIX_FD 1
+struct graphRegData { // NOTE(review): not referenced in the visible part of this patch; presumably exchanged by ncclNvlsGraphRegisterBuffer — confirm
+  uintptr_t offset; // presumably the user buffer's offset inside its allocation — TODO confirm against the user of this struct
+  size_t size;      // presumably a size in bytes associated with that buffer — TODO confirm
+};
 
-#if USE_POSIX_FD
-#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
-#else
-#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_NONE
-#endif
+struct localRegData { // Per-rank entry that ncclNvlsLocalRegisterBuffer fills locally and gathers from every local rank
+  /* Registration record data: taken from the comm->regRecordQueue entry covering the buffer; a buff of 0 is treated as "no match" */
+  uintptr_t recSendbuff, recRecvbuff;     // base address (buff) of the record covering sendbuff / recvbuff
+  intptr_t recSendOffset, recRecvOffset;  // sendbuff / recvbuff minus that record's base address
+  /* Registration request data: taken from the comm->regRequestQueue entry covering the buffer; a buff of 0 is treated as "no match" */
+  uintptr_t reqSendbuff, reqRecvbuff;     // base address (buff) of the request covering sendbuff / recvbuff
+  size_t reqSendSize, reqRecvSize;        // size field of that request
+  intptr_t reqSendOffset, reqRecvOffset;  // sendbuff / recvbuff minus that request's base address
+};
+
+struct localRequestData { // One entry per local rank, built by ncclNvlsLocalRegisterBuffer and consumed by tryRegisterBuffer
+  uintptr_t reqBuff;  // base address of the rank's registration request; the local entry is what gets bound to the multicast object
+  size_t reqSize;     // size of that request; tryRegisterBuffer registers the minimum over all local ranks
+  intptr_t reqOffset; // user buffer address minus reqBuff, copied from the gathered localRegData
+};
 
 ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
   // This transport cannot be used for p2p
@@ -66,24 +79,23 @@ ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct ncclNvlsSharedRes*
   return ncclSuccess;
 }
 
-ncclResult_t nvlsGroupCreate(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int rank, unsigned int nranks, char* shareableHandle) {
-  size_t size = resources->size;
+ncclResult_t nvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop, int rank, unsigned int nranks, CUmemGenericAllocationHandle *mcHandle, char *shareableHandle) { // creates the multicast object described by prop into *mcHandle and fills shareableHandle for the other ranks
+  size_t size = prop->size; // only used by the log messages below
 
   // Create a Multicast group
-  CUmulticastObjectProp* prop = &resources->properties;
 
   INFO(NCCL_NVLS, "NVLS Creating Multicast group nranks %d size %zi on rank %d", nranks, size, rank);
-  CUCHECK(cuMulticastCreate(&resources->mcHandle, prop));
+  CUCHECK(cuMulticastCreate(mcHandle, prop)); // the handle now lands in caller-provided storage instead of resources->mcHandle
 
-  if (NVLS_CU_MEM_HANDLE_TYPE != CU_MEM_HANDLE_TYPE_NONE) {
+  if ((NVLS_CU_MEM_HANDLE_TYPE != CU_MEM_HANDLE_TYPE_NONE) && (NVLS_CU_MEM_HANDLE_TYPE != CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)) { // NONE and POSIX fd types skip the export and take the raw-copy branch below
     // Get a handle to pass to other ranks
-    CUCHECK(cuMemExportToShareableHandle(shareableHandle, resources->mcHandle, NVLS_CU_MEM_HANDLE_TYPE, 0));
+    CUCHECK(cuMemExportToShareableHandle(shareableHandle, *mcHandle, NVLS_CU_MEM_HANDLE_TYPE, 0));
   }
   else {
-    memcpy(shareableHandle, &resources->mcHandle, sizeof(resources->mcHandle));
+    memcpy(shareableHandle, mcHandle, sizeof(CUmemGenericAllocationHandle)); // raw handle value; for the POSIX fd type nvlsGroupConnect turns it into an fd via ncclProxyClientGetFdBlocking
   }
 
-  INFO(NCCL_NVLS, "NVLS Created Multicast group %llx nranks %d size %zi on rank %d", resources->mcHandle, nranks, size, rank);
+  INFO(NCCL_NVLS, "NVLS Created Multicast group %llx nranks %d size %zi on rank %d", *mcHandle, nranks, size, rank);
 
   return ncclSuccess;
 }
@@ -94,7 +106,7 @@ ncclResult_t nvlsGroupAddDevice(struct ncclComm *comm, struct ncclNvlsSharedRes*
   return ncclSuccess;
 }
 
-ncclResult_t nvlsGroupConnect(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int rank, char* shareableHandle) {
+ncclResult_t nvlsGroupConnect(struct ncclComm *comm, char *shareableHandle, int rank, CUmemGenericAllocationHandle *mcHandle) {
   CUmemAllocationHandleType type = NVLS_CU_MEM_HANDLE_TYPE;
 
   INFO(NCCL_NVLS, "NVLS importing shareableHandle %p from rank %d", shareableHandle, rank);
@@ -102,36 +114,27 @@ ncclResult_t nvlsGroupConnect(struct ncclComm *comm, struct ncclNvlsSharedRes* r
   // Import and map the remote memory descriptor to the local GPU
   if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
     // cuMem UDS support
-    int fd = *(int *)shareableHandle;
-    TRACE(NCCL_NVLS, "NVLS rank %d Importing shareable handle from rank %d fd %d", comm->localRank, rank, fd);
+    int fd = -1;
+    TRACE(NCCL_NVLS, "NVLS rank %d Importing shareable handle %p from rank %d", comm->localRank, shareableHandle, rank);
     struct ncclProxyConnector proxyConn;
     int tpProxyRank = comm->topParentRanks[rank];
     NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpProxyRank, &proxyConn));
-    TRACE(NCCL_NVLS, "NVLS rank %d request conversion of fd %d from rank %d", comm->localRank, fd, rank);
-    NCCLCHECK(ncclProxyClientConvertFdBlocking(comm, &proxyConn, fd, (int *)shareableHandle));
-    fd = *(int *)shareableHandle;
+    TRACE(NCCL_NVLS, "NVLS rank %d request conversion of handle 0x%lx from rank %d", comm->localRank, *(uint64_t*)shareableHandle, rank);
+    NCCLCHECK(ncclProxyClientGetFdBlocking(comm, &proxyConn, shareableHandle, &fd));
     TRACE(NCCL_NVLS, "NVLS rank %d received converted fd %d from rank %d", comm->localRank, fd, rank);
-    CUCHECK(cuMemImportFromShareableHandle(&resources->mcHandle, (void *)(uintptr_t)fd, type));
+    CUCHECK(cuMemImportFromShareableHandle(mcHandle, (void *)(uintptr_t)fd, type));
+    (void) close(fd);
   } else {
     if (NVLS_CU_MEM_HANDLE_TYPE != CU_MEM_HANDLE_TYPE_NONE) {
-      CUCHECK(cuMemImportFromShareableHandle(&resources->mcHandle, (void *)shareableHandle, type));
+      CUCHECK(cuMemImportFromShareableHandle(mcHandle, (void *)shareableHandle, type));
     } else {
-      memcpy(&resources->mcHandle, shareableHandle, sizeof(resources->mcHandle));
+      memcpy(mcHandle, shareableHandle, sizeof(CUmemGenericAllocationHandle));
     }
   }
   return ncclSuccess;
 }
 
 ncclResult_t nvlsGroupDisconnect(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) {
-  CUmemAllocationHandleType type = NVLS_CU_MEM_HANDLE_TYPE;
-
-  // Import and map the remote memory descriptor to the local GPU
-  if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
-    // cuMem UDS support
-    int fd = *(int *)resources->shareableHandle;
-    (void) close(fd);
-  }
-
   return ncclSuccess;
 }
 
@@ -147,6 +150,7 @@ ncclResult_t nvlsGroupBindMem(struct ncclComm *comm, struct ncclNvlsSharedRes* r
   prop.location.id = resources->dev;
   prop.requestedHandleTypes = NVLS_CU_MEM_HANDLE_TYPE;
   CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
+  resources->ucGran = granularity;
 
   // Map a VA for UC memory
   CUCHECK(cuMemAddressReserve(&ptr, size, granularity, 0U, 0));
@@ -181,6 +185,14 @@ ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, struct ncclNvlsSharedRes* re
   return ncclSuccess;
 }
 
+ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) { // tears down one multicast mapping; appears to undo the bind/reserve/map steps of tryRegisterBuffer — confirm against callers
+  CUCHECK(cuMulticastUnbind(*mcHandler, dev, 0/*mcOffset*/, size)); // unbind size bytes of dev's memory from the multicast object, starting at multicast offset 0
+  CUCHECK(cuMemUnmap(ptr, size));                                   // remove the mapping at ptr
+  CUCHECK(cuMemAddressFree(ptr, size));                             // give back the reserved virtual address range
+  CUCHECK(cuMemRelease(*mcHandler));                                // release the multicast handle
+  return ncclSuccess; // reached only if none of the CUCHECK-wrapped calls bailed out (CUCHECK presumably returns early on error)
+}
+
 ncclResult_t nvlsGroupMapMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) {
   size_t size = resources->size;
   CUdeviceptr ptr = 0;
@@ -263,6 +275,9 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
 
   int nHeads = comm->channels[0].nvls.nHeads;
   int headRank = comm->channels[0].nvls.headRank;
+  char shmPath[sizeof("/dev/shm/nccl-XXXXXX")];
+  uintptr_t *nvlsShmem = NULL;
+  size_t typeSize;
 
   CUdevice dev;
   CUCHECK(cuCtxGetDevice(&dev));
@@ -313,11 +328,11 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
     char* shareableHandle = resources->shareableHandle;
     NCCLCHECKGOTO(nvlsGetProperties(comm, resources, dev, comm->localRanks, nvlsTotalSize), res, cleanup);
     if (comm->localRank == 0) {
-      NCCLCHECKGOTO(nvlsGroupCreate(comm, resources, comm->localRank, comm->localRanks, shareableHandle), res, cleanup);
+      NCCLCHECKGOTO(nvlsGroupCreate(comm, &resources->properties, comm->localRank, comm->localRanks, &resources->mcHandle, shareableHandle), res, cleanup);
       NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
     } else {
       NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), res, cleanup);
-      NCCLCHECKGOTO(nvlsGroupConnect(comm, resources, comm->localRankToRank[0], shareableHandle), res, cleanup);
+      NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &resources->mcHandle), res, cleanup);
     }
 
     NCCLCHECKGOTO(nvlsGroupAddDevice(comm, resources), res, cleanup);
@@ -374,6 +389,23 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) {
     }
   }
 
+  /* create shared memory for fast NVLS buffer registration */
+  typeSize = sizeof(struct localRegData);
+  if (comm->localRank == 0) {
+    shmPath[0] = '\0';
+    NCCLCHECKGOTO(ncclShmOpen(shmPath, (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, comm->localRanks - 1, &comm->nvlsShmemHandle), res, cleanup);
+    NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, cleanup);
+  } else {
+    NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, cleanup);
+    NCCLCHECKGOTO(ncclShmOpen(shmPath, (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, -1, &comm->nvlsShmemHandle), res, cleanup);
+  }
+  /* need 2 pools and a shared counter for shmem-based collectives */
+  comm->nvlsShmem.cnt[0] = (size_t*)nvlsShmem;
+  comm->nvlsShmem.ptr[0] = (void*)((char*)comm->nvlsShmem.cnt[0] + sizeof(size_t));
+  comm->nvlsShmem.cnt[1] = (size_t*)((char*)comm->nvlsShmem.ptr[0] + typeSize * comm->localRanks);
+  comm->nvlsShmem.ptr[1] = (void*)((char*)comm->nvlsShmem.cnt[1] + sizeof(size_t));
+  comm->nvlsShmem.round = 0;
+
   return res;
 
 cleanup:
@@ -394,6 +426,371 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
   return ncclSuccess;
 }
 
+/* Collectively registers one user buffer with NVLS: creates a multicast object sized to the smallest request
+ * among the local ranks, binds and maps this rank's request buffer, and enqueues a record in comm->regRecordQueue.
+ * On success *regUsed is true and *regAddr is the mapped address matching userBuff; on failure *regUsed is false,
+ * *regAddr is left untouched and the error code is returned. buffSize is currently not used. */
+ncclResult_t tryRegisterBuffer(struct ncclComm *comm, struct localRequestData *reqData, uintptr_t userBuff, size_t buffSize, CUdeviceptr *regAddr, bool *regUsed) {
+  ncclResult_t ret = ncclSuccess;
+  struct ncclRegRecord *regRecord = NULL;
+  struct localRequestData *myReqData = &reqData[comm->localRank];
+  CUdeviceptr regPtr = 0;
+  CUmulticastObjectProp prop;
+  char shareableHandle[NVLS_HANDLE_SIZE];
+  CUmemGenericAllocationHandle mcHandle;
+  size_t granularity, minSize;
+  bool localRegBufUsed = false;
+  /* get minimal size of nvls buffers */
+  minSize = reqData[0].reqSize;
+  for (int i = 1; i < comm->localRanks; ++i) {
+    if (minSize > reqData[i].reqSize) minSize = reqData[i].reqSize;
+  }
+
+  /* start registration */
+  memcpy(&prop, &comm->nvlsResources->properties, sizeof(CUmulticastObjectProp));
+  prop.size = minSize;
+  CUCHECKGOTO(cuMulticastGetGranularity(&granularity, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
+  if (comm->localRank == 0) {
+    NCCLCHECKGOTO(nvlsGroupCreate(comm, &prop, comm->localRank, comm->localRanks, &mcHandle, shareableHandle), ret, fail);
+    NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
+  } else {
+    NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
+    NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &mcHandle), ret, fail);
+  }
+  CUCHECKGOTO(cuMulticastAddDevice(mcHandle, comm->nvlsResources->dev), ret, fail);
+  CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)myReqData->reqBuff, minSize, 0), ret, fail);
+  // Create a VA for the NVLS
+  CUCHECKGOTO(cuMemAddressReserve(&regPtr, minSize, granularity, 0U, 0), ret, fail);
+  // Map the VA locally
+  CUCHECKGOTO(cuMemMap(regPtr, minSize, 0, mcHandle, 0), ret, fail);
+  CUCHECKGOTO(cuMemSetAccess(regPtr, minSize, &comm->nvlsResources->accessDesc, 1), ret, fail);
+
+  NCCLCHECKGOTO(ncclCalloc(&regRecord, 1), ret, fail);
+  regRecord->buff = myReqData->reqBuff;
+  regRecord->size = myReqData->reqSize;
+  regRecord->regAddr = regPtr;
+  regRecord->regSize = minSize;
+  regRecord->dev = comm->nvlsResources->dev;
+  regRecord->mcHandle = mcHandle;
+  /* get all buffer addresses */
+  NCCLCHECKGOTO(ncclCalloc(&regRecord->addrs, comm->localRanks), ret, fail);
+  regRecord->addrs[comm->localRank] = regRecord->buff;
+  NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsShmem, regRecord->addrs + comm->localRank, regRecord->addrs, sizeof(uintptr_t)), ret, fail);
+  /* enqueue record: from here on the queue owns regRecord, and nothing below can fail */
+  ncclIntruQueueEnqueue(&comm->regRecordQueue, regRecord);
+  localRegBufUsed = true;
+
+exit:
+  if (localRegBufUsed)
+    *regAddr = (uintptr_t)regPtr + userBuff - myReqData->reqBuff;
+  *regUsed = localRegBufUsed;
+  return ret;
+fail:
+  /* The record was never enqueued, so free its host memory here. NOTE(review): the multicast handle, binding and VA mapping created above are still not unwound on this path. */
+  if (regRecord) { free(regRecord->addrs); free(regRecord); }
+  localRegBufUsed = false;
+  goto exit;
+}
+
+ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) {
+  ncclResult_t ret = ncclSuccess;
+  bool localRegBufUsed = false;
+  struct localRegData *regData = NULL;
+  struct localRequestData *reqData = NULL;
+  struct ncclRegRecord *regRecordHead = NULL, *sendRegRecord = NULL, *recvRegRecord = NULL;
+  struct ncclRegRequest *regRequestHead = NULL, *sendRegRequest = NULL, *recvRegRequest = NULL;
+  bool sendNeedReg = false, recvNeedReg = false;
+  CUdeviceptr regSendPtr = 0;
+  CUdeviceptr regRecvPtr = 0;
+
+  *outRegBufUsed = false;
+
+  NCCLCHECKGOTO(ncclCalloc(&regData, comm->localRanks), ret, fail);
+
+  /* first check whether the buffer has been registered and matches each other globally */
+  regRecordHead = ncclIntruQueueHead(&comm->regRecordQueue);
+  while (regRecordHead && ((sendRegRecord == NULL && sendbuff != NULL) || (recvRegRecord == NULL && recvbuff != NULL))) {
+    /* check send reg record */
+    if (sendRegRecord == NULL && regRecordHead->buff <= (uintptr_t)sendbuff &&
+      regRecordHead->buff + regRecordHead->size >= (uintptr_t)sendbuff + sendbuffSize) {
+      regData[comm->localRank].recSendbuff = regRecordHead->buff;
+      regData[comm->localRank].recSendOffset = (uintptr_t)sendbuff - regRecordHead->buff;
+      sendRegRecord = regRecordHead;
+    }
+
+    /* check recv reg record */
+    if (recvRegRecord == NULL && regRecordHead->buff <= (uintptr_t)recvbuff &&
+      regRecordHead->buff + regRecordHead->size >= (uintptr_t)recvbuff + recvbuffSize) {
+      regData[comm->localRank].recRecvbuff = regRecordHead->buff;
+      regData[comm->localRank].recRecvOffset = (uintptr_t)recvbuff - regRecordHead->buff;
+      recvRegRecord = regRecordHead;
+    }
+    regRecordHead = regRecordHead->next;
+  }
+
+  /* prepare registration request for later reference */
+  regRequestHead = ncclIntruQueueHead(&comm->regRequestQueue);
+  while (regRequestHead && ((sendRegRequest == NULL && sendbuff != NULL) || (recvRegRequest == NULL && recvbuff != NULL))) {
+    /* check send reg request */
+    if (regRequestHead->buff <= (uintptr_t)sendbuff &&
+      regRequestHead->buff + regRequestHead->size >= (uintptr_t)sendbuff + sendbuffSize) {
+      regData[comm->localRank].reqSendbuff = regRequestHead->buff;
+      regData[comm->localRank].reqSendSize = regRequestHead->size;
+      regData[comm->localRank].reqSendOffset = (uintptr_t)sendbuff - regRequestHead->buff;
+      sendRegRequest = regRequestHead;
+    }
+
+    /* check recv reg request */
+    if (regRequestHead->buff <= (uintptr_t)recvbuff &&
+      regRequestHead->buff + regRequestHead->size >= (uintptr_t)recvbuff + recvbuffSize) {
+      regData[comm->localRank].reqRecvbuff = regRequestHead->buff;
+      regData[comm->localRank].reqRecvSize = regRequestHead->size;
+      regData[comm->localRank].reqRecvOffset = (uintptr_t)recvbuff - regRequestHead->buff;
+      recvRegRequest = regRequestHead;
+    }
+    regRequestHead = regRequestHead->next;
+  }
+
+  NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsShmem, regData + comm->localRank, regData, sizeof(struct localRegData)), ret, fail);
+
+  /* first check whether all local ranks find their registered buffer */
+  for (int i = 0; i < comm->localRanks; ++i) {
+    if (regData[i].recSendbuff == 0 || sendRegRecord->addrs[i] != regData[i].recSendbuff) {
+      sendNeedReg = true;
+    }
+
+    if (regData[i].recRecvbuff == 0 || recvRegRecord->addrs[i] != regData[i].recRecvbuff) {
+      recvNeedReg = true;
+    }
+  }
+
+  if (sendNeedReg == false) {
+    for (int i = 0; i < comm->localRanks - 1; ++i) {
+      if (regData[i].recSendOffset != regData[i + 1].recSendOffset) {
+        /* offset are different, we cannot apply user buffer registration */
+        goto fail;
+      }
+    }
+
+    /* reuse previous registered buffer if possible */
+    if (!sendNeedReg)
+      regSendPtr = (CUdeviceptr)((uintptr_t)sendRegRecord->regAddr + regData[comm->localRank].recSendOffset);
+  }
+
+  if (recvNeedReg == false) {
+    for (int i = 0; i < comm->localRanks - 1; ++i) {
+      if (regData[i].recRecvOffset != regData[i + 1].recRecvOffset) {
+        goto fail;
+      }
+    }
+
+    if (!recvNeedReg)
+      regRecvPtr = (CUdeviceptr)((uintptr_t)recvRegRecord->regAddr + regData[comm->localRank].recRecvOffset);
+  }
+
+  if ((!sendNeedReg || sendbuff == NULL) && (!recvNeedReg || recvbuff == NULL)) {
+    localRegBufUsed = true;
+    INFO(NCCL_NVLS, "rank %d reuse local-registered sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr);
+    goto exit;
+  }
+
+  /* Start Registration. Not found registered buffers, then check whether both send and recv buffer locate
+   * in register request cache. */
+  NCCLCHECKGOTO(ncclCalloc(&reqData, comm->localRanks), ret, fail);
+  if (sendNeedReg && sendbuff != NULL) {
+    /* copy the request data obtained from the earlier shmem allgather */
+    intptr_t offset = regData[0].reqSendOffset;
+    for (int i = 0; i < comm->localRanks; ++i) {
+      if (regData[i].reqSendbuff == 0 || offset != regData[i].reqSendOffset) goto fail;
+      reqData[i].reqBuff = regData[i].reqSendbuff;
+      reqData[i].reqSize = regData[i].reqSendSize;
+      reqData[i].reqOffset = regData[i].reqSendOffset;
+    }
+    tryRegisterBuffer(comm, reqData, (uintptr_t)sendbuff, sendbuffSize, &regSendPtr, &localRegBufUsed);
+    if (localRegBufUsed == false) goto fail;
+  }
+
+  if (recvNeedReg && recvbuff != NULL) {
+    intptr_t offset = regData[0].reqRecvOffset;
+    for (int i = 0; i < comm->localRanks; ++i) {
+      if (regData[i].reqRecvbuff == 0 || offset != regData[i].reqRecvOffset) goto fail;
+      reqData[i].reqBuff = regData[i].reqRecvbuff;
+      reqData[i].reqSize = regData[i].reqRecvSize;
+      reqData[i].reqOffset = regData[i].reqRecvOffset;
+    }
+    tryRegisterBuffer(comm, reqData, (uintptr_t)recvbuff, recvbuffSize, &regRecvPtr, &localRegBufUsed);
+    if (localRegBufUsed == false) goto fail;
+  }
+
+  INFO(NCCL_NVLS, "rank %d successfully local-registered sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr);
+
+exit:
+  *outRegBufSend = (void*)regSendPtr;
+  *outRegBufRecv = (void*)regRecvPtr;
+  *outRegBufUsed = localRegBufUsed;
+  free(regData);
+  free(reqData);
+  return ncclSuccess;
+fail:
+  localRegBufUsed = false;
+  goto exit;
+}
+
+ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) {
+  ncclResult_t ret = ncclSuccess;
+  bool localRegBufUsed = false;
+  struct ncclNvlsMcHandleList* sendRecord = NULL;
+  struct ncclNvlsMcHandleList* recvRecord = NULL;
+  CUdeviceptr regSendPtr = 0;
+  CUdeviceptr regRecvPtr = 0;
+  CUmulticastObjectProp prop;
+  char shareableHandle[NVLS_HANDLE_SIZE];
+  CUmemGenericAllocationHandle sendMcHandle, recvMcHandle;
+  size_t sendGran, recvGran;
+  bool *regBufFlags = NULL;
+  struct graphRegData *rdata = NULL;
+  const void *baseSend = NULL;
+  const void *baseRecv = NULL;
+  size_t baseSendSize = 1;
+  size_t baseRecvSize = 1;
+
+  *outRegBufUsed = false;
+  NCCLCHECKGOTO(ncclCalloc(&regBufFlags, comm->localRanks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&rdata, comm->localRanks), ret, fail);
+
+  if (sendbuffSize > 0 || recvbuffSize > 0) {
+    /* retrieve base pointer and size */
+    if (CUPFN(cuMemGetAddressRange) == nullptr) goto fail;
+    if (sendbuff != NULL)
+      CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)sendbuff), ret, fail);
+    if (recvbuff != NULL)
+      CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &baseRecvSize, (CUdeviceptr)recvbuff), ret, fail);
+
+    memcpy(&prop, &comm->nvlsResources->properties, sizeof(CUmulticastObjectProp));
+    prop.size = baseSendSize;
+    CUCHECKGOTO(cuMulticastGetGranularity(&sendGran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
+    prop.size = baseRecvSize;
+    CUCHECKGOTO(cuMulticastGetGranularity(&recvGran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
+
+    localRegBufUsed = ((uint64_t)baseSend % sendGran != 0 || (uint64_t)baseRecv % recvGran != 0) ? false : true;
+    regBufFlags[comm->localRank] = localRegBufUsed;
+    NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, regBufFlags, sizeof(bool)), ret, fail);
+    for (int i = 0; i < comm->localRanks; ++i)
+      if (regBufFlags[i] == false) goto fail;
+
+    if (sendbuff != NULL) {
+      /* check send buffer offset and size */
+      rdata[comm->localRank].offset = (uintptr_t)sendbuff - (uintptr_t)baseSend;
+      rdata[comm->localRank].size = baseSendSize;
+      NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, rdata, sizeof(struct graphRegData)), ret, fail);
+      baseSendSize = rdata[0].size;
+      for (int i = 1; i < comm->localRanks; ++i) {
+        if (rdata[0].offset != rdata[i].offset) goto fail;
+        if (baseSendSize > rdata[i].size) baseSendSize = rdata[i].size;
+      }
+      if (baseSendSize % sendGran != 0) goto fail;
+
+      prop.size = baseSendSize;
+
+      /* register sendbuff */
+      if (comm->localRank == 0) {
+        NCCLCHECKGOTO(nvlsGroupCreate(comm, &prop, comm->localRank, comm->localRanks, &sendMcHandle, shareableHandle), ret, fail);
+        NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
+      } else {
+        NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
+        NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &sendMcHandle), ret, fail);
+      }
+
+      CUCHECKGOTO(cuMulticastAddDevice(sendMcHandle, comm->nvlsResources->dev), ret, fail);
+      CUCHECKGOTO(cuMulticastBindAddr(sendMcHandle, 0, (CUdeviceptr)baseSend, baseSendSize, 0), ret, fail);
+
+      // Create a VA for the NVLS
+      CUCHECKGOTO(cuMemAddressReserve(&regSendPtr, baseSendSize, sendGran, 0U, 0), ret, fail);
+      // Map the VA locally
+      CUCHECKGOTO(cuMemMap(regSendPtr, baseSendSize, 0, sendMcHandle, 0), ret, fail);
+      CUCHECKGOTO(cuMemSetAccess(regSendPtr, baseSendSize, &comm->nvlsResources->accessDesc, 1), ret, fail);
+
+      sendRecord = ncclMemoryPoolAlloc<struct ncclNvlsMcHandleList>(&comm->memPool_ncclNvlsHandleList, &comm->memPermanent);
+      sendRecord->mcHandle = sendMcHandle;
+      sendRecord->ptr = regSendPtr;
+      sendRecord->dev = comm->nvlsResources->dev;
+      sendRecord->size = baseSendSize;
+    }
+
+    if (recvbuff != NULL) {
+      rdata[comm->localRank].offset = (uintptr_t)recvbuff - (uintptr_t)baseRecv;
+      rdata[comm->localRank].size = baseRecvSize;
+      NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, rdata, sizeof(struct graphRegData)), ret, fail);
+      baseRecvSize = rdata[0].size;
+      for (int i = 1; i < comm->localRanks; ++i) {
+        if (rdata[0].offset != rdata[i].offset) goto fail;
+        if (baseRecvSize > rdata[i].size) baseRecvSize = rdata[i].size;
+      }
+      if (baseRecvSize % recvGran != 0) goto fail;
+
+      prop.size = baseRecvSize;
+      if (comm->localRank == 0) {
+        NCCLCHECKGOTO(nvlsGroupCreate(comm, &prop, comm->localRank, comm->localRanks, &recvMcHandle, shareableHandle), ret, fail);
+        NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
+      } else {
+        NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
+        NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &recvMcHandle), ret, fail);
+      }
+
+      CUCHECKGOTO(cuMulticastAddDevice(recvMcHandle, comm->nvlsResources->dev), ret, fail);
+      CUCHECKGOTO(cuMulticastBindAddr(recvMcHandle, 0, (CUdeviceptr)baseRecv, baseRecvSize, 0), ret, fail);
+
+      // Create a VA for the NVLS
+      CUCHECKGOTO(cuMemAddressReserve(&regRecvPtr, baseRecvSize, recvGran, 0U, 0), ret, fail);
+      // Map the VA locally
+      CUCHECKGOTO(cuMemMap(regRecvPtr, baseRecvSize, 0, recvMcHandle, 0), ret, fail);
+      CUCHECKGOTO(cuMemSetAccess(regRecvPtr, baseRecvSize, &comm->nvlsResources->accessDesc, 1), ret, fail);
+
+      recvRecord = ncclMemoryPoolAlloc<struct ncclNvlsMcHandleList>(&comm->memPool_ncclNvlsHandleList, &comm->memPermanent);
+      recvRecord->mcHandle = recvMcHandle;
+      recvRecord->ptr = regRecvPtr;
+      recvRecord->dev = comm->nvlsResources->dev;
+      recvRecord->size = baseRecvSize;
+    }
+
+    localRegBufUsed = true;
+  }
+
+exit:
+  if (localRegBufUsed == false) {
+    if (sendRecord) {
+      ncclNvlsDeregBuffer(&sendRecord->mcHandle, sendRecord->ptr, sendRecord->dev, sendRecord->size);
+      ncclMemoryPoolFree(&comm->memPool_ncclNvlsHandleList, sendRecord);
+    }
+
+    if (recvRecord) {
+      ncclNvlsDeregBuffer(&recvRecord->mcHandle, recvRecord->ptr, recvRecord->dev, recvRecord->size);
+      ncclMemoryPoolFree(&comm->memPool_ncclNvlsHandleList, recvRecord);
+    }
+  } else {
+    if (sendRecord) {
+      *outRegBufSend = (void*)((uintptr_t)regSendPtr + (uintptr_t)sendbuff - (uintptr_t)baseSend);
+      ncclIntruQueueEnqueue(&plan->nvlsMcHandleQueue, sendRecord);
+    }
+
+    if (recvRecord) {
+      *outRegBufRecv = (void*)((uintptr_t)regRecvPtr + (uintptr_t)recvbuff - (uintptr_t)baseRecv);
+      ncclIntruQueueEnqueue(&plan->nvlsMcHandleQueue, recvRecord);
+    }
+
+    INFO(NCCL_NVLS, "rank %d successfully graph-registered sendbuff %p, recvbuff %p, sendbuff size %ld (register size %ld, sendGran %ld), recvbuff size %ld (register size %ld, recvGran %ld), reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, baseSendSize, sendGran, recvbuffSize, baseRecvSize, recvGran, (void*)regSendPtr, (void*)regRecvPtr);
+  }
+
+  *outRegBufUsed = localRegBufUsed;
+  free(regBufFlags);
+  free(rdata);
+  /* always return success. */
+  return ncclSuccess;
+fail:
+  localRegBufUsed = false;
+  goto exit;
+}
+
 #else
 
 /*
@@ -413,4 +810,18 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
   return ncclSuccess;
 }
 
+ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) {
+  *outRegBufUsed = false;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) {
+  *outRegBufUsed = false;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) {
+  return ncclSuccess;
+}
+
 #endif /* CUDA_VERSION >= 12010 */
diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
index 3630233307..3e4dab7e44 100644
--- a/src/transport/p2p.cc
+++ b/src/transport/p2p.cc
@@ -193,9 +193,13 @@ ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, v
     // cuMem API support
     CUmemAllocationHandleType type = NCCL_P2P_HANDLE_TYPE;
     CUmemGenericAllocationHandle handle;
-
     NCCLCHECK(ncclCuMemAlloc(ptr, &handle, size));
-    CUCHECK(cuMemExportToShareableHandle(&ipcDesc->cuDesc, handle, type, 0));
+    if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
+      // Return the native cuMem handle for later Export/Import via UDS
+      memcpy(&ipcDesc->cuDesc.data, &handle, sizeof(handle));
+    } else {
+      CUCHECK(cuMemExportToShareableHandle(&ipcDesc->cuDesc, handle, type, 0));
+    }
 #else
     return ncclInternalError;
 #endif
@@ -215,17 +219,6 @@ ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, v
 }
 
 ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc) {
-  if (ncclCuMemEnable()) {
-    // cuMem API support
-    CUmemAllocationHandleType type = NCCL_P2P_HANDLE_TYPE;
-
-    if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
-      int fd = *(int *) &ipcDesc->cuDesc.data;
-      if (fd <= 0) return ncclInternalError;
-      (void) close(fd);
-    }
-  }
-
   return ncclSuccess;
 }
 
@@ -242,20 +235,20 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz
     if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
       // UDS fd support
       struct ncclProxyConnector proxyConn;
-      int fd = *(int *)(&cuDesc->data);
-      int newFd = -1;
+      int fd = -1;
+      // Send cuMem handle to remote for conversion to an fd
       NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpPeer, &proxyConn));
-      NCCLCHECK(ncclProxyClientConvertFdBlocking(comm, &proxyConn, fd, &newFd));
-      INFO(NCCL_P2P, "UDS converted fd %d -> %d on peer %d", fd, newFd, tpPeer);
-      CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)newFd, type));
-      close(newFd);
+      NCCLCHECK(ncclProxyClientGetFdBlocking(comm, &proxyConn, &cuDesc->data, &fd));
+      INFO(NCCL_P2P, "UDS converted handle 0x%lx to fd %d on remote peer %d", *(uint64_t*)&cuDesc->data, fd, tpPeer);
+      CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)fd, type));
+      (void) close(fd);
     } else {
       CUCHECK(cuMemImportFromShareableHandle(&handle, cuDesc, type));
     }
     CUCHECK(cuMemAddressReserve(&dptr, size, /* alignment */ 0, /* addr */ 0, /* flags */ 0));
     CUCHECK(cuMemMap(dptr, size, /* offset */ 0, handle, /* flags */ 0));
 
-    TRACE(NCCL_P2P, "Imported shareable buffer size %zi handle 0x%lx dptr %p", size, (long)handle, (void*)dptr);
+    TRACE(NCCL_P2P, "Imported shareable buffer size %zi handle 0x%llx dptr %p", size, handle, (void*)dptr);
 
     // Allow access by the local GPU
     CUmemAccessDesc accessDesc = {};
@@ -263,7 +256,7 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz
     accessDesc.location.id = comm->cudaDev;
     accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
     CUCHECK(cuMemSetAccess(dptr, size, &accessDesc, 1));
-    TRACE(NCCL_P2P, "Set Access for %p size %zi dev %d", (void*)dptr, size, accessDesc.location.id);
+    TRACE(NCCL_P2P, "Set Access for %p size %zi on dev %d", (void*)dptr, size, accessDesc.location.id);
 
     *devMemPtr = (void *)dptr;
 #else
@@ -294,8 +287,8 @@ static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo*
   return ncclSuccess;
 }
 
-static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) {
-  if (!ncclCuMemEnable() && myInfo->pidHash == peerInfo->pidHash) {
+static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* proxyConn, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) {
+  if (myInfo->pidHash == peerInfo->pidHash) {
     if (peerInfo->cudaDev != myInfo->cudaDev) {
       // Same PID different GPUs, enable P2P access
       // Legacy CUDA IPC
@@ -307,6 +300,18 @@ static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclPeerInfo* myInfo, s
             peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err));
         return ncclInternalError;
       }
+#if CUDART_VERSION >= 11030
+      // cuMem API support
+      if (ncclCuMemEnable()) {
+        // Allow direct access to the remote buffer from the local GPU
+        CUmemAccessDesc accessDesc = {};
+        accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+        accessDesc.location.id = myInfo->cudaDev;
+        accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+        INFO(NCCL_P2P, "Set Access for buffer %p size %zi on dev %d", p2pBuff->directPtr, p2pBuff->size, peerInfo->cudaDev);
+        CUCHECK(cuMemSetAccess((CUdeviceptr) p2pBuff->directPtr, p2pBuff->size, &accessDesc, 1));
+      }
+#endif
     }
     *devMem = p2pBuff->directPtr;
     *ipcPtr = NULL;
@@ -349,7 +354,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
 
   if (intermediateRank == -1) {
     info->rank = myInfo->rank;
-    if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0 && !ncclCuMemEnable()) {
+    if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
       resources->type = P2P_DIRECT;
       send->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
       INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/direct pointer%s",
@@ -384,7 +389,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
     memcpy(info->shmName, resources->proxyInfo.shmName, sizeof(info->shmName));
   } else {
     NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
-    NCCLCHECK(p2pMap(comm, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->sendDevMem, &resources->sendMemIpc));
+    NCCLCHECK(p2pMap(comm, &send->proxyConn, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->sendDevMem, &resources->sendMemIpc));
   }
 
   return ncclSuccess;
@@ -413,7 +418,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
 
   if (intermediateRank == -1) {
     info->rank = myInfo->rank;
-    if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0 && !ncclCuMemEnable()) {
+    if (myInfo->pidHash == peerInfo->pidHash && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) {
       resources->type = P2P_DIRECT;
       recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
     } else {
@@ -437,7 +442,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
   NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, tpProxyRank, &recv->proxyConn));
   NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
 
-  NCCLCHECK(p2pMap(comm, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->recvDevMem, &resources->recvMemIpc));
+  NCCLCHECK(p2pMap(comm, &recv->proxyConn, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->recvDevMem, &resources->recvMemIpc));
   return ncclSuccess;
 }
 
@@ -447,7 +452,7 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co
   struct ncclRecvMem* remDevMem = NULL;
   struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
 
-  NCCLCHECK(p2pMap(comm, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->recvMemIpc));
+  NCCLCHECK(p2pMap(comm, &send->proxyConn, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->recvMemIpc));
 
   char* buff = (char*)(remDevMem+1);
   for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
@@ -474,6 +479,8 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co
     send->conn.ptrExchange = &resources->sendDevMem->ptrExchange;
     send->conn.redOpArgExchange = resources->sendDevMem->redOpArgExchange;
   }
+  // We must assign the proxyConn's proxyProgress property for proper checking at enqueue-time
+  send->proxyConn.proxyProgress = p2pTransport.send.proxyProgress;
   return ncclSuccess;
 }
 
@@ -495,7 +502,7 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
     recv->conn.tail = &resources->devShm->recvMem.tail;
     recv->conn.head = &resources->devShm->sendMem.head;
   } else {
-    NCCLCHECK(p2pMap(comm, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc));
+    NCCLCHECK(p2pMap(comm, &recv->proxyConn, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc));
 
     struct ncclRecvMem* devMem = resources->recvDevMem;
     recv->conn.tail = &devMem->tail;
diff --git a/src/transport/shm.cc b/src/transport/shm.cc
index aed8dd7d9e..5b24429199 100644
--- a/src/transport/shm.cc
+++ b/src/transport/shm.cc
@@ -164,6 +164,10 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
     send->conn.tail = &proxyInfo.ceRecvMem->tail;
     send->conn.sizesFifo = proxyInfo.ceRecvMem->sizesFifo;
   }
+
+  // We must assign the proxyConn's proxyProgress property for proper checking at enqueue-time
+  send->proxyConn.proxyProgress = shmTransport.send.proxyProgress;
+
   return ncclSuccess;
 }
 
@@ -193,6 +197,10 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
     recv->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
     recv->conn.tail = &proxyInfo.ceRecvMem->tail;
   }
+
+  // We must assign the proxyConn's proxyProgress property for proper checking at enqueue-time
+  recv->proxyConn.proxyProgress = shmTransport.recv.proxyProgress;
+
   return ncclSuccess;
 }