2.19.1-1

Add local user buffer registration for NVLink SHARP. Add tuning plugin support. Increase net API to v7 to allow for device-side packet reordering; remove support for v4 plugins. Add support for RoCE ECE. Add support for C2C links. Better detect SHM allocation failures to avoid crash with Bus Error. Fix missing thread unlocks in bootstrap (Fixes #936). Disable network flush by default on H100. Move device code from src/collectives/device to src/device.
2023-09-26 05:47:28 -07:00
@@ -5,7 +5,7 @@
 #
 NCCL_HOME:=../../build/
 CUDA_HOME:=/usr/local/cuda
-INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include
+INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
 PLUGIN_SO:=libnccl-net.so

 default: $(PLUGIN_SO)
@@ -24,6 +24,7 @@ typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCC

 typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

+#include "net_v7.h"
 #include "net_v6.h"
 #include "net_v5.h"
 #include "net_v4.h"
@@ -0,0 +1,31 @@
+/*************************************************************************
+ * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NET_DEVICE_H_
+#define NET_DEVICE_H_
+
+#include <stddef.h> // size_t, used by ncclNetDeviceHandle_v7_t below
+
+#define NCCL_NET_DEVICE_INVALID_VERSION      0x0
+#define NCCL_NET_MTU_SIZE                    4096
+
+// Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin
+// version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
+#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7
+
+typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType;
+
+typedef struct {
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+  void* handle;                    // NOTE(review): presumably an opaque plugin-provided device handle - confirm
+  size_t size;                     // NOTE(review): presumably the size in bytes of the data behind handle - confirm
+  int needsProxyProgress;          // NOTE(review): presumably non-zero when the host proxy must still progress this comm - confirm
+} ncclNetDeviceHandle_v7_t;
+
+typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t;
+
+#endif
@@ -18,8 +18,6 @@ typedef struct {
  int maxRecvs;   // Maximum number of grouped receives.
 }ncclNetProperties_v6_t;

-typedef ncclNetProperties_v6_t ncclNetProperties_t;
-
 typedef struct {
  // Name of the network (mainly for logs)
  const char* name;
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NCCL_NET_V7_H_
+#define NCCL_NET_V7_H_
+
+#include "net_device.h"
+
+typedef struct {
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+} ncclNetProperties_v7_t;
+
+typedef ncclNetProperties_v7_t ncclNetProperties_t;
+
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has been completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+} ncclNet_v7_t;
+
+#endif // end include guard
@@ -4,7 +4,7 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include <nccl/net.h>
+#include "net.h"

 #define __hidden __attribute__ ((visibility("hidden")))

@@ -15,14 +15,14 @@ __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess;

 __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; }
 __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; }
-__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v6_t* props) {
+__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v7_t* props) {
  //pluginPciPath(dev, &props.pciPath);
  //pluginPtrSupport(dev, &props.ptrSupport);
  return ncclInternalError;
 }
 __hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; }
-__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm) { return ncclInternalError; }
-__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm) { return ncclInternalError; }
 __hidden ncclResult_t pluginRegMr(void* collComm, void* data, int size, int type, void** mhandle) { return ncclInternalError; }
 __hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; }
 __hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;}
@@ -33,10 +33,12 @@ __hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return n
 __hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; }
 __hidden ncclResult_t pluginCloseRecv(void* recvComm) { return ncclInternalError; }
 __hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginIrecvConsumed(void* recvComm, int n, void* request) { return ncclInternalError; }
+__hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_mhandle) { return ncclInternalError; }

 #define PLUGIN_NAME "Plugin"

-const ncclNet_v6_t ncclNetPlugin_v6 = {
+const ncclNet_v7_t ncclNetPlugin_v7 = {
  .name = PLUGIN_NAME,
  .init = pluginInit,
  .devices = pluginDevices,
@@ -54,6 +56,37 @@ const ncclNet_v6_t ncclNetPlugin_v6 = {
  .closeSend = pluginCloseSend,
  .closeRecv = pluginCloseRecv,
  .closeListen = pluginCloseListen,
+  .getDeviceMr = pluginGetDeviceMr,
+  .irecvConsumed = pluginIrecvConsumed,
+};
+
+__hidden ncclResult_t pluginGetProperties_v6(int dev, ncclNetProperties_v6_t* props) {
+  //pluginPciPath(dev, &props.pciPath);
+  //pluginPtrSupport(dev, &props.ptrSupport);
+  return ncclInternalError;
+}
+
+__hidden ncclResult_t pluginConnect_v6(int dev, void* handle, void** sendComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginAccept_v6(void* listenComm, void** recvComm) { return ncclInternalError; }
+
+const ncclNet_v6_t ncclNetPlugin_v6 = {
+  .name = PLUGIN_NAME,
+  .init = pluginInit,
+  .devices = pluginDevices,
+  .getProperties = pluginGetProperties_v6,
+  .listen = pluginListen,
+  .connect = pluginConnect_v6,
+  .accept = pluginAccept_v6,
+  .regMr = pluginRegMr,
+  .regMrDmaBuf = pluginRegMrDmaBuf,
+  .deregMr = pluginDeregMr,
+  .isend = pluginIsend,
+  .irecv = pluginIrecv,
+  .iflush = pluginIflush,
+  .test = pluginTest,
+  .closeSend = pluginCloseSend,
+  .closeRecv = pluginCloseRecv,
+  .closeListen = pluginCloseListen
 };

 /* v5 Compat */
@@ -61,10 +94,10 @@ const ncclNet_v5_t ncclNetPlugin_v5 = {
  .name = PLUGIN_NAME,
  .init = pluginInit,
  .devices = pluginDevices,
-  .getProperties = pluginGetProperties,
+  .getProperties = pluginGetProperties_v6,
  .listen = pluginListen,
-  .connect = pluginConnect,
-  .accept = pluginAccept,
+  .connect = pluginConnect_v6,
+  .accept = pluginAccept_v6,
  .regMr = pluginRegMr,
  .deregMr = pluginDeregMr,
  .isend = pluginIsend,
@@ -79,7 +112,7 @@ const ncclNet_v5_t ncclNetPlugin_v5 = {
 /* v4 Compat */
 static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* props) {
  ncclNetProperties_v6_t props_v6;
-  ncclResult_t ret = pluginGetProperties(dev, &props_v6);
+  ncclResult_t ret = pluginGetProperties_v6(dev, &props_v6);
  if (ret != ncclSuccess) return ret;
  props->name = props_v6.name;
  props->pciPath = props_v6.pciPath;
@@ -103,14 +136,16 @@ static ncclResult_t pluginIflush_v4(void* recvComm, void* data, int size, void*
 static ncclResult_t pluginConnect_v4(int dev, void* handle, void** sendComm) {
  ncclResult_t ret;
  do {
-    ret = pluginConnect(dev, handle, sendComm);
+    ncclNetDeviceHandle_v7_t* sendDevComm = NULL; // v4 exposes no device comm; named so it cannot shadow the 'handle' parameter
+    ret = pluginConnect(dev, handle, sendComm, &sendDevComm); // forward the caller's connect handle; loop until a sendComm is returned
  } while (ret == ncclSuccess && *sendComm == NULL);
  return ret;
 }
 static ncclResult_t pluginAccept_v4(void* listenComm, void** recvComm) {
  ncclResult_t ret;
  do {
-    ret = pluginAccept(listenComm, recvComm);
+    ncclNetDeviceHandle_v7_t* recvDevComm = NULL; // v4 exposes no device comm; the value written here is discarded
+    ret = pluginAccept(listenComm, recvComm, &recvDevComm); // loop until the accept call yields a recvComm
  } while (ret == ncclSuccess && *recvComm == NULL);
  return ret;
 }
@@ -151,12 +186,12 @@ static ncclResult_t pluginInit_v3(ncclDebugLogger_t logFunction) {
 static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) {
  char pluginHandle[NCCL_NET_HANDLE_MAXSIZE];
  ncclResult_t ret = pluginListen(dev, &pluginHandle, listenComm);
-  memcpy(handle, &pluginHandle, NCCL_NET_HANDLE_MAXSIZE_V3);
+  memcpy(handle, &pluginHandle, NCCL_NET_HANDLE_MAXSIZE_V4);
  return ret;
 }
 static ncclResult_t pluginConnect_v3(int dev, void* handle, void** sendComm) {
  char pluginHandle[NCCL_NET_HANDLE_MAXSIZE];
-  memcpy(&pluginHandle, handle, NCCL_NET_HANDLE_MAXSIZE_V3);
+  memcpy(&pluginHandle, handle, NCCL_NET_HANDLE_MAXSIZE_V4);
  return pluginConnect_v4(dev, &pluginHandle, sendComm);
 }
 const ncclNet_v3_t ncclNetPlugin_v3 = {
@@ -0,0 +1,17 @@
+#
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+NCCL_HOME:=../../build/
+CUDA_HOME:=/usr/local/cuda
+INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
+PLUGIN_SO:=libnccl-tuner.so
+
+default: $(PLUGIN_SO)
+
+$(PLUGIN_SO): plugin.c
+	$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
+
+clean:
+	rm -f $(PLUGIN_SO)
@@ -0,0 +1,77 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_TUNER_H_
+#define NCCL_TUNER_H_
+
+#include "nccl.h"
+
+typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
+typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;
+
+typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
+
+#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
+typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t;
+
+#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
+#define NCCL_ALGO_UNDEF -1
+#define NCCL_ALGO_TREE 0
+#define NCCL_ALGO_RING 1
+#define NCCL_ALGO_COLLNET_DIRECT 2
+#define NCCL_ALGO_COLLNET_CHAIN 3
+#define NCCL_ALGO_NVLS 4
+#define NCCL_ALGO_NVLS_TREE 5
+
+#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
+#define NCCL_PROTO_UNDEF -1
+#define NCCL_PROTO_LL 0
+#define NCCL_PROTO_LL128 1
+#define NCCL_PROTO_SIMPLE 2
+
+// API to be implemented by external tuner
+typedef struct {
+  // Name of the tuner
+  const char* name;
+
+  // Initializes tuner states.
+  // nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
+  // nNodes: number of nodes in current communicator.
+  // logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction);
+
+  // Gets info (algorithm, protocol, number of channels) for a given collective.
+  // Inputs:
+  //   - collType: collective type, e.g., allreduce, allgather, ...
+  //   - nBytes: collective size in bytes
+  //   - collNetSupport: whether collnet supports this type
+  //   - nvlsSupport: whether NVLink SHARP supports this type
+  //   - numPipeOps: number of operations in the group
+  //
+  // Outputs:
+  //   - algorithm: selected algorithm to be used for the given collective
+  //   - protocol: selected protocol to be used for the given collective
+  //   - nChannels: number of channels (hence SMs) to be used.
+  //
+  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+  // default tuning for the given collective.
+  // Also, the plugin is allowed to not set any output, or set only the
+  // algorithm and protocol, but not only the algorithm or only the protocol.
+  // Unset fields will be set automatically by NCCL.
+  ncclResult_t (*getCollInfo)(ncclFunc_t collType, size_t nBytes,
+                              int collNetSupport, int nvlsSupport, int numPipeOps,
+                              int *algorithm, int *protocol, int* nChannels);
+
+  // Terminates the plugin and cleans up any resources that the plugin allocated.
+  ncclResult_t (*destroy)();
+} ncclTuner_v1_t;
+
+typedef ncclTuner_v1_t ncclTuner_t;
+
+#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v1"
+
+#endif
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "tuner.h"
+
+#define __hidden __attribute__ ((visibility("hidden")))
+
+__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction) { return ncclSuccess; }
+
+__hidden ncclResult_t pluginGetCollInfo(ncclFunc_t collType, size_t nBytes,
+                              int collNetSupport, int nvlsSupport, int numPipeOps,
+                              int *algorithm, int *protocol, int* nChannels) { *algorithm = NCCL_ALGO_RING; *protocol = NCCL_PROTO_SIMPLE; return ncclSuccess; }
+
+__hidden ncclResult_t pluginDestroy() { return ncclSuccess; }
+
+#define PLUGIN_NAME "Example"
+
+const ncclTuner_v1_t ncclTunerPlugin_v1 = {
+  .name = PLUGIN_NAME,
+  .init = pluginInit,
+  .getCollInfo = pluginGetCollInfo,
+  .destroy = pluginDestroy
+};
@@ -9,6 +9,7 @@ PREFIX ?= /usr/local
 VERBOSE ?= 0
 KEEP ?= 0
 DEBUG ?= 0
+ASAN ?= 0
 TRACE ?= 0
 PROFAPI ?= 1
 NVTX ?= 1
@@ -85,6 +86,13 @@ NVCUFLAGS += -O0 -G -g
 CXXFLAGS  += -O0 -g -ggdb3
 endif

+# Make sure to run with ASAN_OPTIONS=protect_shadow_gap=0 otherwise CUDA will fail with OOM
+ifneq ($(ASAN), 0)
+CXXFLAGS += -fsanitize=address
+LDFLAGS += -fsanitize=address -static-libasan
+NVLDFLAGS += -Xcompiler -fsanitize=address,-static-libasan
+endif
+
 ifneq ($(VERBOSE), 0)
 NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
 CXXFLAGS  += -Wall -Wextra
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
-NCCL_MINOR   := 18
-NCCL_PATCH   := 5
+NCCL_MINOR   := 19
+NCCL_PATCH   := 1
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
@@ -3,19 +3,17 @@
 #
 # See LICENSE.txt for license information
 #
-
 include ../makefiles/common.mk
 include ../makefiles/version.mk

 ##### src files
 INCEXPORTS  := nccl.h nccl_net.h
-LIBSRCFILES := init.cc init_nvtx.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc \
-		misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvsymbols.cc misc/ibvwrap.cc misc/gdrwrap.cc \
-		misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \
-		misc/ipcsocket.cc \
-		transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc transport/nvls.cc \
-                collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
-                graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc
+LIBSRCFILES := \
+	bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
+	init.cc init_nvtx.cc net.cc proxy.cc transport.cc \
+	$(wildcard graph/*.cc) \
+	$(wildcard misc/*.cc) \
+	$(wildcard transport/*.cc)

 ##### lib files
 LIBNAME     := libnccl.so
@@ -45,7 +43,7 @@ LIBOBJ     := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o)
 DEPFILES   := $(LIBOBJ:%.o=%.d)
 LDFLAGS    += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl

-DEVICELIB  := $(BUILDDIR)/obj/collectives/device/colldevice.a
+DEVMANIFEST := $(BUILDDIR)/obj/device/manifest

 ##### rules
 build : lib staticlib
@@ -54,8 +52,8 @@ lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) $(PKGDIR)/$(PKGTARGET)

 staticlib : $(LIBDIR)/$(STATICLIBTARGET)

-$(DEVICELIB): ALWAYS_REBUILD $(INCTARGETS)
-	$(MAKE) -C collectives/device
+$(DEVMANIFEST): ALWAYS_REBUILD $(INCTARGETS)
+	$(MAKE) -C ./device

 # Empty target to force rebuild
 ALWAYS_REBUILD:
@@ -75,21 +73,17 @@ $(INCDIR)/nccl.h : nccl.h.in ../makefiles/version.mk
 	    -e "s/\$${nccl:Version}/$(NCCL_VERSION)/g" \
 	    $< > $@

-$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVICELIB)
+$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVMANIFEST)
 	@printf "Linking    %-35s > %s\n" $(LIBTARGET) $@
 	mkdir -p $(LIBDIR)
-	$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $(DEVICELIB) $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $$(cat $(DEVMANIFEST)) $(LDFLAGS)
 	ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
 	ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)

-null :=
-space := $(null) #
-comma := ,
-
-$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB)
+$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVMANIFEST)
 	@printf "Archiving  %-35s > %s\n" $(STATICLIBTARGET) $@
 	mkdir -p $(LIBDIR)
-	printf "create $@\naddlib $(DEVICELIB)\naddmod $(subst $(space),$(comma),$(strip $(LIBOBJ)))\nsave\nend" | ar -M
+	ar cr $@ $(LIBOBJ) $$(cat $(DEVMANIFEST))

 $(PKGDIR)/nccl.pc : nccl.pc.in
 	mkdir -p $(PKGDIR)
@@ -126,7 +120,7 @@ $(OBJDIR)/%.o : %.cc $(INCTARGETS)
 	@rm -f $(@:%.o=%.d.tmp)

 clean :
-	$(MAKE) -C collectives/device clean
+	$(MAKE) -C device clean
 	rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR}

 install : build
@@ -12,6 +12,7 @@
 #include <unistd.h>
 #include <sys/types.h>
 #include "proxy.h"
+#include "param.h"

 struct bootstrapRootArgs {
  struct ncclSocket* listenSock;
@@ -28,21 +29,24 @@ ncclResult_t bootstrapNetInit() {
  if (bootstrapNetInitDone == 0) {
    pthread_mutex_lock(&bootstrapNetLock);
    if (bootstrapNetInitDone == 0) {
-      char* env = getenv("NCCL_COMM_ID");
+      const char* env = ncclGetEnv("NCCL_COMM_ID");
      if (env) {
        union ncclSocketAddress remoteAddr;
        if (ncclSocketGetAddrFromString(&remoteAddr, env) != ncclSuccess) {
          WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
+          pthread_mutex_unlock(&bootstrapNetLock);
          return ncclInvalidArgument;
        }
        if (ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
          WARN("NET/Socket : No usable listening interface found");
+          pthread_mutex_unlock(&bootstrapNetLock);
          return ncclSystemError;
        }
      } else {
        int nIfs = ncclFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1);
        if (nIfs <= 0) {
          WARN("Bootstrap : no socket interface found");
+          pthread_mutex_unlock(&bootstrapNetLock);
          return ncclInternalError;
        }
      }
@@ -189,7 +193,7 @@ ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
  memset(handle, 0, sizeof(ncclBootstrapHandle));
  NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic)));

-  char* env = getenv("NCCL_COMM_ID");
+  const char* env = ncclGetEnv("NCCL_COMM_ID");
  if (env) {
    INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
    if (ncclSocketGetAddrFromString(&handle->addr, env) != ncclSuccess) {
@@ -0,0 +1,167 @@
+/*************************************************************************
+ * Copyright (c) 2015-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "argcheck.h" // Need some checks here since we access comm
+#include "collectives.h"
+#include "enqueue.h"
+#include "nccl.h"
+
+NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
+  // Just pass the size of one message and not the total bytes sent/received.
+  constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = {
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}
+  };
+  size_t msgsize = sendcount * ncclTypeSize(datatype);
+  NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, msgsize)
+
+  struct ncclInfo info = { ncclFuncAllGather, "AllGather",
+    sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
+    ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
+  return ncclEnqueueCheck(&info);
+}
+
+NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
+ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
+  struct NvtxParamsAllReduce {
+    size_t bytes;
+    ncclRedOp_t op;
+  };
+  // Just pass the size of one message and not the total bytes sent/received.
+  static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = {
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
+    {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
+      offsetof(NvtxParamsAllReduce, op)}
+  };
+  NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op};
+  NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload)
+
+  struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
+    sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
+    ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
+  return ncclEnqueueCheck(&info);
+}
+
+NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream) {
+  struct NvtxParamsBroadcast {
+    size_t bytes;
+    int root;
+  };
+  constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = {
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)}
+  };
+  NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root};
+  NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload)
+
+  struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
+    sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
+    BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
+  return ncclEnqueueCheck(&info);
+}
+/* Deprecated original "in place" function, similar to MPI */
+NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream) {
+  return ncclBroadcast(buff, buff, count, datatype, root, comm, stream);
+}
+
+NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+  struct NvtxParamsReduce {
+    size_t bytes;
+    int root;
+    ncclRedOp_t op;
+  };
+  constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = {
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)},
+    {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
+      offsetof(NvtxParamsReduce, op)}
+  };
+  NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op};
+  NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload)
+
+  struct ncclInfo info = { ncclFuncReduce, "Reduce",
+    sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
+    REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
+  return ncclEnqueueCheck(&info);
+}
+
+NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
+ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
+  struct NvtxParamsReduceScatter {
+    size_t bytes;
+    ncclRedOp_t op;
+  };
+  constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = {
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
+    {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
+      offsetof(NvtxParamsReduceScatter, op)}
+  };
+  NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op};
+  NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload)
+
+  struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
+    sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
+    REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
+  return ncclEnqueueCheck(&info);
+}
+
+struct NvtxParamsSendRecv {
+    size_t bytes;
+    int peer;
+};
+constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = {
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)}
+};
+
+NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream) {
+  NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
+  NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload)
+
+  struct ncclInfo info = { ncclFuncSend, "Send",
+    NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
+    1, 1 };
+  ncclResult_t ret;
+  NCCLCHECK(ncclGroupStart());
+  ret = ncclEnqueueCheck(&info);
+  NCCLCHECK(ncclGroupEnd());
+  return ret;
+}
+
+NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream) {
+  NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
+  NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload)
+
+  struct ncclInfo info = { ncclFuncRecv, "Recv",
+    NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
+    1, 1 };
+  ncclResult_t ret;
+  NCCLCHECK(ncclGroupStart());
+  ret = ncclEnqueueCheck(&info);
+  NCCLCHECK(ncclGroupEnd());
+  return ret;
+}
@@ -1,25 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "enqueue.h"
-#include "collectives.h"
-
-NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
-    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
-    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
-  // Just pass the size of one message and not the total bytes sent/received.
-  constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = {
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}
-  };
-  size_t msgsize = sendcount * ncclTypeSize(datatype);
-  NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, msgsize)
-
-  struct ncclInfo info = { ncclFuncAllGather, "AllGather",
-    sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
-    ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
-  return ncclEnqueueCheck(&info);
-}
@@ -1,31 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "enqueue.h"
-#include "nccl.h"
-
-NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
-ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
-  struct NvtxParamsAllReduce {
-    size_t bytes;
-    ncclRedOp_t op;
-  };
-  // Just pass the size of one message and not the total bytes sent/received.
-  static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = {
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
-    {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
-      offsetof(NvtxParamsAllReduce, op)}
-  };
-  NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op};
-  NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload)
-
-  struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
-    sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
-    ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
-  return ncclEnqueueCheck(&info);
-}
@@ -1,37 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "enqueue.h"
-#include "collectives.h"
-
-NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream) {
-  struct NvtxParamsBroadcast {
-    size_t bytes;
-    int root;
-  };
-  constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = {
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)}
-  };
-  NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root};
-  NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload)
-
-  struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
-    sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
-    BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
-  return ncclEnqueueCheck(&info);
-}
-/* Deprecated original "in place" function, similar to MPI */
-NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream) {
-  return ncclBroadcast(buff, buff, count, datatype, root, comm, stream);
-}
-
@@ -1,76 +0,0 @@
-#
-# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
-#
-# See LICENSE.txt for license information
-#
-
-include ../../../makefiles/common.mk
-include ../../../makefiles/version.mk
-
-BUILDDIR ?= $(abspath ../../../build)
-OBJDIR := $(BUILDDIR)/obj/collectives/device
-
-LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu sendrecv.cu onerank_reduce.cu
-
-LIBSRCFILES += functions.cu
-
-DEPFILES   := $(patsubst %.cu, $(OBJDIR)/%.d, $(LIBSRCFILES))
-DEPENDFILES:= $(DEPFILES:%.d=%.dep)
-STATICLIB  := $(OBJDIR)/colldevice.a
-DEVOBJ     := $(OBJDIR)/devlink.o
-RULESFILE  := $(OBJDIR)/Makefile.rules
-
-NVCUFLAGS  += -I. -I.. -I$(BUILDDIR)/include -I../../include --compiler-options "-fPIC -fvisibility=hidden"
-
-
-all: $(STATICLIB)
-
-# Dummy rule so that the extra dependency (%.dep) files are preserved by make
-all_deps: $(DEPENDFILES)
-
-# Auto-generating the rules per op/reduction/datatype/algorithm
-$(RULESFILE) : gen_rules.sh
-	@printf "Generating %-35s > %s\n" rules $@
-	@mkdir -p $(OBJDIR)
-	@CUDA_MAJOR=${CUDA_MAJOR} CUDA_MINOR=${CUDA_MINOR} ./gen_rules.sh $(OBJDIR) > $@
-
-include $(RULESFILE)
-
-LIBOBJ     := $(GENOBJS) $(OBJDIR)/functions.o $(OBJDIR)/onerank_reduce.o
-
-include $(DEPFILES)
-
-$(STATICLIB): $(LIBOBJ) $(DEVOBJ)
-	@printf "Archiving  %-35s > %s\n" objects $@
-	ar cr $@ $^
-
-# We do not want make to build *.d when running make clean.
-# So we only provide targets for .dep which will produce .dep and .d,
-# with only .d being included, and .dep keeping track of what needs to
-# be regenerated.
-$(OBJDIR)/%.dep : %.cu
-	@mkdir -p $(OBJDIR)
-	@$(NVCC) $(NVCUFLAGS) -M $< -o $@.tmp
-	@sed "0,/^.*:/s//$(subst /,\/,$@):/" $@.tmp > $@
-	@sed -e 's/.*://' -e 's/\\$$//' < $@.tmp | fmt -1 | \
-                sed -e 's/^ *//' -e 's/$$/:/' >> $@
-	@rm -f $@.tmp
-	@cp $@ $(@:.dep=.d)
-
-# Compiled kernels and collectives with relocatable device code ...
-$(OBJDIR)/functions.o : functions.cu $(OBJDIR)/functions.dep
-	@printf "Compiling  %-35s > %s\n" $< $@
-	mkdir -p `dirname $@`
-	$(NVCC) $(NVCUFLAGS) -dc $< -o $@
-
-$(OBJDIR)/onerank_reduce.o : onerank_reduce.cu $(OBJDIR)/onerank_reduce.dep
-	@printf "Compiling  %-35s > %s\n" $< $@
-	mkdir -p `dirname $@`
-	$(NVCC) $(NVCUFLAGS) -dc $< -o $@
-
-# ... and create the device-side linked object with all those.
-$(DEVOBJ) : $(LIBOBJ)
-	$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
-
-clean:
-	rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(RULESFILE) $(STATICLIB)
@@ -1,11 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "all_gather.h"
-#include "common.h"
-#include "collectives.h"
-
-IMPL_COLL_C(AllGather);
@@ -1,11 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "all_reduce.h"
-#include "common.h"
-#include "collectives.h"
-
-IMPL_COLL_R(AllReduce);
@@ -1,11 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "broadcast.h"
-#include "common.h"
-#include "collectives.h"
-
-IMPL_COLL_C(Broadcast);
@@ -1,122 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "devcomm.h"
-#include "collectives.h"
-#include "common.h"
-
-__shared__ ncclShmemData ncclShmem;
-#if __CUDA_ARCH__ < 700
-  __shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)];
-#endif
-
-#define NCCL_FUNC5(func, algo, devredop, type, nullify) \
-  MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL,     devredop, type)), \
-  MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL128,  devredop, type)), \
-  MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, SIMPLE, devredop, type))
-
-#define NCCL_FUNC4(func, devredop, type, nullify) \
-  NCCL_FUNC5(func, TREE,    devredop, type, nullify), \
-  NCCL_FUNC5(func, RING,    devredop, type, nullify), \
-  NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, nullify), \
-  NCCL_FUNC5(func, COLLNET_CHAIN,  devredop, type, nullify), \
-  NCCL_FUNC5(func, NVLS,           devredop, type, nullify), \
-  NCCL_FUNC5(func, NVLS_TREE,      devredop, type, nullify)
-
-#if defined(__CUDA_BF16_TYPES_EXIST__)
-// Must be consistent with ncclDataType_t
-#define NCCL_FUNCS3A(func, devredop, nullForFloat) \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, uint8_t, 0), \
-  NCCL_FUNC4(func, devredop, int32_t, 0), \
-  NCCL_FUNC4(func, devredop, uint32_t, 0), \
-  NCCL_FUNC4(func, devredop, int64_t, 0), \
-  NCCL_FUNC4(func, devredop, uint64_t, 0), \
-  NCCL_FUNC4(func, devredop, half, nullForFloat), \
-  NCCL_FUNC4(func, devredop, float, nullForFloat), \
-  NCCL_FUNC4(func, devredop, double, nullForFloat), \
-  NCCL_FUNC4(func, devredop, __nv_bfloat16, nullForFloat)
-#define NCCL_FUNCS3B(func, devredop) \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0)
-#else
-// Must be consistent with ncclDataType_t
-#define NCCL_FUNCS3A(func, devredop, nullForFloat) \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, uint8_t, 0), \
-  NCCL_FUNC4(func, devredop, int32_t, 0), \
-  NCCL_FUNC4(func, devredop, uint32_t, 0), \
-  NCCL_FUNC4(func, devredop, int64_t, 0), \
-  NCCL_FUNC4(func, devredop, uint64_t, 0), \
-  NCCL_FUNC4(func, devredop, half, nullForFloat), \
-  NCCL_FUNC4(func, devredop, float, nullForFloat), \
-  NCCL_FUNC4(func, devredop, double, nullForFloat)
-#define NCCL_FUNCS3B(func, devredop) \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0), \
-  NCCL_FUNC4(func, devredop, int8_t, 0)
-#endif
-
-// Must be consistent with ncclRedOp_t
-#define NCCL_FUNCS2A(func) \
-  NCCL_FUNCS3A(func, Sum,        /*nullForFloat=*/0), \
-  NCCL_FUNCS3A(func, Prod,       /*nullForFloat=*/0), \
-  NCCL_FUNCS3A(func, Max,        /*nullForFloat=*/0), \
-  NCCL_FUNCS3A(func, Min,        /*nullForFloat=*/0), \
-  NCCL_FUNCS3A(func, PreMulSum,  /*nullForFloat=*/0), \
-  NCCL_FUNCS3A(func, SumPostDiv, /*nullForFloat=*/1)
-
-#define NCCL_FUNCS2B(func) \
-  NCCL_FUNCS3B(func, Sum), \
-  NCCL_FUNCS3B(func, Sum), \
-  NCCL_FUNCS3B(func, Sum), \
-  NCCL_FUNCS3B(func, Sum), \
-  NCCL_FUNCS3B(func, Sum), \
-  NCCL_FUNCS3B(func, Sum)
-
-// Must be consistent with the ncclFuncSet enum
-__device__ ncclKern_t ncclFuncs[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
-// Don't try to initialize the host shadow copy of this device-side global
-// variable. There is no host pointer to a device-side function, which
-// confuses clang. This will be fixed in the next clang release.
-#if __CUDA_ARCH__
-  NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
-  NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t),
-  NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t),
-  NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t),
-  NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint32_t),
-  NCCL_ONERANK_REDUCE_NAME(PreMulSum, int64_t),
-  NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint64_t),
-  NCCL_ONERANK_REDUCE_NAME(PreMulSum, half),
-  NCCL_ONERANK_REDUCE_NAME(PreMulSum, float),
-  NCCL_ONERANK_REDUCE_NAME(PreMulSum, double),
-  #if defined(__CUDA_BF16_TYPES_EXIST__)
-    NCCL_ONERANK_REDUCE_NAME(PreMulSum, __nv_bfloat16),
-  #endif
-  NCCL_FUNCS2B(Broadcast),
-  NCCL_FUNCS2A(Reduce),
-  NCCL_FUNCS2B(AllGather),
-  NCCL_FUNCS2A(ReduceScatter),
-  NCCL_FUNCS2A(AllReduce)
-#endif
-};
-
-// Workaround for https://reviews.llvm.org/D55580
-__device__ void ncclWorkaroundClangD55580() {}
@@ -1,43 +0,0 @@
-#!/bin/bash
-#
-# Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
-#
-# See LICENSE.txt for license information
-#
-
-dir=$1
-
-datatypes="i8 u8 i32 u32 i64 u64 f16 f32 f64"
-if [ "$CUDA_MAJOR" -ge 11 ]
-then
-    datatypes+=" bf16"
-fi
-
-targets="GENOBJS := \\\\\n"
-
-for base in sendrecv all_reduce all_gather broadcast reduce reduce_scatter; do
-  opn=0
-  for op in sum prod min max premulsum sumpostdiv; do
-    dtn=0
-    # Order must match that of the ncclDataType_t enum
-    for dt in ${datatypes}; do
-      # Generate a unique filename for each compilation unit,
-      # otherwise the __nv_module_id may conflict at link time
-      echo "${dir}/${base}_${op}_${dt}.cu : ${base}.cu"
-      echo "	@printf \"Copying    %-35s > %s\\\\n\" \$< \$@"
-      echo "	cp \$< \$@"
-      echo ""
-      # Compile the file
-      echo "${dir}/${base}_${op}_${dt}.o : ${dir}/${base}_${op}_${dt}.cu ${base}.cu ${dir}/${base}.dep"
-
-      echo "	@printf \"Compiling  %-35s > %s\\\\n\" ${base}.cu ${dir}/${base}_${op}_${dt}.o"
-      echo "	mkdir -p ${dir}"
-      echo "	\${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc \$< -o \$@"
-      echo ""
-      targets="$targets\t${dir}/${base}_${op}_${dt}.o \\\\\n"
-      dtn=$(($dtn + 1))
-    done
-    opn=$(($opn + 1))
-  done
-done
-echo -e "$targets"
@@ -1,62 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "devcomm.h"
-#include "collectives.h"
-#include "common_kernel.h"
-#include "common.h"
-
-namespace {
-  template<typename T, typename RedOp>
-  __device__ __forceinline__ void oneRankReduce() {
-    ncclWork *w = &ncclShmem.work;
-    int tid = threadIdx.x;
-    int tn = blockDim.x;
-    #pragma unroll 1
-    for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].isUsed; e++) {
-      ncclWorkElem *we = &w->elems[e];
-      intptr_t eltN = we->count;
-      int bid = we->bid;
-      int bn = we->nChannels;
-      T const *src = (T const*)we->sendbuff;
-      T *dst = (T*)we->recvbuff;
-
-      // each block/channel gets a roughly equal segment of 16 byte packs
-      constexpr int EltPerPack = 16/sizeof(T);
-      intptr_t packN = (eltN + EltPerPack-1) - (eltN + EltPerPack-1)%EltPerPack;
-      intptr_t i0 = (bid+0)*(packN/bn) + (bid+0 < packN%bn ? bid+0 : packN%bn);
-      intptr_t i1 = (bid+1)*(packN/bn) + (bid+1 < packN%bn ? bid+1 : packN%bn);
-      i0 *= EltPerPack;
-      i0 = i0 < eltN ? i0 : eltN;
-      i1 *= EltPerPack;
-      i1 = i1 < eltN ? i1 : eltN;
-      src += i0;
-      dst += i0;
-      void *vsrc = (void*)src;
-      void *vdst = (void*)dst;
-      reduceCopy<COLL_UNROLL, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/1>
-        (tid, tn, we->redOpArg, &(we->redOpArg), true, 1, &vsrc, 1, &vdst, i1-i0);
-    }
-  }
-}
-
-#define INSTANTIATE(devredop, type) \
-  __device__ void NCCL_ONERANK_REDUCE_NAME(devredop, type)() { \
-    oneRankReduce<type, Func##devredop<type>>(); \
-  }
-
-INSTANTIATE(PreMulSum, int8_t)
-INSTANTIATE(PreMulSum, uint8_t)
-INSTANTIATE(PreMulSum, int32_t)
-INSTANTIATE(PreMulSum, uint32_t)
-INSTANTIATE(PreMulSum, int64_t)
-INSTANTIATE(PreMulSum, uint64_t)
-INSTANTIATE(PreMulSum, half)
-#if defined(__CUDA_BF16_TYPES_EXIST__)
-INSTANTIATE(PreMulSum, __nv_bfloat16)
-#endif
-INSTANTIATE(PreMulSum, float)
-INSTANTIATE(PreMulSum, double)
@@ -1,11 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "reduce.h"
-#include "common.h"
-#include "collectives.h"
-
-IMPL_COLL_R(Reduce);
@@ -1,11 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "reduce_scatter.h"
-#include "common.h"
-#include "collectives.h"
-
-IMPL_COLL_R(ReduceScatter);
@@ -1,11 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "sendrecv.h"
-#include "common.h"
-#include "collectives.h"
-
-IMPL_COLL_P(SendRecv);
@@ -1,33 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "enqueue.h"
-#include "collectives.h"
-#include "nccl.h"
-
-NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
-  struct NvtxParamsReduce {
-    size_t bytes;
-    int root;
-    ncclRedOp_t op;
-  };
-  constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = {
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)},
-    {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
-      offsetof(NvtxParamsReduce, op)}
-  };
-  NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op};
-  NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload)
-
-  struct ncclInfo info = { ncclFuncReduce, "Reduce",
-    sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
-    REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
-  return ncclEnqueueCheck(&info);
-}
@@ -1,31 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "enqueue.h"
-#include "collectives.h"
-#include "nccl.h"
-
-NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
-ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
-  struct NvtxParamsReduceScatter {
-    size_t bytes;
-    ncclRedOp_t op;
-  };
-  constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = {
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
-    {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
-      offsetof(NvtxParamsReduceScatter, op)}
-  };
-  NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op};
-  NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload)
-
-  struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
-    sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
-    REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
-  return ncclEnqueueCheck(&info);
-}
@@ -1,52 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "enqueue.h"
-#include "collectives.h"
-#include "argcheck.h" // Need some checks here since we access comm
-
-struct NvtxParamsSendRecv {
-    size_t bytes;
-    int peer;
-};
-constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = {
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)}
-};
-
-NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
-    ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
-    ncclComm_t comm, cudaStream_t stream) {
-  NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
-  NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload)
-
-  struct ncclInfo info = { ncclFuncSend, "Send",
-    NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
-    1, 1 };
-  ncclResult_t ret;
-  NCCLCHECK(ncclGroupStart());
-  ret = ncclEnqueueCheck(&info);
-  NCCLCHECK(ncclGroupEnd());
-  return ret;
-}
-
-NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
-    ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
-    ncclComm_t comm, cudaStream_t stream) {
-  NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
-  NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload)
-
-  struct ncclInfo info = { ncclFuncRecv, "Recv",
-    NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
-    1, 1 };
-  ncclResult_t ret;
-  NCCLCHECK(ncclGroupStart());
-  ret = ncclEnqueueCheck(&info);
-  NCCLCHECK(ncclGroupEnd());
-  return ret;
-}
@@ -9,6 +9,7 @@
 #include <stdlib.h>
 #include <stdarg.h>
 #include <sys/syscall.h>
+#include "param.h"

 int ncclDebugLevel = -1;
 static int pid = -1;
@@ -25,7 +26,7 @@ static __thread int tid = -1;
 void ncclDebugInit() {
  pthread_mutex_lock(&ncclDebugLock);
  if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; }
-  const char* nccl_debug = getenv("NCCL_DEBUG");
+  const char* nccl_debug = ncclGetEnv("NCCL_DEBUG");
  int tempNcclDebugLevel = -1;
  if (nccl_debug == NULL) {
    tempNcclDebugLevel = NCCL_LOG_NONE;
@@ -45,7 +46,7 @@ void ncclDebugInit() {
   * This can be a comma separated list such as INIT,COLL
   * or ^INIT,COLL etc
   */
-  char* ncclDebugSubsysEnv = getenv("NCCL_DEBUG_SUBSYS");
+  const char* ncclDebugSubsysEnv = ncclGetEnv("NCCL_DEBUG_SUBSYS");
  if (ncclDebugSubsysEnv != NULL) {
    int invert = 0;
    if (ncclDebugSubsysEnv[0] == '^') { invert = 1; ncclDebugSubsysEnv++; }
@@ -97,7 +98,7 @@ void ncclDebugInit() {
   * then create the debug file. But don't bother unless the
   * NCCL_DEBUG level is > VERSION
   */
-  const char* ncclDebugFileEnv = getenv("NCCL_DEBUG_FILE");
+  const char* ncclDebugFileEnv = ncclGetEnv("NCCL_DEBUG_FILE");
  if (tempNcclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) {
    int c = 0;
    char debugFn[PATH_MAX+1] = "";
@@ -0,0 +1,100 @@
+#
+# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+SHELL := /usr/bin/env bash
+MAKEFLAGS += -r
+.SUFFIXES:
+.SECONDARY:
+
+NCCLDIR := ../..
+include $(NCCLDIR)/makefiles/common.mk
+include $(NCCLDIR)/makefiles/version.mk
+
+BUILDDIR ?= $(abspath ../../build)
+OBJDIR := $(BUILDDIR)/obj/device
+
+MANIFEST := $(OBJDIR)/manifest
+DEVGLUE_OBJ  := $(OBJDIR)/device_glue.o
+
+INCFLAGS  = -I. -I.. -I$(BUILDDIR)/include -I../include
+NVCUFLAGS += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden"
+CXXFLAGS  += $(INCFLAGS)
+
+SAY = @bash -c 'path="$$2"; [[ "$$(realpath "$$2")" =~ ^$(subst .,\.,$(abspath $(NCCLDIR)))/(.*)$$ ]] && path="$${BASH_REMATCH[1]}"; printf "%-15s %s\n" "$$1" "$$path"' SAY
+
+COMPILE.cu = $(NVCC) $(NVCUFLAGS) -dc $2 -o $1
+COMPILE.cc = $(CXX) $(CXXFLAGS) -c $2 -o $1
+define COMPILE
+@$(SAY) "Compiling" $2;\
+ mkdir -p $(dir $1);\
+ $(call COMPILE$(suffix $2),$1,$2)
+endef
+
+DEPENDS.cu = $(NVCC) $(NVCUFLAGS) -M -dc $1
+DEPENDS.cc = $(CXX) $(CXXFLAGS) -M -c $1
+define DEPENDS
+@$(SAY) "Dependencies" $2;\
+ mkdir -p $(dir $1);\
+ mk=$$($(call DEPENDS$(suffix $2),$2));\
+ [[ $$mk =~ ^[^:]*:(.*)$$ ]];\
+ files=$${BASH_REMATCH[1]};\
+ files=$$(for x in $$files; do case "$$x" in '\'|$$'\t') ;; *) echo "$$x"; esac; done);\
+ files=$$(for x in $$files; do [[ "$$(realpath "$$x")" == "$$(realpath "$(NCCLDIR)")"* ]] && echo "$$x"; done);\
+ echo "$(patsubst %.d,%.o,$1) $1: " $$files > $1
+endef
+
+all: $(MANIFEST)
+
+ifeq (1,1)
+# Case if the <gensrc> directory is generated on-demand:
+$(OBJDIR)/gensrc: generate.py
+	@mkdir -p $@
+	(which python3 >/dev/null || \
+	  (bar='!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'; \
+	   printf "\n$${bar}\nERROR: Building NCCL requires a Python 3 installation invokable as 'python3'.\n$${bar}\n\n" 1>&2; \
+	   exit 1)) \
+	&& ./generate.py $@ "$(ONLY_FUNCS)"
+else
+# Case if the <gensrc> directory is pre-generated and checked in the repo as ./gen:
+$(OBJDIR)/gensrc:
+	@mkdir -p $(OBJDIR); ln -srfn ./gen $@
+endif
+
+# The trailing ";" is necessary to make this an "empty recipe":
+# https://www.gnu.org/software/make/manual/html_node/Empty-Recipes.html
+$(OBJDIR)/gensrc/rules.mk: $(OBJDIR)/gensrc ;
+
+-include $(OBJDIR)/gensrc/rules.mk
+# "gensrc/rules.mk" populates $(LIB_OBJS_GEN)
+
+SRCS = common.cu onerank.cu
+
+LIB_OBJS = $(patsubst %, $(OBJDIR)/%.o, $(SRCS)) $(LIB_OBJS_GEN)
+
+$(OBJDIR)/%.o: % $(OBJDIR)/%.d
+	$(call COMPILE,$@,$<)
+
+$(OBJDIR)/genobj/%.o: $(OBJDIR)/gensrc $(OBJDIR)/genobj/%.d
+	$(call COMPILE,$@,$(OBJDIR)/gensrc/$*)
+
+$(OBJDIR)/%.d: %
+	$(call DEPENDS,$@,$<)
+
+$(OBJDIR)/genobj/%.d: $(OBJDIR)/gensrc/%
+	$(call DEPENDS,$@,$<)
+
+$(DEVGLUE_OBJ): $(LIB_OBJS)
+	$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
+
+$(MANIFEST): $(LIB_OBJS) $(DEVGLUE_OBJ)
+	@echo $^ > $@
+
+-include $(wildcard $(OBJDIR)/*.d)
+-include $(wildcard $(OBJDIR)/genobj/*.d)
+
+.PHONY: clean
+clean:
+	rm -rf $(OBJDIR)
@@ -4,7 +4,7 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "devcomm.h"
+#include "device.h"
 #include "collectives.h"
 #include "primitives.h"

@@ -108,33 +108,65 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
    const ssize_t chunkSize = int(args->lastChunkSize);
    const ssize_t size = args->count;
    const ssize_t loopSize = nChannels*chunkSize;
+    const ssize_t rank = ncclShmem.comm.rank;

-    const int nThreadsGather = 128;
-    const int nThreadsBcast = 384 + WARP_SIZE;
+    const int nThreadsBcast = args->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : 4 * WARP_SIZE;
+    const int nThreadsGather = args->regUsed ? WARP_SIZE : NCCL_MAX_NTHREADS - nThreadsBcast;
    const int tidEndGather = nThreadsGather;
    const int tidEndBcast = tidEndGather + nThreadsBcast;

-    using Proto = ProtoSimple<1, 1>;
-
-    if (tid < tidEndGather) {
-      // Gather
-      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
-        prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
-           args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0);
-      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + bid*chunkSize;
-        int nelem = min(chunkSize, size-offset);
-        prims.gather(offset, nvls->nHeads*size, nelem, size, -1, 0);
+    if (!args->regUsed) {
+      if (tid < tidEndGather) {
+        // Gather
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
+        Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
+          prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
+            args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          ssize_t offset = gridOffset + bid * chunkSize;
+          int nelem = min(chunkSize, size - offset);
+          prims.gather(offset, nvls->nHeads * size, nelem, size, -1, 0);
+        }
+      } else if (tid < tidEndBcast) {
+        // Bcast through NVLS
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
+        Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
+          prims(tid - tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL,
+            args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          ssize_t offset = gridOffset + bid * chunkSize;
+          int nelem = min(chunkSize, size - offset);
+          prims.send(offset, nelem);
+        }
      }
-    } else if (tid < tidEndBcast) {
-      // Bcast through NVLS
-      Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
-        prims(tid-tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL,
-           args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
-      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + bid*chunkSize;
-        int nelem = min(chunkSize, size-offset);
-        prims.send(offset, nelem);
+    } else {
+      /* direct allgather */
+      if (tid < tidEndGather) {
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
+        Primitives<T, RedOp, FanSymmetric<NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
+          prims(tid, nThreadsGather, nvls->up, nvls->up, NULL, NULL,
+            args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
+
+        /* used as sync */
+        prims.scatter(0, 0, 0, 0, -1, 0);
+
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          prims.gather(0, 0, 0, 0, -1, 0);
+        }
+      } else if (tid < tidEndBcast) {
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
+        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
+          prims(tid - tidEndGather, nThreadsBcast, &nvls->down, &nvls->down, args->sendbuff, NULL,
+            args->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0, args);
+        /* used as sync */
+        prims.recv(0, 0);
+
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          ssize_t inpOffset = gridOffset + bid * chunkSize;
+          ssize_t outOffset = inpOffset + rank * size;
+          int nelem = min(chunkSize, size - inpOffset);
+          prims.directSend(inpOffset, outOffset, nelem);
+        }
      }
    }
  }
@@ -4,7 +4,7 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "devcomm.h"
+#include "device.h"
 #include "collectives.h"
 #include "primitives.h"

@@ -377,7 +377,6 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
  __device__ __forceinline__ void run(ncclWorkElem *args) {
-  #if NCCL_NVLS_ENABLED
    const int tid = threadIdx.x;
    const int bid = args->bid;
    const int nChannels = args->nChannels;
@@ -387,10 +386,11 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
    const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize;
    const int nranks = ncclShmem.comm.nRanks;
    const bool hasOut = nvls->out != -1;
-    const int reduceWarps = hasOut ? 3 : nranks <= 6 ? 7 : 5;
-    const int bcastWarps = hasOut ? 2 : 0;
-    const int scatterWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps + 1)/2;
-    const int gatherWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps)/2;
+    const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE;
+    const int bcastWarps = hasOut ? (args->regUsed ? ((totalWarps - 2) >> 1) - 1 : 2) : 0;
+    const int reduceWarps = args->regUsed ? (totalWarps - bcastWarps - 2) : (hasOut ? 3 : nranks <= 6 ? 7 : 5);
+    const int scatterWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1;
+    const int gatherWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1;

    const int nThreadsScatter = scatterWarps*WARP_SIZE;
    const int nThreadsGather  = gatherWarps*WARP_SIZE;
@@ -406,67 +406,65 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
      using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
        prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
-           args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
+          args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
-        int nelem = min(nvls->nHeads*chunkSize, size-offset);
+        ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
+        int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
        prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
      }
    } else if (tid < tidEndGather) {
      // Gather
      using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
-        prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
-           args->redOpArg, 1*Proto::MaxGroupWidth, 1, 1);
+        prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
+          args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
-        int nelem = min(nvls->nHeads*chunkSize, size-offset);
+        ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
+        int nelem = args->regUsed ? 0 :min(nvls->nHeads * chunkSize, size - offset);
        prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
      }
    } else if (tid < tidEndReduce && nvls->headRank != -1) {
      if (!hasOut) {
        // Reduce, broadcast through NVLS
        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
-        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
-          prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
-             args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
+        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
+          prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
+            args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
-          int nelem = min(chunkSize, size-offset);
-          prims.recvSend(nelem);
+          ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
+          int nelem = min(chunkSize, size - offset);
+          prims.directRecvDirectSend(offset, offset, nelem);
        }
      } else {
        // Reduce, send to network
        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
-        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
-          prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL,
-             args->redOpArg, 2*Proto::MaxGroupWidth, 0, 1);
+        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
+          prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL,
+            args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, args);
        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
-          int nelem = min(chunkSize, size-offset);
-          prims.recvSend(nelem);
+          ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
+          int nelem = min(chunkSize, size - offset);
+          prims.directRecvDirectSend(offset, offset, nelem);
        }
      }
    } else if (tid < tidEndBcast && nvls->headRank != -1) {
      // Recv from network, broadcast
      using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
-      Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
-        prims(tid-tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL,
-           args->redOpArg, 3*Proto::MaxGroupWidth, 0, 0);
+      Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
+        prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL,
+          args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
-        int nelem = min(chunkSize, size-offset);
-        prims.recvSend(nelem);
+        ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
+        int nelem = min(chunkSize, size - offset);
+        prims.directRecvDirectSend(offset, offset, nelem);
      }
    }
-  #endif // NCCL_NVLS_ENABLED
  }
 };

 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PROTO_SIMPLE> {
  __device__ __forceinline__ void run(ncclWorkElem *args) {
-  #if NCCL_NVLS_ENABLED
    const int tid = threadIdx.x;
    const int bid = args->bid;
    const int nChannels = args->nChannels;
@@ -478,10 +476,11 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
    const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize;
    const int nranks = ncclShmem.comm.nRanks;
    const bool hasUp = treeUp != -1;
-    const int reduceWarps = hasUp ? 5 : nranks <= 6 ? 7 : 5;
-    const int bcastWarps = hasUp ? 4 : 0;
-    const int scatterWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps + 1)/2;
-    const int gatherWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps)/2;
+    const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE;
+    const int bcastWarps = hasUp ? (args->regUsed ? ((totalWarps - 2) >> 1) - 1 : 4) : 0;
+    const int reduceWarps = args->regUsed ? (totalWarps - bcastWarps - 2) : (hasUp ? 5 : nranks <= 6 ? 7 : 5);
+    const int scatterWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1;
+    const int gatherWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1;

    const int nThreadsScatter = scatterWarps*WARP_SIZE;
    const int nThreadsGather  = gatherWarps*WARP_SIZE;
@@ -497,60 +496,59 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
      using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
        prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
-           args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
+          args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
-        int nelem = min(nvls->nHeads*chunkSize, size-offset);
+        ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
+        int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
        prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
      }
    } else if (tid < tidEndGather) {
      // Gather
      using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
-        prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
-           args->redOpArg, 1*Proto::MaxGroupWidth, 1, 1);
+        prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
+          args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
-        int nelem = min(nvls->nHeads*chunkSize, size-offset);
+        ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
+        int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
        prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
      }
    } else if (tid < tidEndReduce && nvls->headRank != -1) {
      if (!hasUp) {
        // Reduce and Broadcast
        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
-        Primitives<T, RedOp, FanSymmetric<3>, /*Direct=*/0, Proto, 0>
-          prims(tid-tidEndGather, nThreadsReduce, treeDown, treeDown, NULL, NULL,
-             args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
+        Primitives<T, RedOp, FanSymmetric<3>, /*Direct=*/1, Proto, 0>
+          prims(tid - tidEndGather, nThreadsReduce, treeDown, treeDown, NULL, NULL,
+            args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
-          int nelem = min(chunkSize, size-offset);
-          prims.recvSend(nelem);
+          ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
+          int nelem = min(chunkSize, size - offset);
+          prims.directRecvDirectSend(offset, offset, nelem);
        }
      } else {
        // Reduce, send to network
        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
-        Primitives<T, RedOp, FanAsymmetric<3, 1>, /*Direct=*/0, Proto, 0>
-          prims(tid-tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL,
-              args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
+        Primitives<T, RedOp, FanAsymmetric<3, 1>, /*Direct=*/1, Proto, 0>
+          prims(tid - tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL,
+            args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-          ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
-          int nelem = min(chunkSize, size-offset);
-          prims.recvSend(nelem);
+          ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
+          int nelem = min(chunkSize, size - offset);
+          prims.directRecvDirectSend(offset, offset, nelem);
        }
      }
    } else if (tid < tidEndBcast && nvls->headRank != -1) {
      // Recv from network, broadcast
      using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
-      Primitives<T, RedOp, FanAsymmetric<1, 3>, /*Direct=*/0, Proto, 0>
-        prims(tid-tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL,
-           args->redOpArg, 3*Proto::MaxGroupWidth, 0, 0);
+      Primitives<T, RedOp, FanAsymmetric<1, 3>, /*Direct=*/1, Proto, 0>
+        prims(tid - tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL,
+          args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
-        int nelem = min(chunkSize, size-offset);
-        prims.recvSend(nelem);
+        ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
+        int nelem = min(chunkSize, size - offset);
+        prims.directRecvDirectSend(offset, offset, nelem);
      }
    }
-  #endif // NCCL_NVLS_ENABLED
  }
 };

@@ -4,7 +4,7 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "devcomm.h"
+#include "device.h"
 #include "collectives.h"
 #include "primitives.h"

@@ -0,0 +1,24 @@
+/*************************************************************************
+ * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "device.h"
+#include "collectives.h"
+#include "common.h"
+
+__shared__ ncclShmemData ncclShmem;
+#if __CUDA_ARCH__ < 700
+  __shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)];
+#endif
+
+struct RunWorkNop {  // Work runner whose run() has an empty body.
+  __device__ void run(ncclWork *w) {}  // intentionally does nothing with `w`
+};
+
+__global__ void ncclDevKernel_Generic(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) {  // non-specialized kernel entry point
+  ncclKernelMain<-1, RunWorkNop>(comm, channelMask, workHead);  // -1 fails ncclKernelMain's `0 <= SpecializedFnId` test, so RunWorkNop::run is never selected
+}
+
+__device__ void ncclDevFunc_Nop() {}  // no-op device function (empty body)
@@ -8,19 +8,23 @@
 #define NCCL_DEVICE_COMMON_H_

 #include "collectives.h"
-#include "devcomm.h"
+#include "device.h"
 #include "op128.h"
+#include "network/unpack/unpack_defs.h"

 #define COLL_UNROLL (ncclCollUnroll())

-typedef void(*ncclKern_t)();
-extern __device__ ncclKern_t ncclFuncs[];
+typedef void(*ncclDevFuncPtr_t)();
+extern __device__ ncclDevFuncPtr_t const ncclDevFuncTable[];

 struct ncclShmemGroup {
  ncclConnInfo *recvConns[NCCL_MAX_NVLS_ARITY];
  ncclConnInfo *sendConns[NCCL_MAX_NVLS_ARITY];
  void* srcs[NCCL_MAX_NVLS_ARITY+1];
  void* dsts[NCCL_MAX_NVLS_ARITY+1];
+  union {
+    unpackGroupShmem unpack;
+  } devicePlugin;
 };

 struct ncclShmemData {
@@ -31,6 +35,9 @@ struct ncclShmemData {
  alignas(16) struct ncclDevComm comm;
  alignas(16) struct ncclDevChannel channel;
  alignas(16) struct ncclWork work;
+  alignas(16) union {
+    unpackShmem unpack;
+  } devicePlugin;
 };
 static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned");

@@ -111,10 +118,8 @@ static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
  }
 }

-template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto, int FnIndex>
-__device__ void ncclKernel(
-    struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead
-  )  {
+template<int SpecializedFnId, typename SpecializedRunWork>
+__device__ void ncclKernelMain(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) {
  int tid = threadIdx.x;

  // To map blockId to channelId, we need the n'th set bit of channelMask which
@@ -166,7 +171,7 @@ __device__ void ncclKernel(
      bytes = 0;
      break;
    }
-    copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
+    if (bytes) copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
  }
  __syncthreads(); // publish ncclShmem

@@ -184,10 +189,10 @@ __device__ void ncclKernel(
    }
    __syncthreads();

-    if (ncclShmem.work.header.funcIndex == FnIndex) {
-      RunWork<Fn, T, RedOp, Algo, Proto>().run(&ncclShmem.work);
+    if (0 <= SpecializedFnId && ncclShmem.work.header.funcIndex == (unsigned)SpecializedFnId) {
+      SpecializedRunWork().run(&ncclShmem.work);
    } else {
-      ncclFuncs[ncclShmem.work.header.funcIndex]();
+      ncclDevFuncTable[ncclShmem.work.header.funcIndex]();
    }

    int workIxNext = ncclShmem.work.header.workNext;
@@ -204,94 +209,17 @@ __device__ void ncclKernel(
  }
 }

-// Only generate kernels for SUM
-#if NCCL_OP == 0
-#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \
-__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)( \
-    struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead \
-  ) { \
-  ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex> \
-    (comm, channelMask, workHead); \
-}
-#else
-#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fInded)
-#endif
+__global__ void ncclDevKernel_Generic(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead);
+__device__ void ncclDevFunc_Nop();

-// Examples :     AllReduce, RING, LL,    Sum,   uint8
-#define IMPL_COLL_FUNC(func, algo, proto, devredop, type) \
-__device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \
-  RunWork<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem.work); \
-}
+#define DEFINE_ncclDevKernel(suffix, coll, redop, ty, algo, proto, specializedFnId) /* defines kernel ncclDevKernel_<suffix> */ \
+  __global__ void ncclDevKernel_##suffix(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
+    /* forwards to ncclKernelMain with RunWork<coll, ty, redop<ty>, algo, proto> as the specialized runner */ ncclKernelMain<specializedFnId, RunWork<coll, ty, redop<ty>, algo, proto>>(comm, channelMask, workHead); \
+  }

-// Only generate inline kernels for LL
-#define IMPL_COLL4(func, algo, devredop, type, ncclType) \
-  IMPL_COLL_FUNC(func, algo, LL,     devredop, type) \
-  IMPL_COLL_FUNC(func, algo, LL128,  devredop, type) \
-  IMPL_COLL_FUNC(func, algo, SIMPLE, devredop, type) \
-  IMPL_COLL_KERN(func, algo, LL,     devredop, type, FUNC_INDEX(ncclFunc##func, ncclDev##devredop, ncclType, NCCL_ALGO_##algo, NCCL_PROTO_LL)) \
-
-#define IMPL_COLL3(func, devredop, type, ncclType) \
-  IMPL_COLL4(func, TREE,    devredop, type, ncclType) \
-  IMPL_COLL4(func, RING,    devredop, type, ncclType) \
-  IMPL_COLL4(func, COLLNET_DIRECT, devredop, type, ncclType) \
-  IMPL_COLL4(func, COLLNET_CHAIN, devredop, type, ncclType) \
-  IMPL_COLL4(func, NVLS, devredop, type, ncclType) \
-  IMPL_COLL4(func, NVLS_TREE, devredop, type, ncclType)
-
-#if NCCL_TYPE == 0
-#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, int8_t,   ncclInt8)
-#elif NCCL_TYPE == 1
-#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, uint8_t,  ncclUint8)
-#elif NCCL_TYPE == 2
-#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, int32_t,  ncclInt32)
-#elif NCCL_TYPE == 3
-#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, uint32_t, ncclUint32)
-#elif NCCL_TYPE == 4
-#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, int64_t,  ncclInt64)
-#elif NCCL_TYPE == 5
-#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, uint64_t, ncclUint64)
-#elif NCCL_TYPE == 6
-#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, half,     ncclFloat16)
-#elif NCCL_TYPE == 7
-#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, float,    ncclFloat32)
-#elif NCCL_TYPE == 8
-#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, double,   ncclFloat64)
-#elif NCCL_TYPE == 9 && defined(__CUDA_BF16_TYPES_EXIST__)
-#define IMPL_COLL2(func, devredop) IMPL_COLL3(func, devredop, __nv_bfloat16, ncclBfloat16)
-#endif
-
-// Reduction define all functions
-#if NCCL_OP == 0
-#define IMPL_COLL_R(func) IMPL_COLL2(func, Sum);
-#elif NCCL_OP == 1
-#define IMPL_COLL_R(func) IMPL_COLL2(func, Prod);
-#elif NCCL_OP == 2
-#define IMPL_COLL_R(func) IMPL_COLL2(func, Min);
-#elif NCCL_OP == 3
-#define IMPL_COLL_R(func) IMPL_COLL2(func, Max);
-#elif NCCL_OP == 4
-#define IMPL_COLL_R(func) IMPL_COLL2(func, PreMulSum);
-#elif NCCL_OP == 5
-  #if NCCL_TYPE < 6
-    #define IMPL_COLL_R(func) IMPL_COLL2(func, SumPostDiv);
-  #else
-    #define IMPL_COLL_R(func) // skip SumPostDiv for floating point
-  #endif
-#endif
-
-#if NCCL_OP == 0 && NCCL_TYPE == 0
-// Copy primitives only define one function for copy
-#define IMPL_COLL_C(func) IMPL_COLL3(func, Sum, int8_t, ncclInt8);
-
-// Point-to-point primitives only have one function/kernel.
-#define IMPL_COLL_P(func) \
-  IMPL_COLL_FUNC(func, RING, SIMPLE, Sum, int8_t); \
-  IMPL_COLL_KERN(func, RING, SIMPLE, Sum, int8_t, FUNC_INDEX_P2P);
-#else
-#define IMPL_COLL_C(func)
-#define IMPL_COLL_P(func)
-#endif
-
-#define NCCL_NVLS_ENABLED (__CUDA_ARCH__ >= 900 && NCCL_NVLS_SUPPORTS(NCCL_TYPE, NCCL_OP))
+#define DEFINE_ncclDevFunc(suffix, coll, redop, ty, algo, proto) /* defines device function ncclDevFunc_<suffix> */ \
+  __device__ void ncclDevFunc_##suffix() { \
+    /* runs the matching RunWork specialization on ncclShmem.work */ RunWork<coll, ty, redop<ty>, algo, proto>().run(&ncclShmem.work); \
+  }

 #endif
@@ -7,7 +7,7 @@
 #ifndef NCCL_COMMON_KERNEL_H_
 #define NCCL_COMMON_KERNEL_H_

-#include "devcomm.h"
+#include "device.h"
 #include "op128.h"
 #include "reduce_kernel.h"
 #include <cstdio>
@@ -81,13 +81,13 @@ __device__ __forceinline__ void reduceCopyPacks(
      for (int u=0; u < Unroll; u++) {
        if (0 < MultimemSrcs) {
          // applyLoadMultimem uses relaxed semantics for same reason we use volatile below.
-          acc[u] = applyLoadMultimem<RedFn, BytePerPack>(preFn, minSrcs[0]);
+          acc[u] = applyLoadMultimem<RedFn, BytePerPack>(redFn, minSrcs[0]);
        } else {
          // Use volatile loads in case credits are polled for with volatile (instead of acquire).
          acc[u] = ld_volatile_global<BytePerPack>(minSrcs[0]);
+          if (0 < PreOpSrcs) acc[u] = applyPreOp(preFn, acc[u]);
        }
        minSrcs[0] += WARP_SIZE*BytePerPack;
-        if (0 < PreOpSrcs) acc[u] = applyPreOp(preFn, acc[u]);
      }
    }

@@ -99,7 +99,7 @@ __device__ __forceinline__ void reduceCopyPacks(
      for (int u=0; u < Unroll; u++) {
        if (s < MultimemSrcs) {
          // applyLoadMultimem uses relaxed semantics for same reason we use volatile below.
-          acc[u] = applyLoadMultimem<RedFn, BytePerPack>(preFn, minSrcs[s]);
+          acc[u] = applyLoadMultimem<RedFn, BytePerPack>(redFn, minSrcs[s]);
        } else {
          // Use volatile loads in case credits are polled for with volatile (instead of acquire).
          tmp[u] = ld_volatile_global<BytePerPack>(minSrcs[s]);
@@ -0,0 +1,405 @@
+#!/usr/bin/env python3
+import os
+import sys
+
+# Order of redops, tys, protos, algos must match src/include/device.h
+all_colls =  ["Broadcast","Reduce","AllGather","ReduceScatter","AllReduce","SendRecv"]
+all_redops = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"]
+all_tys =    ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16"]
+all_protos = ["LL","LL128","SIMPLE"]
+all_algos =  ["TREE","RING","COLLNET_DIRECT","COLLNET_CHAIN","NVLS","NVLS_TREE"]
+
+################################################################################
+# The first command line argument is the path to the directory to generate and
+# populate.
+
+gensrc = sys.argv[1]
+
+if os.path.exists(gensrc):
+  for name in os.listdir(gensrc):
+    os.remove(os.path.join(gensrc, name))
+    #os.truncate(os.path.join(gensrc, name), 0)
+else:
+  os.mkdir(gensrc)
+
+################################################################################
+# The second command line argument is used as a regex to filter the functions
+# which make it into libnccl. This is helpful for reducing the binary when
+# developing device code. The regex supports non-space containing globs '*',
+# parentheses '(x)', and union 'a|b'. The string representing the function has
+# one of the forms:
+#
+# SendRecv
+# (AllGather|Broadcast) <algo> <proto>
+# (AllReduce|Reduce|ReduceScatter) <redop> <type> <algo> <proto>
+#
+# The possible values for redop, type, algo, proto can be found in the all_<foo>
+# lists at the top of this file.
+#
+# Since the Makefile forwards this from the ONLY_FUNCS variable, useful command
+# line examples are given:
+"""
+# Only send/recv:
+make ONLY_FUNCS="SendRecv"
+
+# Only non-reductions:
+make ONLY_FUNCS="AllGather * *|Broadcast * *|SendRecv"
+
+# Only AllReduce sum f32 (but all algos, protos)
+make ONLY_FUNCS="AllReduce Sum f32 * *"
+
+# Only AllReduce minmax i32 NVLS (but all protos)
+make ONLY_FUNCS="AllReduce MinMax i32 NVLS *"
+
+# Only AllReduce sum f32 RING LL128
+make ONLY_FUNCS="AllReduce Sum f32 RING LL128"
+"""
+
+# Join all non-None arguments into one string separated by `sep`; None arguments are skipped.
+def paste(sep, *args):
+  return sep.join(x for x in args if x is not None)
+
+func_pattern = sys.argv[2:3]
+if func_pattern and func_pattern[0]:
+  import re
+  func_pattern = func_pattern[0]
+  func_pattern = func_pattern.replace("*", "[^ ]*")
+  func_pattern += "$"
+  def func_filter(*fn):
+    return None is not re.match(func_pattern, paste(" ", *fn), flags=re.IGNORECASE)
+else:
+  def func_filter(coll, redop, ty, algo, proto):
+    return True
+
+################################################################################
+
+algos_of_coll = {  # algorithms enumerated for each collective; SendRecv has a single entry with no algo
+  "AllGather":     ["RING","NVLS"],
+  "AllReduce":     all_algos,
+  "Broadcast":     ["RING"],
+  "Reduce":        ["RING"],
+  "ReduceScatter": ["RING","NVLS"],
+  "SendRecv":      [None]
+}
+
+coll_camel_to_lower = {  # CamelCase collective name -> snake_case name
+  "AllGather":     "all_gather",
+  "AllReduce":     "all_reduce",
+  "Broadcast":     "broadcast",
+  "Reduce":        "reduce",
+  "ReduceScatter": "reduce_scatter",
+  "SendRecv":      "sendrecv"
+}
+coll_lower_to_camel = {coll_camel_to_lower[x]: x for x in coll_camel_to_lower}  # inverse of coll_camel_to_lower
+
+################################################################################
+
+# Returns the pair of minimum required values (CUDART_VERSION, __CUDA_ARCH__),
+# or None if the function is never supported. (0, 0) encodes universal support;
+# being a non-empty tuple it is still truthy, unlike None.
+def required_cuda(coll, redop, ty, algo, proto):
+  cudart, arch = 0, 0
+  # kernels mapped to by coll="Nop" functions have coll="Generic"
+  if coll in ("SendRecv", "Generic", "Nop"): return (cudart, arch)
+
+  if proto!="SIMPLE" and algo not in ("RING","TREE"): return None  # LL/LL128 are only supported with RING and TREE
+
+  if coll in ("AllReduce","Reduce","ReduceScatter"):
+    if redop=="SumPostDiv" and ty[0] not in ("i","u"): return None  # SumPostDiv only for types named i*/u*
+    if ty=="bf16": cudart = max(cudart, 11000)
+
+  if "NVLS" in algo:  # substring test: covers both NVLS and NVLS_TREE
+    if coll in ("AllReduce","Reduce","ReduceScatter"):
+      # Must match ncclNvlsSupported() in src/include/device.h
+      nvls_ok = ((ty in ("i32","u32","i64","u64") and redop in ("Sum","MinMax")) or
+                 (ty in ("f32","f64") and redop=="Sum") or
+                 (ty in ("f16","bf16") and redop in ("Sum","MinMax")))
+      if not nvls_ok: return None
+    cudart = max(cudart, 12010)
+    arch = max(arch, 900)
+
+  return (cudart, arch)
+
+# Maps a function to the chosen representative of the equivalence class it
+# belongs to. For instance (sum, signed int) maps to (sum, unsigned int).
+def equivalent_primary(coll, redop, ty, algo, proto):
+  if coll in ("AllReduce", "Reduce", "ReduceScatter"):
+    # map signed integer Sum/Prod/PreMulSum to the unsigned type of the same width
+    if redop in ("Sum","Prod","PreMulSum") and ty[0]=="i":
+      return (coll, redop, "u"+ty[1:], algo, proto)
+    # map signed integer MinMax to unsigned for non-NVLS; NVLS algos keep the signed type
+    if redop=="MinMax" and ty[0]=="i" and ("NVLS" not in algo):
+      return (coll, redop, "u"+ty[1:], algo, proto)
+  return (coll, redop, ty, algo, proto)
+
+# Map to another func representing the best kernel to use. Every distinct value
+# returned will instantiate a ncclDevKernel specialized to run this func
+# without function call overhead.
+def best_kernel(coll, redop, ty, algo, proto):
+  def best(coll, redop, ty, algo, proto):
+    # Modify this logic to control how many kernels are specialized.
+    if coll=="Nop": return ("Generic", None, None, None, None)
+    if coll=="SendRecv": return ("SendRecv", None, None, None, None)
+    if coll in ("AllGather","Broadcast"): return (coll, None, None, "RING", "LL")
+    return (coll, "Sum", ty, ("TREE" if algo=="TREE" else "RING"), "LL")
+  # Need to ensure the kernel is specialized for a primary function
+  kfn = equivalent_primary(*best(coll, redop, ty, algo, proto))
+  # And isn't filtered out; otherwise fall back to the "Generic" kernel.
+  if not func_filter(*kfn): return ("Generic", None, None, None, None)
+  return kfn
+
+# The order in which rows are enumerated must match the formula of `ncclDevFuncId()`:
+def enumerate_func_rows():
+  yield ("SendRecv", None, None, None, None)
+  for coll in ("AllGather", "Broadcast"):  # collectives without redop/type
+    algos = algos_of_coll[coll]
+    for algo in algos:
+      for proto in all_protos:
+        yield (coll, None, None, algo, proto)
+  for coll in ("AllReduce", "Reduce", "ReduceScatter"):  # reductions: redop, type, algo, proto (outermost to innermost)
+    algos = algos_of_coll[coll]
+    for redop in all_redops:
+      for ty in all_tys:
+        for algo in algos:
+          for proto in all_protos:
+            yield (coll, redop, ty, algo, proto)
+
+################################################################################
+
+def is_built(coll, redop, ty, algo, proto):  # truthy iff required_cuda() is not None and func_filter() keeps the func
+  built = required_cuda(coll, redop, ty, algo, proto)
+  built = built and func_filter(coll, redop, ty, algo, proto)
+  return built
+
+# Returns None if required_cuda(...) is None (function never supported).
+# Returns the coll="Nop" function if it is supported but the developer filtered it out.
+# Otherwise just returns the func it was given.
+def validate(coll, redop, ty, algo, proto):
+  valid = required_cuda(coll, redop, ty, algo, proto)  # any tuple, even (0, 0), is truthy; None is not
+  built = valid and func_filter(coll, redop, ty, algo, proto)
+  if built: return (coll, redop, ty, algo, proto)
+  if valid: return ("Nop", None, None, None, None)
+  return None
+
+# One validated entry per enumerated row (None if unsupported). Corresponds to ncclDevFuncRowToId[]
+func_rows = [validate(*fn) for fn in enumerate_func_rows()]
+
+# Sorted, de-duplicated primary functions. Corresponds to ncclDevFuncTable[]
+primary_funcs = sorted(set(equivalent_primary(*fn) for fn in func_rows if fn is not None))
+
+# primary_to_index[primary_funcs[i]] == i
+primary_to_index = {fn: i for (i,fn) in zip(range(len(primary_funcs)), primary_funcs)}
+
+kernel_funcs = sorted(set(best_kernel(*fn) for fn in primary_funcs))  # distinct kernels chosen for the primary functions
+
+################################################################################
+
+# Generate <gensrc>/device_table.cu: forward declarations followed by ncclDevFuncTable[]
+with open(os.path.join(gensrc, "device_table.cu"), "w") as f:
+  out = f.write
+  out('#include "common.h"\n')
+  out("\n")
+
+  for fn in primary_funcs:  # forward-declare each primary function, guarded by its CUDA requirements
+    sym = paste("_", "ncclDevFunc", *fn)
+    cudart, arch = required_cuda(*fn)
+    if (cudart, arch) != (0, 0):
+      out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch))
+    out("__device__ void %s();\n" % sym)
+    if (cudart, arch) != (0, 0):
+      out("#endif\n")
+  out("\n")
+
+  out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable[] = {\n");
+  index = 0
+  for fn in primary_funcs:  # one table slot per primary function, in primary_funcs order
+    sym = paste("_", "ncclDevFunc", *fn)
+    cudart, arch = required_cuda(*fn)
+    if (cudart, arch) != (0, 0):
+      out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart ,arch))
+    out("/*%4d*/ %s,\n" % (index, sym))
+    if (cudart, arch) != (0, 0):
+      out("#else\n" "/*%4d*/ nullptr,\n" "#endif\n" % index)  # nullptr placeholder keeps the slot when the guard fails
+    index += 1
+  out("nullptr};\n")
+  out("\n")
+
+  out("// Workaround for https://reviews.llvm.org/D55580\n"
+      "__device__ void ncclWorkaroundClangD55580() {}\n")
+
+# Generate <gensrc>/host_table.cc: row-to-id map, kernel list, and per-function kernel tables
+with open(os.path.join(gensrc, "host_table.cc"), "w") as f:
+  out = f.write
+  out('#include "device.h"\n')
+  out("\n")
+
+  # The mapping from function rows to valid primary function ids.
+  out("extern int const ncclDevFuncRowToId[] = {\n")
+  index = 0
+  for fn in func_rows:
+    fn_id, comment = -1, ""  # -1 is written for rows whose func is None
+    if fn is not None:
+      fn_id = primary_to_index[equivalent_primary(*fn)]
+      comment = " // " + paste(" ", *fn)
+    out("/*%4d*/ %d,%s\n" % (index, fn_id, comment))
+    index += 1
+  out("-1};\n")
+  out("\n")
+
+  # Forward declarations of kernels (guarded by CUDART_VERSION only; the arch requirement is discarded here).
+  for kfn in kernel_funcs:
+    cudart, _ = required_cuda(*kfn)
+    sym = paste("_", "ncclDevKernel", *kfn)
+    if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart)
+    out("__global__ void %s(struct ncclDevComm*, uint64_t, struct ncclWork*);\n" % sym)
+    if cudart != 0: out("#endif\n")
+  out("\n")
+
+  # List of all kernel function pointers, with nullptr where the CUDART guard fails.
+  out("extern int const ncclDevKernelCount = %d;\n" % len(kernel_funcs))
+  out("extern void* const ncclDevKernelList[] = {\n")
+  index = 0
+  for kfn in kernel_funcs:
+    cudart, _ = required_cuda(*kfn)
+    sym = paste("_", "ncclDevKernel", *kfn)
+    if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart)
+    out("/*%4d*/ (void*)%s,\n" % (index, sym));
+    if cudart != 0: out("#else\n" "/*%4d*/ nullptr,\n" "#endif\n" % index)
+    index += 1
+  out("nullptr};\n")
+  out("\n")
+
+  # Maps primary id to the kernel function pointer chosen by best_kernel().
+  out("extern void* const ncclDevKernelForFunc[] = {\n")
+  index = 0
+  for fn in primary_funcs:
+    kfn = best_kernel(*fn)
+    sym = paste("_", "ncclDevKernel", *kfn)
+    cudart, _ = required_cuda(*kfn)
+    if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart)
+    out("/*%4d*/ (void*)%s,\n" % (index, sym))
+    if cudart != 0: out("#else\n" "/*%4d*/ nullptr,\n" "#endif\n" % index)
+    index += 1
+  out("nullptr};\n")
+  out("\n")
+
+  # Whether the kernel in the prior map is specialized for exactly this function (fn == its best kernel).
+  out("extern bool const ncclDevKernelForFuncIsSpecialized[] = {\n")
+  index = 0
+  for fn in primary_funcs:
+    kfn = best_kernel(*fn)
+    specialized = "1" if fn == kfn else "0"
+    out("/*%4d*/ %s,\n" % (index, specialized))
+    index += 1
+  out("0};\n")
+
+# Maps to .cu filename which implements this func. The only constraint is that
+# "coll" is reflected in the name: formally that no two funcs having different
+# coll's map to the same filename.
+def impl_filename(coll, redop, ty, algo, proto):
+  return "%s.cu" % paste("_", coll_camel_to_lower[coll], redop and redop.lower(), ty)
+
+# Partition the functions and kernels to the .cu filenames. The partition is
+# a dictionary mapping filename to (coll, func-tuple list)
+def partition_by_name(fns):
+  ans = {}
+  for fn in fns:
+    name = impl_filename(*fn)
+    coll = fn[0]
+    if name not in ans:
+      ans[name] = (coll, [])
+    ans[name][1].append(fn)
+  return ans
+
+name_to_funcs = partition_by_name(fn for fn in primary_funcs if fn[0]!="Nop")
+name_to_kernels = partition_by_name(kfn for kfn in kernel_funcs if kfn[0]!="Generic")
+
+# Generate <gensrc>/rules.mk
+with open(os.path.join(gensrc, "rules.mk"), "w") as f:
+  out = f.write
+  impl_names = sorted(name_to_funcs.keys())
+  names = impl_names + ["host_table.cc", "device_table.cu"]
+  out("LIB_OBJS_GEN = $(patsubst %, $(OBJDIR)/genobj/%.o, {names})\n"
+      .format(names=" ".join(names)))
+  out("\n")
+
+  # For each <coll>_<op>_<ty>.cu compile to a .cu.o file. Notice the dependencies
+  # come from the suffix-erased file (e.g. 'gensrc/all_reduce.cu')
+  for name in impl_names:
+    coll = name_to_funcs[name][0]
+    out(
+      "$(OBJDIR)/genobj/{name}.o: $(OBJDIR)/gensrc $(OBJDIR)/genobj/{lower_coll}.cu.d\n"
+      "\t" "$(call COMPILE,$@,$(OBJDIR)/gensrc/{name})\n"
+      "\n"
+      .format(name=name, lower_coll=coll_camel_to_lower[coll])
+    )
+
+# Add the suffix-erased .cu's which are used only for dependency scraping.
+for coll in set(coll for (coll,_,_,_,_) in primary_funcs if coll!="Nop"):
+  name = impl_filename(coll, None, None, None, None)
+  if name not in name_to_funcs:
+    name_to_funcs[name] = (coll, [])
+
+redop_to_cxx = {
+  None: "FuncCopy",
+  "Sum": "FuncSum",
+  "Prod": "FuncProd",
+  "MinMax": "FuncMinMax",
+  "PreMulSum": "FuncPreMulSum",
+  "SumPostDiv": "FuncSumPostDiv"
+}
+
+ty_to_cxx = {
+  None: "int8_t",
+  "i8": "int8_t",
+  "u8": "uint8_t",
+  "i32": "int32_t",
+  "u32": "uint32_t",
+  "i64": "int64_t",
+  "u64": "uint64_t",
+  "f16": "half",
+  "f32": "float",
+  "f64": "double",
+  "bf16": "__nv_bfloat16"
+}
+
+# Generate each <gensrc>/<impl>.cu:
+for name in name_to_funcs.keys():
+  (coll, fns) = name_to_funcs[name]
+  with open(os.path.join(gensrc, name), "w") as f:
+    out = f.write
+    out(
+      '#include "common.h"\n'
+      '#include "{lower_coll}.h"\n'
+      .format(lower_coll=coll_camel_to_lower[coll])
+    )
+
+    (_, kfns) = name_to_kernels.get(name) or (None, [])
+    for kfn in kfns:
+      (coll, redop, ty, algo, proto) = kfn
+      sym = paste("_", coll, redop, ty, algo, proto)
+      fn_id = primary_to_index[kfn]
+      cudart, arch = required_cuda(*kfn)
+      if (cudart, arch) != (0, 0):
+        out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch))
+      out(
+        "DEFINE_ncclDevKernel({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto}, {fn_id})\n"
+        .format(sym=sym, coll=coll, redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty],
+                algo=(algo or "RING"), proto=(proto or "SIMPLE"), fn_id=fn_id)
+      )
+      if (cudart, arch) != (0, 0):
+        out("#endif\n")
+
+    for fn in fns:
+      (coll, redop, ty, algo, proto) = fn
+      sym = paste("_", coll, redop, ty, algo, proto)
+      cudart, arch = required_cuda(*fn)
+      if (cudart, arch) != (0, 0):
+        out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch))
+      out(
+        "DEFINE_ncclDevFunc({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto})\n"
+        .format(sym=sym, coll=coll, redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty],
+                algo=(algo or "RING"), proto=(proto or "SIMPLE"))
+      )
+      if (cudart, arch) != (0, 0):
+        out("#endif\n")
@@ -0,0 +1,280 @@
+/*************************************************************************
+ * Copyright (c) 2023, Google LLC.  All rights reserved.
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef NET_DEVICE_UNPACK_H
+#define NET_DEVICE_UNPACK_H
+
+#include "unpack_defs.h"
+
+#include "op128.h"
+#include "align.h"
+#include "device.h"
+#include "common.h"
+
+// #define ALIGNED_LOAD
+/* Loads the 64-bit word at `ptr` into `v` using inline PTX: a relaxed,
+ * gpu-scope load when compiling for __CUDA_ARCH__ >= 700, otherwise a volatile
+ * load from the global state space.
+ * NOTE(review): the sm_70+ form has no `.global` qualifier while the fallback
+ * does, and `ptr` is passed without an address-space conversion -- confirm
+ * both forms are intended for the same kind of pointer. */
+inline __device__ void load64gpu(const uint64_t* ptr, uint64_t &v) {
+  #if __CUDA_ARCH__ >= 700
+      asm volatile("ld.relaxed.gpu.u64 {%0}, [%1];"
+      : "=l"(v) : "l"(ptr));
+  #else
+      asm volatile("ld.volatile.global.u64 {%0}, [%1];"
+      : "=l"(v) : "l"(ptr));
+  #endif
+}
+
+#define PAGE_META_SIZE 16
+#define META_LOAD_SIZE 16
+#define DATA_LOAD_SIZE 16
+
+// Associate a plugin handle with (group, peer index); called once at init time.
+// Copies the handle's metadata pointer into the group's g_meta[index] slot, the
+// bounce buffer pointer into ncclShmem (stored once, not per group), and the
+// handle's head counter into the group's state.
+inline __device__ void ncclNetDeviceUnpackSetup(void* ohandle, const int group, const int index) {
+  const struct unpackNetDeviceHandle* h = static_cast<const struct unpackNetDeviceHandle*>(ohandle);
+  auto& groupUnpack = ncclShmem.groups[group].devicePlugin.unpack;
+  groupUnpack.g_meta[index] = h->meta;
+  ncclShmem.devicePlugin.unpack.bounce_buf = h->bounce_buf;
+  groupUnpack.head = h->head;
+}
+
+// Advance the group's unpack head counter by one.
+inline __device__ void ncclNetDeviceIncrementHead(const int group) {
+  ncclShmem.groups[group].devicePlugin.unpack.head += 1;
+}
+
+// Write the group's current unpack head counter back into the plugin handle.
+inline __device__ void ncclNetDeviceSaveHead(void* ohandle, const int group) {
+  static_cast<struct unpackNetDeviceHandle*>(ohandle)->head =
+      ncclShmem.groups[group].devicePlugin.unpack.head;
+}
+/* bulkLoad<sz>: copy the whole 16-byte (DATA_LOAD_SIZE) chunks of a `len`-byte
+ * region from cpy_src to cpy_dst, using accesses of `sz` bytes each. Lane `t`
+ * starts at chunk t and advances by WARP_SIZE chunks per iteration. A trailing
+ * partial chunk (len % 16 bytes) is NOT copied here. `reg` is the staging
+ * storage for one chunk. The parameters w, g_meta, s_meta, src_off and dst_off
+ * are not used by any of the bodies below.
+ * The primary template simply forwards to the 1-byte variant.
+ * NOTE(review): the ALIGNED_LOAD branches use `reg.u64` although `reg` is a
+ * pointer parameter; they look like they would not compile if ALIGNED_LOAD
+ * were ever defined -- confirm before enabling it. */
+template <uint8_t sz>
+inline __device__ void bulkLoad(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<sz> *reg, const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){
+  bulkLoad<1>(t, len, cpy_src, cpy_dst, reg, w, g_meta, s_meta, src_off, dst_off);
+}
+/* 1-byte accesses: 16 volatile loads followed by 16 stores per chunk. */
+template <>
+inline __device__ void bulkLoad<1>(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<1> reg[16], const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){
+  uint64_t data_s;
+  for (data_s = t * DATA_LOAD_SIZE; data_s + DATA_LOAD_SIZE - 1 < len; data_s += WARP_SIZE * DATA_LOAD_SIZE) {
+
+#ifdef ALIGNED_LOAD
+    load128 ((uint64_t*)(cpy_src + data_s), reg.u64[0], reg.u64[1]);
+#else
+#pragma unroll
+    for (int i=0; i<16; i++) {
+      reg[i] = ld_volatile_global<1>((uintptr_t)((uint8_t*)(cpy_src + data_s) + i));
+    }
+#endif
+
+#pragma unroll
+    for (int i=0; i<16; i++) {
+      st_global<1>((uintptr_t)((uint8_t*)(cpy_dst + data_s) + i), reg[i]);
+    }
+  }
+}
+/* 2-byte accesses: 8 volatile loads followed by 8 stores per chunk. */
+template <>
+inline __device__ void bulkLoad<2>(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<2> reg[8], const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){
+  uint64_t data_s;
+  for (data_s = t * DATA_LOAD_SIZE; data_s + DATA_LOAD_SIZE - 1 < len; data_s += WARP_SIZE * DATA_LOAD_SIZE) {
+#ifdef ALIGNED_LOAD
+    load128 ((uint64_t*)(cpy_src + data_s), reg.u64[0], reg.u64[1]);
+#else
+#pragma unroll
+    for (int i=0; i<8; i++) {
+      reg[i] = ld_volatile_global<2>((uintptr_t)((uint16_t*)(cpy_src + data_s) + i));
+    }
+#endif
+
+
+#pragma unroll
+    for (int i=0; i<8; i++) {
+      st_global<2>((uintptr_t)((uint16_t*)(cpy_dst + data_s) + i), reg[i]);
+    }
+  }
+}
+/* 4-byte accesses: 4 volatile loads followed by 4 stores per chunk. */
+template <>
+inline __device__ void bulkLoad<4>(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<4> reg[4], const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){
+  uint64_t data_s;
+  for (data_s = t * DATA_LOAD_SIZE; data_s + DATA_LOAD_SIZE - 1 < len; data_s += WARP_SIZE * DATA_LOAD_SIZE) {
+#ifdef ALIGNED_LOAD
+    load128 ((uint64_t*)(cpy_src + data_s), reg.u64[0], reg.u64[1]);
+#else
+#pragma unroll
+    for (int i=0; i<4; i++) {
+      reg[i] = ld_volatile_global<4>((uintptr_t)((uint32_t *)(cpy_src + data_s) + i));
+    }
+#endif
+
+#pragma unroll
+    for (int i=0; i<4; i++) {
+      st_global<4>((uintptr_t)((uint32_t*)(cpy_dst + data_s) + i), reg[i]);
+    }
+  }
+}
+/* 8-byte accesses: 2 volatile loads followed by 2 stores per chunk. */
+template <>
+inline __device__ void bulkLoad<8>(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<8> reg[2], const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){
+  uint64_t data_s;
+  for (data_s = t * DATA_LOAD_SIZE; data_s + DATA_LOAD_SIZE - 1 < len; data_s += WARP_SIZE * DATA_LOAD_SIZE) {
+#ifdef ALIGNED_LOAD
+    load128 ((uint64_t*)(cpy_src + data_s), reg.u64[0], reg.u64[1]);
+#else
+#pragma unroll
+    for (int i=0; i<2; i++) {
+      reg[i] = ld_volatile_global<8>((uintptr_t)((uint64_t*)(cpy_src + data_s) + i));
+    }
+#endif
+
+#pragma unroll
+    for (int i=0; i<2; i++) {
+      st_global<8>((uintptr_t)((uint64_t*)(cpy_dst + data_s) + i), reg[i]);
+    }
+  }
+}
+/* 16-byte accesses: a single volatile load and a single store per chunk. */
+template <>
+inline __device__ void bulkLoad<16>(const int t, const uint32_t len, char* cpy_src, char* cpy_dst, BytePack<16> reg[1], const int w, loadMeta* g_meta, loadMeta* s_meta, uint32_t src_off, uint64_t dst_off){
+  uint64_t data_s;
+  for (data_s = t * DATA_LOAD_SIZE; data_s + DATA_LOAD_SIZE - 1 < len; data_s += WARP_SIZE * DATA_LOAD_SIZE) {
+    reg[0] = ld_volatile_global<16>((uintptr_t)(cpy_src + data_s));
+    st_global<16>((uintptr_t)(cpy_dst + data_s), reg[0]);
+  }
+}
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE 4096
+#endif
+// Pages-per-warp: split the DIVUP(nbytes, SLICE_PAGE_SIZE) pages across `nw`
+// warps (rounding up), then keep halving the result (rounding up) until it no
+// longer exceeds WARP_SHM_PAGE_CNT.
+inline __device__ int ppw(const int nbytes, int nw) {
+  const int totalPages = DIVUP(nbytes, SLICE_PAGE_SIZE);
+  int pagesPerWarp = DIVUP(totalPages, nw);
+  for (; pagesPerWarp > WARP_SHM_PAGE_CNT; pagesPerWarp = DIVUP(pagesPerWarp, 2)) {
+  }
+  return pagesPerWarp;
+}
+
+// Called by all threads. The Recv template argument picks the behaviour:
+// Recv=0 (send side) does nothing, Recv=1 gathers the data described by the
+// internal iovec into the flat buffer of every peer whose bit is set in `mask`.
+template <int Recv>
+inline __device__ void ncclNetDeviceUnpack(
+    const int tid, const int tidInBlock, const int nworkers, const int group, int mask, int Src, int workSize);
+
+// Send side: nothing to unpack.
+template <>
+inline __device__ void ncclNetDeviceUnpack</*Recv=*/0>(
+    const int tid, const int tidInBlock, const int nworkers, const int group, int mask, int Src, int workSize) {
+}
+
+inline __device__ void ncclNetDeviceUnpackInner(
+    const int tid, const int tidInBlock, const int nworkers, const int group, const int index,
+    void *src, const int nbytes, const uint64_t step);
+
+// Receive side: walk the set bits of `mask` from lowest to highest; each bit
+// position is used as a peer index.
+template <>
+inline __device__ void ncclNetDeviceUnpack</*Recv=*/1>(
+    const int tid, const int tidInBlock, const int nworkers, const int group, int mask, int Src, int workSize) {
+  for (int pending = mask; pending != 0; pending &= pending - 1) {
+    const int peerIx = __ffs(pending) - 1;  // position of the lowest set bit
+
+    // The flat buffer is srcs[peerIx + Src]; the "+ Src" is necessary in the
+    // case of accessing the user buffer directly. `group` is forwarded in case
+    // split-warp shared memory partitioning is needed.
+    ncclNetDeviceUnpackInner(tid, tidInBlock, nworkers, group, peerIx,
+        ncclShmem.groups[group].srcs[peerIx + Src], workSize,
+        ncclShmem.groups[group].devicePlugin.unpack.head);
+  }
+}
+/* Per-peer unpack. `index` selects the peer's metadata queue and `step`
+ * (modulo the queue depth) selects the slot in it. Every descriptor in that
+ * slot is copied from bounce_buf + src_off to src + dst_off; note that the
+ * parameter named `src` is the base of the copy DESTINATION here. Warps split
+ * the descriptors between them in batches of PPW; within a warp all lanes
+ * cooperate on each descriptor. */
+inline __device__ void ncclNetDeviceUnpackInner(
+    const int tid, const int tidInBlock, const int nworkers, const int group, const int index,
+    void *src, const int nbytes, const uint64_t step) {
+  // from src/collectives/device/common_kernel.h
+  const int w = tid / WARP_SIZE;        // Warp number
+  const int nw = nworkers / WARP_SIZE;  // Number of warps
+  const int t = tid % WARP_SIZE;        // Thread (inside the warp)
+
+  BytePack<16> reg;  // 16-byte staging register, reinterpreted per access width below
+  loadMeta meta;     // descriptor currently being processed
+
+  uint64_t head;
+  struct netUnpackMeta* g_meta_struct;
+  void* bounce_buf;
+
+  loadMeta* g_meta;   // descriptors of the selected queue slot
+  loadMeta* s_meta;   // this warp's scratch area used to stage descriptors
+  uint64_t meta_cnt;  // number of descriptors in the selected queue slot
+
+  // hack head use per-warp
+  head          = step;
+  g_meta_struct = ncclShmem.groups[group].devicePlugin.unpack.g_meta[index];
+  bounce_buf    = ncclShmem.devicePlugin.unpack.bounce_buf;
+
+  __syncwarp();
+
+  head %= NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH;
+
+  g_meta = g_meta_struct->mem[head];
+
+  // Currently, even/odd groups perform send/recv separately. We don't really need space for send side.
+  // Total size is N page per warp * 16 B per page * 20 WARPS max = 320 * N bytes, N == WARP_SHM_PAGE_CNT
+  static_assert(ncclShmemScratchWarpSize() >= WARP_SHM_SIZE, "Each warp must have enough scratch space");
+  s_meta = (loadMeta*) ncclScratchForWarp(tidInBlock / WARP_SIZE); // (loadMeta*) (ncclShmem.devicePlugin.unpack.meta + shm_off);
+
+  load64gpu(g_meta_struct->cnt + head, meta_cnt);
+
+  int PPW = ppw(nbytes, nw);  // descriptors handled per warp per outer iteration
+
+  for (uint64_t meta_s = w * PPW; meta_s < meta_cnt; meta_s += nw * PPW) {
+
+    uint64_t iter_meta_cnt = meta_cnt - meta_s;
+    iter_meta_cnt = iter_meta_cnt < PPW ? iter_meta_cnt : PPW;
+
+    // Stage this batch: lane t copies descriptor meta_s + t from g_meta into s_meta.
+    // NOTE(review): s_meta is already this warp's scratch area, yet it is indexed with w * PPW + t;
+    // the static_assert above only guarantees room for WARP_SHM_PAGE_CNT entries -- confirm this stays in bounds.
+    // TODO: this load size needs to work if not aligned, but since the two are both 16...
+    if (t < PPW * PAGE_META_SIZE / META_LOAD_SIZE && t < iter_meta_cnt) {  // avoid last iter load garbage data
+      load128((const uint64_t*) (g_meta + (meta_s + t)), reg.u64[0], reg.u64[1]);
+
+      storeShmem128(shmemCvtPtr((uint64_t *)(s_meta + (w * PPW + t))), reg.u64[0], reg.u64[1]);
+    }
+
+    __syncwarp();
+
+    for (int x = 0; x < iter_meta_cnt; x++) {
+      int meta_idx = x + w * PPW;
+      
+      // load page offs
+      loadShmem128(shmemCvtPtr((uint64_t*) (s_meta + meta_idx)), meta.r64[0], meta.r64[1]);
+
+      if (meta.len >= DATA_LOAD_SIZE) {
+        // fast path, but need to adapt to alignment issue
+
+        // bulk copy data: the access width is the largest power of two (up to 16) dividing both offsets
+        uint8_t align_off = (meta.src_off | meta.dst_off) % DATA_LOAD_SIZE;
+        align_off = align_off & -align_off;  // keep only the lowest set bit
+        if (align_off == 0) {  // both offsets are multiples of 16 (DATA_LOAD_SIZE)
+          bulkLoad<16>(t, meta.len, (char*) bounce_buf + meta.src_off, (char*) src + meta.dst_off, &reg, w, g_meta, s_meta, meta.src_off, meta.dst_off);
+        } else if (align_off & 0x8) {
+          bulkLoad<8>(t, meta.len, (char*) bounce_buf + meta.src_off, (char*) src + meta.dst_off, (BytePack<8>*) &reg, w, g_meta, s_meta, meta.src_off, meta.dst_off);
+        } else if (align_off & 0x4) {
+          bulkLoad<4>(t, meta.len, (char*) bounce_buf + meta.src_off, (char*) src + meta.dst_off, (BytePack<4>*) &reg, w, g_meta, s_meta, meta.src_off, meta.dst_off);
+        } else if (align_off & 0x2) {
+          bulkLoad<2>(t, meta.len, (char*) bounce_buf + meta.src_off, (char*) src + meta.dst_off, (BytePack<2>*) &reg, w, g_meta, s_meta, meta.src_off, meta.dst_off);
+        } else { // if (align_off & 0x1)
+          bulkLoad<1>(t, meta.len, (char*) bounce_buf + meta.src_off, (char*) src + meta.dst_off, (BytePack<1>*) &reg, w, g_meta, s_meta, meta.src_off, meta.dst_off);
+        }
+      }
+
+      // tail: the remaining len % 16 bytes (must be less than 16 bytes), one byte per lane
+      if (t < meta.len % DATA_LOAD_SIZE) {
+        volatile char* cpy_src = (char*) bounce_buf + meta.src_off + (meta.len / DATA_LOAD_SIZE) * DATA_LOAD_SIZE + t;
+        volatile char* cpy_dst = (char*) src        + meta.dst_off + (meta.len / DATA_LOAD_SIZE) * DATA_LOAD_SIZE + t;
+        *cpy_dst = *cpy_src;
+      }
+    }
+
+    __syncwarp();
+  }
+}
+
+#endif  // NET_DEVICE_UNPACK_H
@@ -0,0 +1,61 @@
+/*************************************************************************
+ * Copyright (c) 2023, Google LLC.  All rights reserved.
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef NET_DEVICE_UNPACK_DEFS_H
+#define NET_DEVICE_UNPACK_DEFS_H
+
+#include <stdint.h>
+
+#include "device.h"
+
+#define NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH 16
+
+// One 16-byte copy descriptor. It can be read and written either as two raw
+// 64-bit words (r64), which is how the device code moves it, or through its
+// named fields.
+union alignas(16) loadMeta {
+  uint64_t r64[2];
+  struct {
+    uint32_t src_off;  // byte offset of the data inside the bounce buffer
+    uint32_t len;      // number of bytes to copy
+    uint64_t dst_off;  // byte offset of the data inside the destination buffer
+  };
+};
+// The condition checks the size, so the message says so too; the alignment is
+// already guaranteed by alignas(16) above.
+static_assert(sizeof(union loadMeta) == 16, "loadMeta must be exactly 16 bytes");
+
+/****** global memory ******/
+
+#define NET_UNPACK_MAX_QUEUE_DEPTH 16  // MAX_REQUESTS
+#define NET_UNPACK_MAX_SLICE_SIZE 4194304  // 4MB per Irecv call
+#define SLICE_PAGE_SIZE 4096
+#define NET_UNPACK_MAX_SLICE_PAGES \
+  (NET_UNPACK_MAX_SLICE_SIZE / SLICE_PAGE_SIZE * 2)  // * 2 for slack, wasteful..
+
+struct netUnpackMeta {
+  loadMeta mem[NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH][NET_UNPACK_MAX_SLICE_PAGES];  // per queue slot: the copy descriptors of that slot
+  uint64_t cnt[NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH];  // per queue slot: how many entries of mem[slot] the device code processes
+};
+
+struct unpackNetDeviceHandle {
+  struct netUnpackMeta *meta;  // mapped
+  void* bounce_buf;  // base address that loadMeta::src_off is added to when copying
+  uint64_t head;  // step counter; copied into shared memory at setup and written back by ncclNetDeviceSaveHead
+};
+
+/****** shared memory ******/
+
+#define NET_UNPACK_MAX_GROUPS 16 // Forked from NCCL_MAX_GROUPS in devcomm.h
+#define NET_UNPACK_MAX_NPEERS 2  // The most you should have is 2 network peers per-group (indexed by index)
+#define WARP_SHM_PAGE_CNT 4
+#define WARP_SHM_SIZE (WARP_SHM_PAGE_CNT * sizeof(union loadMeta))
+struct unpackShmem {
+  void* bounce_buf;  // copied from the plugin handle by ncclNetDeviceUnpackSetup; a single value, not per group
+};
+
+struct unpackGroupShmem {
+  int unpackNetDeviceIndexMask; // Bitmask of peer indices: ncclNetDeviceUnpack<1> runs the unpack once per set bit, using the bit position as the peer index
+  uint64_t head; // step counter; taken modulo NCCL_NET_DEVICE_UNPACK_MAX_QUEUE_DEPTH to select the metadata queue slot
+  struct netUnpackMeta* g_meta[NET_UNPACK_MAX_NPEERS]; // per peer index: metadata queue taken from that peer's handle
+};
+
+#endif // NET_DEVICE_UNPACK_DEFS_H
@@ -0,0 +1,79 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "alloc.h"
+#include "collectives.h"
+#include "common_kernel.h"
+#include "common.h"
+#include <cuda_runtime.h>
+
+namespace {
+  // Single-rank reduction kernel: runs reduceCopy with one source and one
+  // destination over nElts elements of RedOp::EltType, with PreOpSrcs enabled
+  // so the op's pre-operation is applied to the source.
+  //   dst, src       destination / source buffers (may be the same)
+  //   nElts          total number of elements; split between the blocks
+  //   redOpArg       the op's scalar argument, or its address
+  //   redOpArgIsPtr  true when redOpArg is an address to load the scalar from
+  template<typename RedOp>
+  __global__ __launch_bounds__(512, 1)
+  void oneRankReduce(void* dst, void* src, size_t nElts, uint64_t redOpArg, bool redOpArgIsPtr) {
+    using T = typename RedOp::EltType;
+    int tid = threadIdx.x;
+    int tn = blockDim.x;
+    int bid = blockIdx.x;
+    int bn = gridDim.x;
+
+    // each block/channel gets a roughly equal segment of 16 byte packs.
+    // The per-block share is rounded UP before being aligned: with a truncating
+    // nElts/bn the segments can sum to less than nElts (e.g. 4097 floats on 2
+    // blocks gives 2 * 2048 = 4096) and the last elements would never be written.
+    constexpr int EltPerPack = 16/sizeof(T);
+    intptr_t i0 = (bid+0)*alignUp((nElts + bn - 1)/bn, EltPerPack);
+    intptr_t i1 = (bid+1)*alignUp((nElts + bn - 1)/bn, EltPerPack);
+    i0 = min(i0, nElts);
+    i1 = min(i1, nElts);
+    src = (T*)src + i0;
+    dst = (T*)dst + i0;
+
+    if (redOpArgIsPtr) {
+      // The width of the load is chosen from the alignment of the address, so
+      // the access is always naturally aligned.
+      if (redOpArg%2 != 0) {
+        redOpArg = *reinterpret_cast<uint8_t*>(redOpArg);
+      } else if (redOpArg%4 != 0) {
+        redOpArg = *reinterpret_cast<uint16_t*>(redOpArg);
+      } else if (redOpArg%8 != 0) {
+        redOpArg = *reinterpret_cast<uint32_t*>(redOpArg);
+      } else {
+        redOpArg = *reinterpret_cast<uint64_t*>(redOpArg);
+      }
+    }
+    reduceCopy<COLL_UNROLL, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/1>
+      (tid, tn, redOpArg, &redOpArg, true, 1, &src, 1, &dst, i1-i0);
+  }
+}
+
+// Performs a collective on a single rank.
+// For every op other than ncclDevPreMulSum this is a plain asynchronous copy
+// (skipped when dst == src). For ncclDevPreMulSum the oneRankReduce kernel
+// matching eltType is launched on `stream`.
+// Returns ncclInvalidArgument for an element type without a kernel, and
+// propagates errors from the copy or the kernel launch.
+ncclResult_t ncclLaunchOneRank(void* dst, void const* src, size_t nElts, struct ncclDevRedOpFull redOp, ncclDataType_t eltType, cudaStream_t stream) {
+  size_t eltSize = ncclTypeSize(eltType);
+  if (redOp.op != ncclDevPreMulSum) {
+    if (dst != src) {
+      NCCLCHECK(ncclCudaMemcpyAsync((char*)dst, (char*)src, nElts*eltSize, stream));
+    }
+    return ncclSuccess;
+  }
+
+  void const* kernel;
+  switch (eltType) {
+  case ncclInt8:     kernel = (void const*)&oneRankReduce<FuncPreMulSum<int8_t>>; break;
+  case ncclUint8:    kernel = (void const*)&oneRankReduce<FuncPreMulSum<uint8_t>>; break;
+  case ncclInt32:    kernel = (void const*)&oneRankReduce<FuncPreMulSum<int32_t>>; break;
+  case ncclUint32:   kernel = (void const*)&oneRankReduce<FuncPreMulSum<uint32_t>>; break;
+  case ncclInt64:    kernel = (void const*)&oneRankReduce<FuncPreMulSum<int64_t>>; break;
+  case ncclUint64:   kernel = (void const*)&oneRankReduce<FuncPreMulSum<uint64_t>>; break;
+  case ncclFloat16:  kernel = (void const*)&oneRankReduce<FuncPreMulSum<half>>; break;
+  #if defined(__CUDA_BF16_TYPES_EXIST__)
+  case ncclBfloat16: kernel = (void const*)&oneRankReduce<FuncPreMulSum<__nv_bfloat16>>; break;
+  #endif
+  case ncclFloat32:  kernel = (void const*)&oneRankReduce<FuncPreMulSum<float>>; break;
+  case ncclFloat64:  kernel = (void const*)&oneRankReduce<FuncPreMulSum<double>>; break;
+  default: return ncclInvalidArgument;
+  }
+  // Nothing to do for an empty buffer. Without this check grid.x below would
+  // be 0, which is not a valid launch configuration.
+  if (nElts == 0) return ncclSuccess;
+
+  // One block per 16 KiB of data, capped at 32 blocks of 512 threads.
+  dim3 grid = {0, 1, 1};
+  grid.x = std::min(32, (int)divUp(nElts*eltSize, 16<<10));
+  dim3 block = {512, 1, 1};
+  void* args[5] = {&dst, &src, &nElts, &redOp.scalarArg, &redOp.scalarArgIsPtr};
+  CUDACHECK(cudaLaunchKernel(kernel, grid, block, args, 0, stream));
+  return ncclSuccess;
+}
@@ -161,21 +161,25 @@ __device__ __forceinline__ T fromPack(typename BytePackOf<T>::Pack pack)  {
 // Load/store of BytePack<?> using integral addresses.

 template<int Size> __device__ BytePack<Size> ld_global(uintptr_t addr);
-template<int Size> __device__ BytePack<Size> ld_volatile_global(uintptr_t addr);
 template<int Size> __device__ BytePack<Size> ld_shared(uint32_t addr);
+template<int Size> __device__ BytePack<Size> ld_volatile_global(uintptr_t addr);
 template<int Size> __device__ BytePack<Size> ld_volatile_shared(uint32_t addr);
+template<int Size> __device__ BytePack<Size> ld_relaxed_gpu_global(uintptr_t addr);
 template<int Size> __device__ void st_global(uintptr_t addr, BytePack<Size> value);
 template<int Size> __device__ void st_shared(uint32_t addr, BytePack<Size> value);
+template<int Size> __device__ void st_relaxed_gpu_global(uintptr_t addr, BytePack<Size> value);

 template<> __device__ __forceinline__ BytePack<0> ld_global<0>(uintptr_t addr) { return {}; }
-template<> __device__ __forceinline__ BytePack<0> ld_volatile_global<0>(uintptr_t addr) { return {}; }
 template<> __device__ __forceinline__ BytePack<0> ld_shared<0>(uint32_t addr) { return {}; }
+template<> __device__ __forceinline__ BytePack<0> ld_volatile_global<0>(uintptr_t addr) { return {}; }
 template<> __device__ __forceinline__ BytePack<0> ld_volatile_shared<0>(uint32_t addr) { return {}; }
+template<> __device__ __forceinline__ BytePack<0> ld_relaxed_gpu_global<0>(uintptr_t addr) { return {}; }
 template<> __device__ __forceinline__ void st_global<0>(uintptr_t addr, BytePack<0> value) {}
 template<> __device__ __forceinline__ void st_shared<0>(uint32_t addr, BytePack<0> value) {}
+template<> __device__ __forceinline__ void st_relaxed_gpu_global<0>(uintptr_t addr, BytePack<0> value) {}

 // Used to define implementations for above prototypes.
-#define DEFINE_ld_st(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, space, addr_cxx_ty, addr_reg_ty) \
+#define DEFINE_ld_st__size_space(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, space, addr_cxx_ty, addr_reg_ty) \
  template<> \
  __device__ __forceinline__ BytePack<bytes> ld_##space<bytes>(addr_cxx_ty addr) { \
    data_cxx_ty tmp; \
@@ -197,19 +201,44 @@ template<> __device__ __forceinline__ void st_shared<0>(uint32_t addr, BytePack<
    data_cxx_ty tmp = value.native; \
    asm volatile("st." #space "." #data_ptx_ty " [%0], %1;" :: #addr_reg_ty(addr), #data_reg_ty(tmp) : "memory"); \
  }
+/* PTX qualifier spliced into the relaxed gpu-scope ld/st instructions below:
+ * "relaxed.gpu" when compiling for __CUDA_ARCH__ >= 700, "volatile" otherwise. */
+#if __CUDA_ARCH__ >= 700
+  #define PTX_relaxed_gpu "relaxed.gpu"
+#else
+  #define PTX_relaxed_gpu "volatile"
+#endif
+/* Defines ld_relaxed_gpu_global<bytes> and st_relaxed_gpu_global<bytes> for one
+ * access size. data_cxx_ty is the C++ temporary used for the transfer,
+ * data_ptx_ty the PTX type suffix and data_reg_ty the asm constraint letter.
+ * NOTE(review): the load asm is neither `volatile` nor has a "memory" clobber,
+ * unlike the store -- confirm that is intended. */
+#define DEFINE_ld_st_gpu_relaxed__size(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty) \
+  template<> \
+  __device__ __forceinline__ BytePack<bytes> ld_relaxed_gpu_global<bytes>(uintptr_t addr) { \
+    data_cxx_ty tmp; \
+    asm("ld." PTX_relaxed_gpu ".global." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : "l"(addr)); \
+    BytePack<bytes> ans; \
+    ans.native = tmp; \
+    return ans; \
+  } \
+  template<> \
+  __device__ __forceinline__ void st_relaxed_gpu_global<bytes>(uintptr_t addr, BytePack<bytes> value) { \
+    data_cxx_ty tmp = value.native; \
+    asm volatile("st." PTX_relaxed_gpu ".global." #data_ptx_ty " [%0], %1;" :: "l"(addr), #data_reg_ty(tmp) : "memory"); \
+  }
+/* For one access size: expands the per-space helper for the global and the
+ * shared space, then the relaxed gpu-scope helper. */
+#define DEFINE_ld_st__size(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty) \
+  DEFINE_ld_st__size_space(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, global, uintptr_t, l) \
+  DEFINE_ld_st__size_space(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, shared, uint32_t, r) \
+  DEFINE_ld_st_gpu_relaxed__size(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty)
+
 // Single-byte types use 4-byte registers since there is no 1-byte register
 // character for asm blocks. See https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html#constraints
-DEFINE_ld_st(1, uint32_t, b8, r, global, uintptr_t, l)
-DEFINE_ld_st(1, uint32_t, b8, r, shared, uint32_t, r)
-DEFINE_ld_st(2, uint16_t, b16, h, global, uintptr_t, l)
-DEFINE_ld_st(2, uint16_t, b16, h, shared, uint32_t, r)
-DEFINE_ld_st(4, uint32_t, b32, r, global, uintptr_t, l)
-DEFINE_ld_st(4, uint32_t, b32, r, shared, uint32_t, r)
-DEFINE_ld_st(8, uint64_t, b64, l, global, uintptr_t, l)
-DEFINE_ld_st(8, uint64_t, b64, l, shared, uint32_t, r)
-#undef DEFINE_ld_st
+DEFINE_ld_st__size(1, uint32_t, b8, r)
+DEFINE_ld_st__size(2, uint16_t, b16, h)
+DEFINE_ld_st__size(4, uint32_t, b32, r)
+DEFINE_ld_st__size(8, uint64_t, b64, l)

-#define DEFINE_ld_st_16(space, addr_cxx_ty, addr_reg_ty) \
+#undef DEFINE_ld_st__size_space
+#undef DEFINE_ld_st__size
+
+#define DEFINE_ld_st_16__space(space, addr_cxx_ty, addr_reg_ty) \
  template<> \
  __device__ __forceinline__ BytePack<16> ld_##space<16>(addr_cxx_ty addr) { \
    BytePack<16> ans; \
@@ -226,10 +255,23 @@ DEFINE_ld_st(8, uint64_t, b64, l, shared, uint32_t, r)
  __device__ __forceinline__ void st_##space<16>(addr_cxx_ty addr, BytePack<16> value) { \
    asm("st." #space ".v2.b64 [%0], {%1,%2};" :: #addr_reg_ty(addr), "l"(value.u64[0]), "l"(value.u64[1]) : "memory"); \
  }
-DEFINE_ld_st_16(global, uintptr_t, l)
-DEFINE_ld_st_16(shared, uint32_t, r)
+DEFINE_ld_st_16__space(global, uintptr_t, l)
+DEFINE_ld_st_16__space(shared, uint32_t, r)
-#undef DEFINE_ld_st_16
+// Undefine the helper under the name it is actually defined with, so that it
+// does not stay visible to every file including this header.
+#undef DEFINE_ld_st_16__space

+template<>  // 16-byte load: a single v2.b64 global load using the PTX_relaxed_gpu qualifier
+__device__ __forceinline__ BytePack<16> ld_relaxed_gpu_global<16>(uintptr_t addr) {
+  BytePack<16> ans;
+  asm("ld." PTX_relaxed_gpu ".global.v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : "l"(addr));  // NOTE(review): not `asm volatile` and no "memory" clobber, unlike the store below -- confirm intended
+  return ans;
+}
+template<>  // 16-byte store: a single v2.b64 global store using the PTX_relaxed_gpu qualifier
+__device__ __forceinline__ void st_relaxed_gpu_global<16>(uintptr_t addr, BytePack<16> value) {
+  asm volatile("st." PTX_relaxed_gpu ".global.v2.b64 [%0], {%1,%2};" :: "l"(addr), "l"(value.u64[0]), "l"(value.u64[1]) : "memory");
+}
+
+#undef PTX_relaxed_gpu
+
 ////////////////////////////////////////////////////////////////////////////////
 // Atomic load/store using c++ pointers.

@@ -247,6 +289,15 @@ __device__ __forceinline__ uint64_t ld_relaxed_sys_global(uint64_t *ptr) {
  #endif
  return ans;
 }
+__device__ __forceinline__ uint64_t ld_relaxed_gpu_global(uint64_t *ptr) {  // returns the 64-bit value loaded from *ptr
+  uint64_t ans;
+  #if __CUDA_ARCH__ >= 700
+    asm("ld.relaxed.gpu.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));  // relaxed load at gpu scope
+  #else
+    asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));  // volatile load is used below __CUDA_ARCH__ 700
+  #endif
+  return ans;
+}
 __device__ __forceinline__ uint64_t ld_acquire_sys_global(uint64_t *ptr) {
  uint64_t ans;
  #if __CUDA_ARCH__ >= 700
@@ -323,7 +323,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
  __device__  Primitives(
      const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
      void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
-      uint8_t connIndexRecv=0, uint8_t connIndexSend=0
+      uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, int stepSize_=0
    ):
    redOp(redOpArg),
    tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group),
@@ -364,7 +364,7 @@ public:
  __device__ Primitives(
      const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
      void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
-      uint8_t connIndexRecv=0, uint8_t connIndexSend=0
+      uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, int stepSize_=0
    ):
    redOp(redOpArg),
    tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE),
@@ -4,6 +4,8 @@
 * See LICENSE.txt for license information
 ************************************************************************/

+#include "network/unpack/unpack.h"
+
 template<typename T, typename RedOp, typename Fan, int Direct,
         int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, int MultimemSrcs, int MultimemDsts>
 class Primitives<
@@ -23,7 +25,11 @@ class Primitives<
                       DirectWrite = 0x200,
                       DirectRead = 0x400,
                       ThreadsSynced = 0x800,
-                       NvlsMinPolling = 0x1000;
+                       NvlsMinPolling = 0x1000,
+                       NetDeviceUnpack = 0x2000,
+                       AnyNetDeviceUnpack = 0x4000,
+                       NvlsDirectRead = 0x8000,
+                       NvlsDirectWrite = 0x10000;
  const int tid, tidInBlock;
  const int nthreads;
  int nworkers;
@@ -44,6 +50,8 @@ class Primitives<
  };
  uint64_t *connStepPtr;
  uint64_t connStepCache; // Cache last seen value of (*connStepPtr)
+  void*    mhandle;
+  void*    netDeviceHandle;

  // Don't use barrier 0 as it's used by the final sync
  __device__ void barrier() {
@@ -141,7 +149,7 @@ class Primitives<
      if (flags & OffsFifoEnabled)
        ptrs[index] = connEltsFifo + loadInt(connOffsFifoPtr + (step%NCCL_STEPS))/sizeof(T);
      else if (isSendNotRecv && DirectSend) {
-        if (flags & DirectWrite) {
+        if (flags & (DirectWrite | NvlsDirectWrite)) {
          ptrs[index] = directBuff + dstIx + offset;
        } else if (flags & DirectRead) {  // empty send
          ptrs[index] = nullptr;
@@ -149,7 +157,7 @@ class Primitives<
          ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
        }
      } else if (!isSendNotRecv && DirectRecv) {
-        if (flags & DirectRead) {
+        if (flags & (DirectRead | NvlsDirectRead)) {
          ptrs[index] = directBuff + srcIx + offset;
        } else if (flags & DirectWrite) {
          ptrs[index] = directBuff + dstIx + offset;  // send to next from my output buffer
@@ -160,6 +168,9 @@ class Primitives<
      else {
        ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
      }
+      if ((flags & (AnyNetDeviceUnpack)) && (flags & (Recv*RoleWaitRecv))) {
+        ncclNetDeviceIncrementHead(group);
+      }
      step += StepPerSlice;
    }
  }
@@ -229,7 +240,16 @@ class Primitives<
        /* if user abort the kernel, we don't need to actually perform copy/reduce; just set size
         * to 0 to avoid unnecessary workload. */
        int workSize = ncclShmem.aborted ? 0 : sliceSize;
-        if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
+        if (flags & AnyNetDeviceUnpack) {
+          ncclNetDeviceUnpack<Recv>(tid, tidInBlock, nworkers, group, ncclShmem.groups[group].devicePlugin.unpack.unpackNetDeviceIndexMask, Src, workSize);
+          // Sync here to make sure all workers are reading from the updated srcs
+          subBarrier();
+        }
+
+        if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]
+            /* NVLS can have srcs[0] == dsts[0], but we cannot enter this "if branch",
+             * so we need to check whether MultimemSrcs and MultimemDsts are 0. */
+            && MultimemSrcs == 0 && MultimemDsts == 0) {
          // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
          if (Send) {
            reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, MaxSend, /*PreOpSrcs*/0>
@@ -286,7 +306,7 @@ class Primitives<
  // shift: peer offset to avoid all ranks sending to or receiving from same peer
  template <int DirectRecv1, int DirectSend1, int Recv, int Send>
  __device__ __forceinline__ void
-  ScatterGatherOp(intptr_t inpIx, intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift, bool postOp) {
+  ScatterGatherOp(intptr_t inpIx, intptr_t outIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift, bool postOp) {
    constexpr int DirectRecv = 1 && Direct && DirectRecv1;
    constexpr int DirectSend = 1 && Direct && DirectSend1;
    int offset = 0; // slice offset
@@ -295,7 +315,7 @@ class Primitives<

    #pragma unroll
    for (int slice=0; slice<SlicePerChunk; ++slice) {
-      int realSize = max(0, min(dataSize, peerElem-offset));
+      ssize_t realSize = max(0, min(dataSize, peerElem-offset));
      bool fenceNeeded = false;
      if (tid < nworkers) {
        if (Send) {
@@ -309,11 +329,11 @@ class Primitives<
          // Loop over peers
          for (int j=0; j<fan.nsend(); j++) {
            int i = (j+shift)%fan.nsend();
-            int pOffset = i*peerOffset;
+            ssize_t pOffset = i*peerOffset;
            // Skip the data I am responsible of reducing myself
            if (skip >= 0 && i >= skip) pOffset += peerElem;
            void* src0 = (T*)ncclShmem.groups[group].srcs[0] + pOffset;
-            int realPeerSize = min(realSize, totalElem-pOffset);
+            ssize_t realPeerSize = min(realSize, totalElem-pOffset);
            if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) {
              reduceCopy<Unroll, RedOp, T, 0,1,1, 0,1,1, PreOpSrcs>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, &src0, 1, ncclShmem.groups[group].dsts+i, realPeerSize);
              // Mark for threadfence at the end
@@ -322,10 +342,10 @@ class Primitives<
          }
        } else if (Recv) {
          if (flags & RoleOutput) ncclShmem.groups[group].dsts[0] = userBuff + outIx + offset;
-          int pOffset = index*peerOffset;
+          ssize_t pOffset = index*peerOffset;
          if (skip >= 0 && index >= skip) pOffset += peerElem;
          // Adjust remote index with peer offset in case we are directly pulling from peer's output buffer
-          waitPeer<DirectRecv, 0, 1, 0, 0, 1>(outIx, outIx+pOffset, offset, realSize);
+          waitPeer<DirectRecv, 0, 1, 0, 0, 1>(outIx+pOffset, outIx+pOffset, offset, realSize);
          subBarrier();
          #pragma unroll
          for (int j=0; j<fan.nrecv(); j++) {
@@ -333,7 +353,7 @@ class Primitives<
            pOffset = i*peerOffset;
            if (skip >= 0 && i >= skip) pOffset += peerElem;
            void* dst0 = (T*)ncclShmem.groups[group].dsts[0] + pOffset;
-            int realPeerSize = min(realSize, totalElem-pOffset);
+            ssize_t realPeerSize = min(realSize, totalElem-pOffset);
            if (DirectRecv && ncclShmem.groups[group].srcs[i] == dst0) realPeerSize = 0;
            if (realPeerSize > 0) reduceCopy<Unroll, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/0>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, 1, ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize);
          }
@@ -348,6 +368,13 @@ class Primitives<
  __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) {
    if (flags & (RoleWaitRecv|RolePostRecv)) {
      auto *conn = &peer->recv[connIndex];
+      if (conn->netDeviceHandle.netDeviceType == NCCL_NET_DEVICE_UNPACK) {
+        // handle must be a device ptr
+        netDeviceHandle = conn->netDeviceHandle.handle;
+        // Cache the handle
+        ncclNetDeviceUnpackSetup(netDeviceHandle, group, index);
+        flags |= NetDeviceUnpack;
+      }
      step = conn->step;
      step = roundUp(step, SlicePerChunk*StepPerSlice);
      if (flags & RolePostRecv) {
@@ -377,6 +404,9 @@ class Primitives<
              // otherwise, in one-to-multi send, we could mix empty send and intermediate send
              flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
            }
+          } else if ((conn->flags & NCCL_NVLS_MIN_POLL) && e != nullptr && e->regUsed) {
+            /* NVLS direct */
+            flags |= NvlsDirectRead;
          }
        }
        if (flags & OffsFifoEnabled)
@@ -393,6 +423,7 @@ class Primitives<
      step = roundUp(step, SlicePerChunk*StepPerSlice);
      if (flags & RolePostSend) {
        connStepPtr = conn->tail;
+        connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
      }
      if (flags & RoleWaitSend) {
        ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
@@ -424,6 +455,9 @@ class Primitives<
              // otherwise, in one-to-multi send, we could mix empty send and intermediate send
              flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
            }
+          } else if ((conn->flags & NCCL_NVLS_MIN_POLL) && e != nullptr && e->regUsed) {
+            /* NVLS direct */
+            flags |= NvlsDirectWrite;
          }
        }
      }
@@ -434,10 +468,10 @@ class Primitives<
  __device__ Primitives(
      int tid, int nthreads, int const *recvPeers, int const *sendPeers,
      void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
-      uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr
+      uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr, int stepSize_=0
    ):
    tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group),
-    stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) {
+    stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) {

    // For send operations, we need an extra warp to overlap the threadfence and the copy
    this->nworkers = nthreads - (MaxSend > 0 && nthreads-WARP_SIZE >= 64 ? WARP_SIZE : 0);
@@ -473,6 +507,20 @@ class Primitives<
    loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e);
    loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e);

+    if (barrierAny(flags & NetDeviceUnpack)) {
+      flags |= AnyNetDeviceUnpack;
+      // g == 0 is the first ThreadPerSync # of threads of this warp
+      // g == 0 is also the RoleWaitRecv threads of this group, thus the thread ID will correlate to the peer index
+      if (g == 0) {
+        uint32_t mask = __ballot_sync((1U << ThreadPerSync) - 1, (flags & NetDeviceUnpack) ? 1 : 0);
+
+        // We only want to update the shared memory variable with a single thread
+        if (tid == 0) {
+          ncclShmem.groups[this->group].devicePlugin.unpack.unpackNetDeviceIndexMask = mask;
+        }
+      }
+    }
+
    setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e);
  }

@@ -485,8 +533,10 @@ class Primitives<
      auto *conns = (flags & RolePostSend) ? ncclShmem.groups[group].sendConns : ncclShmem.groups[group].recvConns;
      conns[index]->step = step;
    }
-    // Make sure all threads are done writing back conn->step and done using
-    // ncclShmem.groups[group]
+    
+    if ((flags & (AnyNetDeviceUnpack)) && (flags & (RoleWaitRecv))) {
+      ncclNetDeviceSaveHead(netDeviceHandle, group);
+    }
    barrier();
  }

@@ -497,33 +547,41 @@ class Primitives<
    }
    if (flags & RoleOutput) userBuff = (T*)outputBuf;
    bool recvProvider = flags == (flags|RoleWaitRecv|DirectWrite);
-    bool sendAcceptor = flags == (flags|RoleWaitSend|DirectWrite);
+    bool sendAcceptor = (flags == (flags|RoleWaitSend|DirectWrite)) || (flags == (flags|RoleWaitSend|NvlsDirectWrite));
    bool sendProvider = flags == (flags|RoleWaitSend|DirectRead); // sender provides direct buffer (to be fetched)
-    bool recvAcceptor = flags == (flags|RoleWaitRecv|DirectRead); // receiver accepts direct buffer
+    bool recvAcceptor = flags == (flags|RoleWaitRecv|DirectRead) || (flags == (flags|RoleWaitRecv|NvlsDirectRead)); // receiver accepts direct buffer
    int regUsed = e != nullptr ? e->elem.regUsed : 0;

    if (Direct && recvProvider) {
      int spins = 0;
      void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
      // Wait for consumer to consume previous value before trampling it.
-      while (*slot != nullptr && !checkAbort(spins));
-      directBuff = (T*)outputBuf;
-      // Encode pointer by XOR'ing against some address they definitely wouldn't send
-      // since we want to allow them sending us nullptr while not colliding with
-      // the empty slot value.
-      *slot = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(directBuff) ^ reinterpret_cast<uintptr_t>(slot));
+      if (slot) {
+        while (*slot != nullptr && !checkAbort(spins));
+        directBuff = (T*)outputBuf;
+        // Encode pointer by XOR'ing against some address they definitely wouldn't send
+        // since we want to allow them sending us nullptr while not colliding with
+        // the empty slot value.
+        *slot = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(directBuff) ^ reinterpret_cast<uintptr_t>(slot));
+      }
    }
    if (Direct && sendAcceptor) {
      int spins = 0;
      void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
      void *ptr;
-      while (true) {
+      while (slot) {
        ptr = *slot;
        if (ptr != nullptr || checkAbort(spins)) break;
      }
-      directBuff = regUsed ? (T*)(e->dnOutputs[index]) :
+
+      if (slot) {
+        directBuff = regUsed ? (T*)(e->dnOutputs[index]) :
                   reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(ptr) ^ reinterpret_cast<uintptr_t>(slot));
-      *slot = nullptr;
+        *slot = nullptr;
+      } else {
+        /* slot is NULL, it must be regUsed == 1 */
+        directBuff = (T*)e->dnOutputs[index];
+      }
    }
    if (Direct && sendProvider) {
      int spins = 0;
@@ -531,17 +589,19 @@ class Primitives<
      volatile uint64_t* argSlot0 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange;
      volatile uint64_t* argSlot1 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange+1;
      // Wait for consumer to consume previous value before trampling it.
-      while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 !=0) && !checkAbort(spins));
-      // If there is no recv, then we are directly pulling from input buffer (e.g. directScatter)
-      // Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend)
-      directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf;
-      // Exchange pre-scalers for use in direct pull
-      *argSlot0 = (uint64_t(1)<<32) | (uint32_t)redOpArg;
-      *argSlot1 = (uint64_t(1)<<32) | (uint32_t)(redOpArg>>32);
-      // Encode pointer by XOR'ing against some address they definitely wouldn't send
-      // since we want to allow them sending us nullptr while not colliding with
-      // the empty slot value.
-      *slot = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(directBuff) ^ reinterpret_cast<uintptr_t>(slot));
+      if (slot && argSlot0 && argSlot1) {
+        while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 !=0) && !checkAbort(spins));
+        // If there is no recv, then we are directly pulling from input buffer (e.g. directScatter)
+        // Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend)
+        directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf;
+        // Exchange pre-scalers for use in direct pull
+        *argSlot0 = (uint64_t(1)<<32) | (uint32_t)redOpArg;
+        *argSlot1 = (uint64_t(1)<<32) | (uint32_t)(redOpArg>>32);
+        // Encode pointer by XOR'ing against some address they definitely wouldn't send
+        // since we want to allow them sending us nullptr while not colliding with
+        // the empty slot value.
+        *slot = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(directBuff) ^ reinterpret_cast<uintptr_t>(slot));
+      }
    }
    if (Direct && recvAcceptor) {
      int spins = 0;
@@ -549,24 +609,29 @@ class Primitives<
      volatile uint64_t* argSlot0 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange;
      volatile uint64_t* argSlot1 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange+1;
      void *ptr;
-      while (true) {
+      while (slot) {
        ptr = *slot;
        if (ptr != nullptr || checkAbort(spins)) break;
      }
-      directBuff = regUsed ? (T*)(MaxSend == 0 ? e->upOutputs[index] : e->dnInputs[index]) :
-                   reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(ptr) ^ reinterpret_cast<uintptr_t>(slot));
-      if (MaxSend != 0) { // reduce group rather than gather group
-        // Store scalers for remote inputs
-        uint64_t arg0, arg1;
-        while (true) {
-          arg0 = *argSlot0;
-          arg1 = *argSlot1;
-          if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break;
+
+      if (slot && argSlot0 && argSlot1) {
+        directBuff = regUsed ? (T*)(MaxSend == 0 ? e->upOutputs[index] : e->dnInputs[index]) :
+          reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(ptr) ^ reinterpret_cast<uintptr_t>(slot));
+        if (MaxSend != 0) { // reduce group rather than gather group
+          // Store scalers for remote inputs
+          uint64_t arg0, arg1;
+          while (true) {
+            arg0 = *argSlot0;
+            arg1 = *argSlot1;
+            if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break;
+          }
+          ncclShmem.redOpArgs[1 + index] = ((arg1 & 0xffffffff) << 32) | (arg0 & 0xffffffff);
        }
-        ncclShmem.redOpArgs[1+index] = ((arg1 & 0xffffffff)<<32) | (arg0 & 0xffffffff);
+        *argSlot0 = 0; *argSlot1 = 0;
+        *slot = nullptr;
+      } else {
+        directBuff = (T*)e->dnInputs[index];
      }
-      *argSlot0 = 0; *argSlot1 = 0;
-      *slot = nullptr;
    }
  }

@@ -594,6 +659,9 @@ class Primitives<
  __device__ __forceinline__ void directRecv(intptr_t outIx, int eltN) {
    genericOp<1, 0, 1, 0, -1, Output>(-1, outIx, eltN, /*postOp=*/false);
  }
+  __device__ __forceinline__ void directRecvCopy(intptr_t inpIx, intptr_t outIx, int eltN) { // Same genericOp instantiation as directRecv, but forwards inpIx instead of passing -1.
+    genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, /*postOp=*/false); // postOp is always disabled for this primitive
+  }

  __device__ __forceinline__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
    genericOp<0, 0, 0, 1, Input, Output>(inpIx, outIx, eltN, postOp);
@@ -611,6 +679,9 @@ class Primitives<
  __device__ __forceinline__ void directRecvCopySend(intptr_t outIx, int eltN) {
    genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, false);
  }
+  __device__ __forceinline__ void directRecvDirectSend(intptr_t inpIx, intptr_t outIx, int eltN) { // Shares its first four genericOp template arguments with directRecvCopySend.
+    genericOp<1, 1, 1, 1, -1, -1>(inpIx, outIx, eltN, false); // unlike directRecvCopySend: last template argument is -1 (not Output) and inpIx is forwarded; postOp is false
+  }
  __device__ __forceinline__ void recvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) {
    genericOp<0, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
  }
@@ -635,20 +706,20 @@ class Primitives<
  }

  __device__ __forceinline__ void
-  scatter(intptr_t inpIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
+  scatter(intptr_t inpIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift) {
    ScatterGatherOp<0, 0, 0, 1>(inpIx, -1, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false); // <DirectRecv1=0, DirectSend1=0, Recv=0, Send=1>; outIx is passed as -1, postOp disabled
  }
  __device__ __forceinline__ void
-  directScatter(intptr_t inpIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
+  directScatter(intptr_t inpIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift) {
    ScatterGatherOp<0, 1, 0, 1>(inpIx, -1, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false); // <DirectRecv1=0, DirectSend1=1, Recv=0, Send=1>; differs from scatter only in DirectSend1
  }

  __device__ __forceinline__ void
-  gather(intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift, bool postOp=false) {
+  gather(intptr_t outIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift, bool postOp=false) {
    ScatterGatherOp<0, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, postOp); // <DirectRecv1=0, DirectSend1=0, Recv=1, Send=0>; inpIx is passed as -1, caller's postOp is forwarded
  }
  __device__ __forceinline__ void
-  directGather(intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
+  directGather(intptr_t outIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift) {
    ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false); // <DirectRecv1=1, DirectSend1=0, Recv=1, Send=0>; unlike gather, postOp is fixed to false
  }
 };
@@ -4,7 +4,7 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "devcomm.h"
+#include "device.h"
 #include "collectives.h"
 #include "primitives.h"

@@ -12,6 +12,19 @@
 #include <limits>
 #include <type_traits>

+template<typename T>
+struct IsFloatingPoint: std::false_type {}; // Primary template: every type not specialized below is reported as non-floating-point.
+template<>
+struct IsFloatingPoint<half>: std::true_type {}; // half counts as floating point
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+template<>
+struct IsFloatingPoint<__nv_bfloat16>: std::true_type {}; // only declared when __CUDA_BF16_TYPES_EXIST__ is defined
+#endif
+template<>
+struct IsFloatingPoint<float>: std::true_type {}; // float counts as floating point
+template<>
+struct IsFloatingPoint<double>: std::true_type {}; // double counts as floating point
+
 ////////////////////////////////////////////////////////////////////////////////
 // The reduction function classes. All classes must:
 //  1. Expose the `EltType` typedef.
@@ -19,16 +32,21 @@
 //  3. Have constructor taking `uint64_t opArg`.

 template<typename T>
-struct FuncNull { using EltType = T; __device__ FuncNull(uint64_t opArg=0) {}; };
+struct FuncCopy { using EltType = T; __device__ FuncCopy(uint64_t opArg=0) {}; };
 template<typename T>
 struct FuncSum  { using EltType = T; __device__ FuncSum(uint64_t opArg=0) {}; };
 template<typename T>
 struct FuncProd { using EltType = T; __device__ FuncProd(uint64_t opArg=0) {}; };
 template<typename T>
-struct FuncMin  { using EltType = T; __device__ FuncMin(uint64_t opArg=0) {}; };
-template<typename T>
-struct FuncMax  { using EltType = T; __device__ FuncMax(uint64_t opArg=0) {}; };
-
+struct FuncMinMax { // Single reduction functor covering both min and max; opArg selects which one.
+  using EltType = T;
+  BytePack<sizeof(T)> xormask; // only used by integers: XORed into both operands before they are compared
+  bool isMinNotMax; // only used by floats: true selects the min function, false selects the max function
+  __device__ FuncMinMax(uint64_t opArg=0) {
+    xormask.native = opArg; // integer path: opArg is stored as the XOR mask
+    isMinNotMax = (opArg&1)==0; // float path: low bit of opArg clear => min, set => max
+  }
+};
 template<typename T> struct FuncPreMulSum;
 template<typename T> struct FuncSumPostDiv;

@@ -127,8 +145,8 @@ struct Apply_Reduce {

 // Base case definitions (EltPerPack == 1)
 template<typename T>
-struct Apply_Reduce<FuncNull<T>, /*EltPerPack=*/1> {
-  __device__ static BytePack<sizeof(T)> reduce(FuncSum<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
+struct Apply_Reduce<FuncCopy<T>, /*EltPerPack=*/1> {
+  __device__ static BytePack<sizeof(T)> reduce(FuncCopy<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
    return a;
  }
 };
@@ -145,15 +163,9 @@ struct Apply_Reduce<FuncProd<T>, /*EltPerPack=*/1> {
  }
 };
 template<typename T>
-struct Apply_Reduce<FuncMin<T>, /*EltPerPack=*/1> {
-  __device__ static BytePack<sizeof(T)> reduce(FuncMin<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
-    return toPack<T>(min(fromPack<T>(a), fromPack<T>(b)));
-  }
-};
-template<typename T>
-struct Apply_Reduce<FuncMax<T>, /*EltPerPack=*/1> {
-  __device__ static BytePack<sizeof(T)> reduce(FuncMax<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
-    return toPack<T>(max(fromPack<T>(a), fromPack<T>(b)));
+struct Apply_Reduce<FuncMinMax<T>, /*EltPerPack=*/1> { // Generic one-element min/max working on the packs' native values.
+  __device__ static BytePack<sizeof(T)> reduce(FuncMinMax<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
+    return (a.native ^ fn.xormask.native) < (b.native ^ fn.xormask.native) ? a : b; // a is returned when (a ^ xormask) < (b ^ xormask), otherwise b
  }
 };

@@ -161,57 +173,55 @@ struct Apply_Reduce<FuncMax<T>, /*EltPerPack=*/1> {
 template<>
 struct Apply_Reduce<FuncSum<uint8_t>, /*EltPerPack=*/4> {
  __device__ static BytePack<4> reduce(FuncSum<uint8_t> fn, BytePack<4> a, BytePack<4> b) {
-    constexpr uint32_t lo = 0x00ff00ff;
-    constexpr uint32_t hi = ~lo;
-    uint32_t x = a.u32;
-    uint32_t y = b.u32;
-    a.u32 = (((x&lo) + (y&lo))&lo) + (((x&hi) + (y&hi))&hi);
+    constexpr uint32_t even = 0x00ff00ffu; // mask selecting bytes 0 and 2
+    uint32_t x = (a.native &  even) + (b.native &  even); // sums of bytes 0 and 2; carries spill into bytes 1 and 3 and are dropped below
+    uint32_t y = (a.native & ~even) + (b.native & ~even); // sums of bytes 1 and 3; byte 1's carry spills into byte 2 and is dropped below
+    // Equivalent to: a.native = (x & even) | (y & ~even);
+    a.native = __byte_perm(x, y, 0x7250); // result bytes 0,2 come from x and bytes 1,3 come from y
    return a;
  }
 };
+
 template<>
-struct Apply_Reduce<FuncSum<int8_t>, /*EltPerPack=*/4> {
-  __device__ static BytePack<4> reduce(FuncSum<int8_t> fn, BytePack<4> a, BytePack<4> b) {
-    return Apply_Reduce<FuncSum<uint8_t>, 4>::reduce(FuncSum<uint8_t>(), a, b);
+struct Apply_Reduce<FuncMinMax<uint8_t>, /*EltPerPack=*/4> { // Four packed bytes reduced at once with plain 32-bit integer operations.
+  __device__ static BytePack<4> reduce(FuncMinMax<uint8_t> fn, BytePack<4> a, BytePack<4> b) {
+    constexpr uint32_t ones = 0x01010101u; // 0x01 in every byte
+    constexpr uint32_t even = 0x00ff00ffu; // even byte mask (bytes 0 and 2)
+    // Replicate the one-byte xormask into all four bytes
+    uint32_t x = fn.xormask.native * ones;
+    // Transform both inputs by the replicated xormask
+    uint32_t ax = a.native ^ x;
+    uint32_t bx = b.native ^ x;
+    // Use 9-bit arithmetic to compute d=a-b, each byte widened into its own 16-bit lane:
+    // every lane holds ax + (255 - bx) + 257, so bit 8 of a lane is set exactly when ax < bx.
+    uint32_t d0 = (ax    & even) + (~bx      & even) + ones; // lanes for bytes 0 and 2
+    uint32_t d1 = (ax>>8 & even) + (~(bx>>8) & even) + ones; // lanes for bytes 1 and 3
+    // Move the sign bit (bit 8) of each 9-bit delta into the least bit of its origin byte.
+    // Equivalent to: uint32_t s = (d0>>8 & ones & even) | (d1 & ones & ~even);
+    uint32_t s = __byte_perm(d0, d1, 0x7351) & ones;
+    // Broadcast the least bit across the whole byte (0x01 -> 0xff, 0x00 -> 0x00)
+    s *= 0xffu;
+    // Compose the result by selecting bytes via: signbit(a-b)==1 ? a : b
+    a.native = (a.native & s) | (b.native & ~s);
+    return a;
  }
 };

-#if 300 <= __CUDA_ARCH__ && __CUDA_ARCH__ < 500
-  template<>
-  struct Apply_Reduce<FuncMin<uint8_t>, /*EltPerPack=*/4> {
-    __device__ static BytePack<4> reduce(FuncMin<uint8_t> fn, BytePack<4> a, BytePack<4> b) {
-      uint32_t z=0;
-      asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z));
-      return a;
-    }
-  };
-  template<>
-  struct Apply_Reduce<FuncMin<int8_t>, /*EltPerPack=*/4> {
-    __device__ static BytePack<4> reduce(FuncMin<int8_t> fn, BytePack<4> a, BytePack<4> b) {
-      int32_t z=0;
-      asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z));
-      return a;
-    }
-  };
-  template<>
-  struct Apply_Reduce<FuncMax<uint8_t>, /*EltPerPack=*/4> {
-    __device__ static BytePack<4> reduce(FuncMax<uint8_t> fn, BytePack<4> a, BytePack<4> b) {
-      uint32_t z=0;
-      asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z));
-      return a;
-    }
-  };
-  template<>
-  struct Apply_Reduce<FuncMax<int8_t>, /*EltPerPack=*/4> {
-    __device__ static BytePack<4> reduce(FuncMax<int8_t> fn, BytePack<4> a, BytePack<4> b) {
-      int32_t z=0;
-      asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z));
-      return a;
-    }
-  };
-#endif
+template<>
+struct Apply_Reduce<FuncProd<uint8_t>, /*EltPerPack=*/4> { // Byte-wise product of four packed uint8 values, keeping the low 8 bits of each product.
+  __device__ static BytePack<4> reduce(FuncProd<uint8_t> fn, BytePack<4> apack, BytePack<4> bpack) {
+    uint32_t a = apack.native;
+    uint32_t b = bpack.native;
+    uint32_t ab0 = (a*b) & 0xffu; // byte 0 of ab0: low 8 bits of a0*b0
+    asm("mad.lo.u32 %0, %1, %2, %0;" : "+r"(ab0) : "r"(a&0xff00u), "r"(b&0xff00u)); // adds (a1<<8)*(b1<<8) = a1*b1<<16, so byte 2 of ab0 holds the low byte of a1*b1
+    uint32_t ab1;
+    asm("mul.hi.u32 %0, %1, %2;"     : "=r"(ab1) : "r"(a&0xff0000), "r"(b&0xff0000)); // upper 32 bits of (a2<<16)*(b2<<16) = a2*b2, which fits in 16 bits; byte 0 of ab1 is its low byte
+    asm("mad.hi.u32 %0, %1, %2, %0;" : "+r"(ab1) : "r"(a&0xff000000u), "r"(b&0xff000000u)); // adds upper 32 bits of (a3<<24)*(b3<<24) = a3*b3<<16; byte 2 of ab1 is the low byte of a3*b3
+    apack.native = __byte_perm(ab0, ab1, 0x6420); // result bytes 0..3 = ab0 byte 0, ab0 byte 2, ab1 byte 0, ab1 byte 2
+    return apack;
+  }
+};

-#define SPECIALIZE_REDUCE(Fn, T, EltPerPack, Vec, expr_of_x_y) \
+#define SPECIALIZE_REDUCE(Fn, T, EltPerPack, Vec, expr_of_fn_x_y) \
  template<> \
  struct Apply_Reduce<Fn<T>, EltPerPack> { \
    __device__ __forceinline__ static BytePack<sizeof(Vec)> reduce( \
@@ -219,10 +229,13 @@ struct Apply_Reduce<FuncSum<int8_t>, /*EltPerPack=*/4> {
      ) { \
      Vec x = fromPack<Vec>(a); \
      Vec y = fromPack<Vec>(b); \
-      return toPack<Vec>(expr_of_x_y); \
+      return toPack<Vec>(expr_of_fn_x_y); \
    } \
  };

+SPECIALIZE_REDUCE(FuncMinMax, float, 1, float, fn.isMinNotMax ? fminf(x, y) : fmaxf(x, y))
+SPECIALIZE_REDUCE(FuncMinMax, double, 1, double, fn.isMinNotMax ? fmin(x, y) : fmax(x, y))
+
 #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
  SPECIALIZE_REDUCE(FuncSum, half, 1, half, __hadd(x, y))
  SPECIALIZE_REDUCE(FuncSum, half, 2, half2, __hadd2(x, y))
@@ -234,13 +247,10 @@ struct Apply_Reduce<FuncSum<int8_t>, /*EltPerPack=*/4> {
 #endif

 #if __CUDA_ARCH__ >= 800
-  SPECIALIZE_REDUCE(FuncMin, half, 1, half, __hmin(x, y))
-  SPECIALIZE_REDUCE(FuncMin, half, 2, half2, __hmin2(x, y))
-  SPECIALIZE_REDUCE(FuncMax, half, 1, half, __hmax(x, y))
-  SPECIALIZE_REDUCE(FuncMax, half, 2, half2, __hmax2(x, y))
+  SPECIALIZE_REDUCE(FuncMinMax, half, 1, half, fn.isMinNotMax ? __hmin(x, y) : __hmax(x, y))
+  SPECIALIZE_REDUCE(FuncMinMax, half, 2, half2, fn.isMinNotMax ? __hmin2(x, y) : __hmax2(x, y))
 #else
-  SPECIALIZE_REDUCE(FuncMin, half, 1, half, __float2half(fminf(__half2float(x), __half2float(y))))
-  SPECIALIZE_REDUCE(FuncMax, half, 1, half, __float2half(fmaxf(__half2float(x), __half2float(y))))
+  SPECIALIZE_REDUCE(FuncMinMax, half, 1, half, __float2half(fn.isMinNotMax ? fminf(__half2float(x), __half2float(y)) : fmaxf(__half2float(x), __half2float(y))))
 #endif

 #if defined(__CUDA_BF16_TYPES_EXIST__)
@@ -249,15 +259,12 @@ struct Apply_Reduce<FuncSum<int8_t>, /*EltPerPack=*/4> {
  SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 2, __nv_bfloat162, __hadd2(x, y))
  SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 1, __nv_bfloat16, __hmul(x, y))
  SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 2, __nv_bfloat162, __hmul2(x, y))
-  SPECIALIZE_REDUCE(FuncMin, __nv_bfloat16, 1, __nv_bfloat16, __hmin(x, y))
-  SPECIALIZE_REDUCE(FuncMin, __nv_bfloat16, 2, __nv_bfloat162, __hmin2(x, y))
-  SPECIALIZE_REDUCE(FuncMax, __nv_bfloat16, 1, __nv_bfloat16, __hmax(x, y))
-  SPECIALIZE_REDUCE(FuncMax, __nv_bfloat16, 2, __nv_bfloat162, __hmax2(x, y))
+  SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 1, __nv_bfloat16, fn.isMinNotMax ? __hmin(x, y) : __hmax(x, y))
+  SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 2, __nv_bfloat162, fn.isMinNotMax ? __hmin2(x, y) : __hmax2(x, y))
 #else
  SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(__bfloat162float(x) + __bfloat162float(y)))
  SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(__bfloat162float(x) * __bfloat162float(y)))
-  SPECIALIZE_REDUCE(FuncMin, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(fminf(__bfloat162float(x), __bfloat162float(y))))
-  SPECIALIZE_REDUCE(FuncMax, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(fmaxf(__bfloat162float(x), __bfloat162float(y))))
+  SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(fn.isMinNotMax ? fminf(__bfloat162float(x), __bfloat162float(y)) : fmaxf(__bfloat162float(x), __bfloat162float(y))))
 #endif
 #endif

@@ -479,19 +486,6 @@ struct Apply_PreOp<FuncPreMulSum<half>, /*EltPerPack=*/1> {
 ////////////////////////////////////////////////////////////////////////////////
 // FuncSumPostDiv

-template<typename T>
-struct IsFloatingPoint: std::false_type {};
-template<>
-struct IsFloatingPoint<half>: std::true_type {};
-#if defined(__CUDA_BF16_TYPES_EXIST__)
-template<>
-struct IsFloatingPoint<__nv_bfloat16>: std::true_type {};
-#endif
-template<>
-struct IsFloatingPoint<float>: std::true_type {};
-template<>
-struct IsFloatingPoint<double>: std::true_type {};
-
 template<typename T, bool IsFloating=IsFloatingPoint<T>::value>
 struct FuncSumPostDiv_IntOnly;

@@ -543,25 +537,44 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
 #define SIZEOF_BytePack_field_u64 8
 #define PTX_REG_BytePack_field_u64 "l"

-#define DEFINE_Apply_LoadMultimem(Fn, T, op, ptx_ty, pack_field) \
+#define DEFINE_Apply_LoadMultimem_sum(T, ptx_ty, pack_field) \
  template<> \
-  struct Apply_LoadMultimem<Fn<T>, SIZEOF_BytePack_field_##pack_field> { \
+  struct Apply_LoadMultimem<FuncSum<T>, SIZEOF_BytePack_field_##pack_field> { \
    static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \
-    __device__ static BytePack<PackSize> load(Fn<T> fn, uintptr_t addr) { \
+    __device__ static BytePack<PackSize> load(FuncSum<T> fn, uintptr_t addr) { \
      BytePack<PackSize> ans; \
-      asm("multimem.ld_reduce.relaxed.sys.global." #op "." #ptx_ty " %0, [%1];" \
+      asm("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
        : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
        : "l"(addr)); \
      return ans; \
    } \
  };
-#define DEFINE_Apply_LoadMultimem_v4(Fn, T, op, ptx_ty, pack_field) \
+#define DEFINE_Apply_LoadMultimem_minmax(T, ptx_ty, pack_field) \
  template<> \
-  struct Apply_LoadMultimem<Fn<T>, 4*(SIZEOF_BytePack_field_##pack_field)> { \
-    static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \
-    __device__ static BytePack<PackSize> load(Fn<T> fn, uintptr_t addr) { \
+  struct Apply_LoadMultimem<FuncMinMax<T>, SIZEOF_BytePack_field_##pack_field> { \
+    static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \
+    __device__ static BytePack<PackSize> load(FuncMinMax<T> fn, uintptr_t addr) { \
      BytePack<PackSize> ans; \
-      asm("multimem.ld_reduce.relaxed.sys.global." #op ".v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
+      if (fn.isMinNotMax) { \
+        asm("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
+          : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
+          : "l"(addr)); \
+      } else { \
+        asm("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
+          : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
+          : "l"(addr)); \
+      } \
+      return ans; \
+    } \
+  };
+
+#define DEFINE_Apply_LoadMultimem_sum_v4(T, ptx_ty, pack_field) \
+  template<> \
+  struct Apply_LoadMultimem<FuncSum<T>, 4*(SIZEOF_BytePack_field_##pack_field)> { \
+    static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \
+    __device__ static BytePack<PackSize> load(FuncSum<T> fn, uintptr_t addr) { \
+      BytePack<PackSize> ans; \
+      asm("multimem.ld_reduce.relaxed.sys.global.add.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
        : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
          "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
          "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
@@ -570,18 +583,61 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
      return ans; \
    } \
  };
-#define DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(Fn, T, op, ptx_ty, pack_field) \
-  DEFINE_Apply_LoadMultimem_v4(Fn, T, op, ptx_ty, pack_field) \
+#define DEFINE_Apply_LoadMultimem_minmax_v4(T, ptx_ty, pack_field) /* min/max multimem load, four pack_fields wide (.v4) */ \
   template<> \
-  struct Apply_LoadMultimem<Fn<T>, sizeof(T)> { \
-    __device__ static BytePack<sizeof(T)> load(Fn<T> fn, uintptr_t addr) { \
+  struct Apply_LoadMultimem<FuncMinMax<T>, 4*(SIZEOF_BytePack_field_##pack_field)> { /* specialization for FuncMinMax<T> */ \
+    static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); /* bytes returned per load */ \
+    __device__ static BytePack<PackSize> load(FuncMinMax<T> fn, uintptr_t addr) { \
+      BytePack<PackSize> ans; \
+      if (fn.isMinNotMax) { /* the functor decides min vs. max at run time */ \
+        asm("multimem.ld_reduce.relaxed.sys.global.min.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
+          : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
+            "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
+            "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
+            "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \
+          : "l"(addr)); \
+      } else { /* max */ \
+        asm("multimem.ld_reduce.relaxed.sys.global.max.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
+          : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
+            "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
+            "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
+            "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \
+          : "l"(addr)); \
+      } \
+      return ans; \
+    } \
+  };
+
+#define DEFINE_Apply_LoadMultimem_sum_v4x2_and_subhalf(T, ptx_ty, pack_field) /* sum loads for a type packed two per pack_field */ \
+  DEFINE_Apply_LoadMultimem_sum_v4(T, ptx_ty, pack_field) /* the 4-wide variant */ \
+  template<> \
+  struct Apply_LoadMultimem<FuncSum<T>, sizeof(T)> { /* single-element load: reduce the enclosing pair, return one half */ \
+    __device__ static BytePack<sizeof(T)> load(FuncSum<T> fn, uintptr_t addr) { \
       BytePack<2*sizeof(T)> tmp; \
-      asm("multimem.ld_reduce.relaxed.sys.global." #op "." #ptx_ty " %0, [%1];" \
+      asm("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
         : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
-        : "l"(addr & -uintptr_t(sizeof(T)))); \
+        : "l"(addr & -uintptr_t(2*sizeof(T)))); /* round down to the pair boundary: the load is 2*sizeof(T) bytes wide */ \
       return tmp.half[(addr/sizeof(T))%2]; \
     } \
   };
+#define DEFINE_Apply_LoadMultimem_minmax_v4x2_and_subhalf(T, ptx_ty, pack_field) /* min/max loads for a type packed two per pack_field */ \
+  DEFINE_Apply_LoadMultimem_minmax_v4(T, ptx_ty, pack_field) /* the 4-wide variant */ \
+  template<> \
+  struct Apply_LoadMultimem<FuncMinMax<T>, sizeof(T)> { /* single-element load: reduce the enclosing pair, return one half */ \
+    __device__ static BytePack<sizeof(T)> load(FuncMinMax<T> fn, uintptr_t addr) { \
+      BytePack<2*sizeof(T)> tmp; /* holds both elements of the pair */ \
+      if (fn.isMinNotMax) { /* the functor decides min vs. max at run time */ \
+        asm("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
+          : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
+          : "l"(addr & -uintptr_t(2*sizeof(T)))); /* round down to the pair boundary: the load is 2*sizeof(T) bytes wide */ \
+      } else { /* max */ \
+        asm("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
+          : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
+          : "l"(addr & -uintptr_t(2*sizeof(T)))); /* same pair-boundary rounding as the min branch */ \
+      } \
+      return tmp.half[(addr/sizeof(T))%2]; /* select the element addr referred to within the pair */ \
+    } \
+  };

 template<typename Fn, int BytePerPack>
 struct Apply_LoadMultimem {
@@ -598,46 +654,39 @@ struct Apply_LoadMultimem {
    static constexpr bool IsSum = std::is_same<Fn, FuncSum<T>>::value ||
                                  std::is_same<Fn, FuncPreMulSum<T>>::value ||
                                  std::is_same<Fn, FuncSumPostDiv<T>>::value;
-    static constexpr bool IsMinOrMax = std::is_same<Fn, FuncMin<T>>::value ||
-                                       std::is_same<Fn, FuncMax<T>>::value;
+    static constexpr bool IsMinMax = std::is_same<Fn, FuncMinMax<T>>::value;
    static constexpr bool IsFloat = IsFloatingPoint<T>::value;
    static constexpr int BigPackSize =
      IsFloat && IsSum && sizeof(T) < 8 ? 16 :
      IsFloat && IsSum ? 8 :
-      IsFloat && IsMinOrMax && sizeof(T)==2 ? 16 :
-      !IsFloat && (IsSum||IsMinOrMax) && sizeof(T)>=4 ? sizeof(T) :
+      IsFloat && IsMinMax && sizeof(T)==2 ? 16 :
+      !IsFloat && (IsSum||IsMinMax) && sizeof(T)>=4 ? sizeof(T) :
      /*multimem.ld_reduce not supported:*/ 0;
  };

-  DEFINE_Apply_LoadMultimem(FuncSum, uint32_t, add, u32, u32)
-  DEFINE_Apply_LoadMultimem(FuncMin, uint32_t, min, u32, u32)
-  DEFINE_Apply_LoadMultimem(FuncMax, uint32_t, max, u32, u32)
+  DEFINE_Apply_LoadMultimem_sum(uint32_t, u32, u32)
+  DEFINE_Apply_LoadMultimem_minmax(uint32_t, u32, u32)

-  DEFINE_Apply_LoadMultimem(FuncSum, int32_t, add, s32, u32)
-  DEFINE_Apply_LoadMultimem(FuncMin, int32_t, min, s32, u32)
-  DEFINE_Apply_LoadMultimem(FuncMax, int32_t, max, s32, u32)
+  DEFINE_Apply_LoadMultimem_sum(int32_t, s32, u32)
+  DEFINE_Apply_LoadMultimem_minmax(int32_t, s32, u32)

-  DEFINE_Apply_LoadMultimem(FuncSum, uint64_t, add, u64, u64)
-  DEFINE_Apply_LoadMultimem(FuncMin, uint64_t, min, u64, u64)
-  DEFINE_Apply_LoadMultimem(FuncMax, uint64_t, max, u64, u64)
+  DEFINE_Apply_LoadMultimem_sum(uint64_t, u64, u64)
+  DEFINE_Apply_LoadMultimem_minmax(uint64_t, u64, u64)

-  DEFINE_Apply_LoadMultimem(FuncSum, int64_t, add, u64, u64)
-  DEFINE_Apply_LoadMultimem(FuncMin, int64_t, min, s64, u64)
-  DEFINE_Apply_LoadMultimem(FuncMax, int64_t, max, s64, u64)
+  DEFINE_Apply_LoadMultimem_sum(int64_t, u64, u64)
+  DEFINE_Apply_LoadMultimem_minmax(int64_t, s64, u64)

-  DEFINE_Apply_LoadMultimem(FuncSum, float, add, f32, u32)
-  DEFINE_Apply_LoadMultimem_v4(FuncSum, float, add, f32, u32)
+  DEFINE_Apply_LoadMultimem_sum(float, f32, u32)
+  DEFINE_Apply_LoadMultimem_sum_v4(float, f32, u32)

-  DEFINE_Apply_LoadMultimem(FuncSum, double, add, f64, u64)
+  DEFINE_Apply_LoadMultimem_sum(double, f64, u64)

-  DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncSum, half, add, f16x2, u32)
-  DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMin, half, min, f16x2, u32)
-  DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMax, half, max, f16x2, u32)
+  DEFINE_Apply_LoadMultimem_sum_v4x2_and_subhalf(half, f16x2, u32)
+  DEFINE_Apply_LoadMultimem_minmax_v4x2_and_subhalf(half, f16x2, u32)

  #if defined(__CUDA_BF16_TYPES_EXIST__)
-    DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncSum, __nv_bfloat16, add, bf16x2, u32)
-    DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMin, __nv_bfloat16, min, bf16x2, u32)
-    DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMax, __nv_bfloat16, max, bf16x2, u32)
+    DEFINE_Apply_LoadMultimem_sum_v4x2_and_subhalf(__nv_bfloat16, bf16x2, u32)
+    DEFINE_Apply_LoadMultimem_minmax_v4x2_and_subhalf(__nv_bfloat16, bf16x2, u32)
  #endif
 #else
  template<typename Fn>
@@ -4,7 +4,7 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "devcomm.h"
+#include "device.h"
 #include "collectives.h"
 #include "primitives.h"

@@ -98,33 +98,69 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
    const ssize_t chunkSize = int(args->lastChunkSize);
    const ssize_t size = args->count;
    const ssize_t loopSize = nChannels*chunkSize;
+    const int rank = ncclShmem.comm.rank;
+    const int nranks = ncclShmem.comm.nRanks;

-    const int nThreadsScatter = 128 + WARP_SIZE;
-    const int nThreadsReduce = 384;
+    /* if we are direct NVLS, we only need to allocate 1 warp to scatter for sync; 
+     * if not, based on #ranks, we allocate 7 or 5 warps to reduce to saturate bandwidth
+     * and the rest are allocated to scatter. */
+    const int nThreadsReduce = args->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : (nranks <= 6 ? 7 * WARP_SIZE : 5 * WARP_SIZE);
+    const int nThreadsScatter = args->regUsed ? WARP_SIZE : (NCCL_MAX_NTHREADS - nThreadsReduce);
    const int tidEndScatter = nThreadsScatter;
    const int tidEndReduce = tidEndScatter + nThreadsReduce;

-    using Proto = ProtoSimple<1, 1>;
-
-    if (tid < tidEndScatter) {
-      // Scatter
-      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
-        prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
-            args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0);
-      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + bid*chunkSize;
-        int nelem = min(chunkSize, size-offset);
-        prims.scatter(offset, nvls->nHeads*size, nelem, size, -1, 0);
+    if (!args->regUsed) {
+      if (tid < tidEndScatter) {
+        // Scatter
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
+        Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
+          prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
+            args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          ssize_t offset = gridOffset + bid * chunkSize;
+          int nelem = min(chunkSize, size - offset);
+          prims.scatter(offset, nvls->nHeads * size, nelem, size, -1, 0);
+        }
+      } else if (tid < tidEndReduce) {
+        // Reduce through NVLS
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
+        Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
+          prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff,
+            args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          ssize_t offset = gridOffset + bid * chunkSize;
+          int nelem = min(chunkSize, size - offset);
+          prims.recv(offset, nelem);
+        }
      }
-    } else if (tid < tidEndReduce) {
-      // Reduce through NVLS
-      Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
-        prims(tid-tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff,
-           args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
-      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-        ssize_t offset = gridOffset + bid*chunkSize;
-        int nelem = min(chunkSize, size-offset);
-        prims.recv(offset, nelem);
+    } else {
+      if (tid < tidEndScatter) {
+        // Scatter
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
+        Primitives<T, RedOp, FanSymmetric<NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
+          prims(tid, nThreadsScatter, nvls->up, nvls->up, NULL, NULL,
+            args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          prims.scatter(0, 0, 0, 0, -1, 0);
+        }
+
+        /* gather used as sync */
+        prims.gather(0, 0, 0, 0, -1, 0);
+      } else if (tid < tidEndReduce) {
+        // Reduce through NVLS
+        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
+        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
+          prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->down, NULL, args->recvbuff,
+            args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
+        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+          ssize_t outOffset = gridOffset + bid * chunkSize;
+          ssize_t inpOffset = outOffset + rank * size;
+          int nelem = min(chunkSize, size - outOffset);
+          prims.directRecvCopy(inpOffset, outOffset, nelem);
+        }
+
+        /* send for sync */
+        prims.send(0, 0);
      }
    }
  }
@@ -4,7 +4,7 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "devcomm.h"
+#include "device.h"
 #include "collectives.h"
 #include "primitives.h"

@@ -26,7 +26,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
      if (args->proto == NCCL_PROTO_LL) chunkSize /= 2;
      int const peer = args->peer;
      Primitives<T, RedOp, FanAsymmetric<0, 1>, 1, Proto, 1> prims
-        (tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, 1, 1);
+        (tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, ncclShmem.comm.p2pChunkSize/sizeof(T));
      size_t offset = 0;
      do {
        int nelem = min(size_t(chunkSize), count-offset);
@@ -45,7 +45,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
      if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; // This is to account for chunkEffectiveSize
      int const peer = args->peer;
      Primitives<T, RedOp, FanAsymmetric<1, 0>, 1, Proto, 1> prims
-        (tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, 1, 1);
+        (tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, ncclShmem.comm.p2pChunkSize/sizeof(T));
      size_t offset = 0;
      do {
        int nelem = min(size_t(chunkSize), count-offset);
@@ -11,83 +11,16 @@
 #include "bootstrap.h"
 #include "channel.h"
 #include "cudawrap.h"
+#include "transport.h"

 #include <cstring> // std::memcpy
 #include <cinttypes> // PRIx64

-static void* const ncclKernelGeneric = (void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t);
-
-struct ncclKernelMatch {
-  void* kernelFn;
-  bool specialized;
-};
-
-// Only generate inline kernels for LL
-#define NCCL_FUNC5(func, algo, devredop, dtype, specialized) \
-  /*LL    */{(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), true && specialized}, \
-  /*LL128 */{(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), false && specialized}, \
-  /*SIMPLE*/{(void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), false && specialized}
-
-#define NCCL_FUNC4(func, devredop, type, specialized) \
-  NCCL_FUNC5(func, TREE,           devredop, type, specialized), \
-  NCCL_FUNC5(func, RING,           devredop, type, specialized), \
-  NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, specialized), \
-  NCCL_FUNC5(func, COLLNET_CHAIN,  devredop, type, specialized), \
-  NCCL_FUNC5(func, NVLS,           devredop, type, specialized), \
-  NCCL_FUNC5(func, NVLS_TREE,      devredop, type, specialized)
-
-#ifdef __CUDA_BF16_TYPES_EXIST__
-  #define HAVE_BFLOAT16 1
-#else
-  #define HAVE_BFLOAT16 0
-#endif
-
-// Must be consistent with ncclDataType_t
-#define NCCL_FUNCS3(func, devredop, reduction, specialized) \
-  NCCL_FUNC4(func, devredop, MACRO_IF(reduction, int8_t, int8_t), specialized), \
-  NCCL_FUNC4(func, devredop, MACRO_IF(reduction, uint8_t, int8_t), specialized), \
-  NCCL_FUNC4(func, devredop, MACRO_IF(reduction, int32_t, int8_t), specialized), \
-  NCCL_FUNC4(func, devredop, MACRO_IF(reduction, uint32_t, int8_t), specialized), \
-  NCCL_FUNC4(func, devredop, MACRO_IF(reduction, int64_t, int8_t), specialized), \
-  NCCL_FUNC4(func, devredop, MACRO_IF(reduction, uint64_t, int8_t), specialized), \
-  NCCL_FUNC4(func, devredop, MACRO_IF(reduction, half, int8_t), specialized), \
-  NCCL_FUNC4(func, devredop, MACRO_IF(reduction, float, int8_t), specialized), \
-  NCCL_FUNC4(func, devredop, MACRO_IF(reduction, double, int8_t), specialized) \
-  MACRO_IF(HAVE_BFLOAT16, \
-    SINGLE_ARG(, NCCL_FUNC4(func, devredop, MACRO_IF(reduction, __nv_bfloat16, int8_t), specialized)), \
-    /*nothing*/ \
-  )
-
-// Must be consistent with ncclDevRedOp_t -- but we only generate kernel for sums.
-#define NCCL_FUNCS2(func, reduction) \
-  NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/1), /*Sum*/ \
-  NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*Prod*/ \
-  NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*Max*/ \
-  NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*Min*/ \
-  NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0), /*PreMulSum*/ \
-  NCCL_FUNCS3(func, Sum, reduction, /*specialized=*/0)  /*SumPostDiv*/
-
-// Must be consistent with the ncclFuncSet enum
-static const ncclKernelMatch ncclKerns[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
-  {(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), true},
-  // We don't bake special kernels for the one-rank reductions
-  {/*int8*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
-  {/*uint8*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
-  {/*int32*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
-  {/*uint32*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
-  {/*int64*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
-  {/*uint64*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
-  {/*half*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
-  {/*float*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
-  {/*double*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
-  #if HAVE_BFLOAT16
-    {/*bfloat16*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), false},
-  #endif
-  NCCL_FUNCS2(Broadcast, /*reduction=*/0),
-  NCCL_FUNCS2(Reduce, /*reduction=*/1),
-  NCCL_FUNCS2(AllGather, /*reduction=*/0),
-  NCCL_FUNCS2(ReduceScatter, /*reduction=*/1),
-  NCCL_FUNCS2(AllReduce, /*reduction=*/1)
+enum ncclRegBufferType { // which user-buffer registration scheme, if any, applies to a collective
+  NCCL_REGULAR_BUFFER = 0, // no registration; the plain work element is enqueued
+  NCCL_IPC_REG_BUFFER = 1, // set by the CollNetDirect path that exchanges CUDA IPC handles
+  NCCL_NVLS_REG_BUFFER = 2, // set by the NVLS local/graph registration path
+  NCCL_REG_BUFFER_NUM = 3 // count of the kinds above; not itself a valid kind
 };

 static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */);
@@ -96,19 +29,14 @@ NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0);

 // Returns maximum kernel stack size of all CUDA kernels
 ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) {
-  constexpr int KernelCount = sizeof(ncclKerns)/sizeof(ncclKerns[0]);
  ncclResult_t result = ncclSuccess;

  if (maxStackSize) *maxStackSize = 0;
  int carveout = ncclParamL1SharedMemoryCarveout();

-  // Keep track if we already visited a function pointer.
-  void* lru[2] = {nullptr, nullptr};
-  for (int i=0; i < KernelCount; i++) {
-    void* fn = ncclKerns[i].kernelFn;
-    if (fn == lru[0] || fn == lru[1]) goto next_kernel;
-    lru[1] = lru[0];
-    lru[0] = fn;
+  for (int k=0; k < ncclDevKernelCount; k++) {
+    void* fn = ncclDevKernelList[k];
+    if (fn == nullptr) continue;

    if (maxStackSize) {
      cudaFuncAttributes attr = {0};
@@ -116,14 +44,12 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) {
      if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes;
    ignore0:;
    }
-
    if (carveout) {
      CUDACHECKGOTO(cudaFuncSetAttribute(fn,
        cudaFuncAttributePreferredSharedMemoryCarveout, carveout),
        result, ignore1);
    ignore1:;
    }
-
    if (ncclShmemDynamicSize(cudaArch) != 0) {
      CUDACHECKGOTO(cudaFuncSetAttribute(fn,
        cudaFuncAttributeMaxDynamicSharedMemorySize, ncclShmemDynamicSize(cudaArch)),
@@ -218,7 +144,7 @@ static void appendWorkElemP2p(
    struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId,
    struct ncclWorkElemP2p const *elem, bool fuseOk
  ) {
-  constexpr int funcIndex = FUNC_INDEX_P2P;
+  int funcIndex = ncclDevFuncId_P2p();
  struct ncclKernelPlan::Channel* chan = &plan->channels[channelId];
  struct ncclWorkList* q = ncclIntruQueueTail(&chan->workQueue);
  if (q && funcIndex == q->work.header.funcIndex) {
@@ -240,7 +166,7 @@ static void appendWorkElemP2p(
  }
  q = ncclMemoryStackAlloc<struct ncclWorkList>(&comm->memScoped);
  q->work.header.type = ncclWorkTypeP2p;
-  q->work.header.funcIndex = FUNC_INDEX_P2P;
+  q->work.header.funcIndex = ncclDevFuncId_P2p();
  chan->p2pTailElem[ncclWorkP2pTypeRecv-1] = 0;
  chan->p2pTailElem[ncclWorkP2pTypeSend-1] = 1;
  q->work.p2pElems[chan->p2pTailElem[elem->p2pType-1]] = *elem; // C++ struct assignment
@@ -265,7 +191,7 @@ static ncclResult_t addProxyOpIfNeeded(struct ncclComm* comm, struct ncclKernelP
 static ncclResult_t addCollToPlan(
    struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget, int funcIndex,
    struct ncclWorkElem const* workElem, struct ncclProxyOp const* proxyOp,
-    int nCollChannels, int nBid, size_t bytes, bool regBufUsed, void* regBufSend[], void* regBufRecv[]
+    int nCollChannels, int nBid, size_t bytes, ncclRegBufferType regBufType, void* regBufSend[], void* regBufRecv[]
  ) {
  struct ncclKernelPlan::Channel *chans = plan->channels;

@@ -307,10 +233,9 @@ static ncclResult_t addCollToPlan(

    // Add work elem
    *nWorkBudget += chans[c].nWork;
-    if (!regBufUsed) {
+    if (regBufType == NCCL_REGULAR_BUFFER) {
      appendWorkElemColl(comm, plan, c, funcIndex, workElem, bid);
-    } else {
-      // Buffer registration in play which could only for CollNet at the moment.
+    } else if (regBufType == NCCL_IPC_REG_BUFFER) {
      struct ncclChannel* channel = &comm->channels[c];
      struct ncclWorkElemReg workElemReg;
      workElemReg.elem = *workElem; // C++ struct assignment
@@ -330,6 +255,18 @@ static ncclResult_t addCollToPlan(
        workElemReg.upOutputs[i] = regBufRecv[j];
      }
      appendWorkElemColl(comm, plan, c, funcIndex, &workElemReg, bid);
+    } else if (regBufType == NCCL_NVLS_REG_BUFFER) {
+      struct ncclWorkElemReg workElemReg;
+      workElemReg.elem = *workElem; // C++ struct assignment
+      workElemReg.elem.regUsed = 1;
+      /* NVLS only has one send and recv buffer registered */
+      workElemReg.dnInputs[0] = regBufSend[0];
+      workElemReg.dnOutputs[0] = regBufRecv[0];
+      appendWorkElemColl(comm, plan, c, funcIndex, &workElemReg, bid);
+    } else {
+      /* impossible value */
+      WARN("Invalid regBufType %d\n", regBufType);
+      return ncclInvalidArgument;
    }
    *nWorkBudget -= chans[c].nWork; // subtract delta of chans[c].nWork

@@ -417,68 +354,118 @@ static void finishPlan(struct ncclKernelPlan* plan) {
  plan->threadPerBlock = std::max(plan->threadPerBlock, 3*WARP_SIZE);
 }

+int64_t ncclParamLocalRegister();
+NCCL_PARAM(GraphRegister, "GRAPH_REGISTER", 1);
+
 static ncclResult_t registerIntraNodeBuffers(
     struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclInfo* info,
-    bool* outRegBufUsed,
     void* outRegBufSend[NCCL_MAX_LOCAL_RANKS],
-    void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS]
+    void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS],
+    ncclRegBufferType *outRegBufType // out: registration scheme applied to this collective
   ) {
-  *outRegBufUsed = false;
   ncclResult_t result = ncclSuccess;

+  *outRegBufType = NCCL_REGULAR_BUFFER; // default; overwritten only when a path below registers the buffers
 #if CUDART_VERSION >= 11030
-  int localRank = comm->localRank;
+  if ((info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) && comm->nvlsRegSupport) { // NVLS registration path
+    bool regBufUsed = false; // set by the registration helpers below
+    const void *sendbuff = info->sendbuff;
+    void *recvbuff = info->recvbuff;
+    cudaPointerAttributes sattr, rattr;
+    bool query = false; // true once sattr/rattr have been fetched
+    
+    if (info->coll == ncclFuncAllGather)
+      sendbuff = NULL; // AllGather: the send buffer is not handed to the registration helpers
+    else if (info->coll == ncclFuncReduceScatter)
+      recvbuff = NULL; // ReduceScatter: the recv buffer is not handed to the registration helpers
 
-  if (CUPFN(cuMemGetAddressRange) == nullptr) return ncclSuccess;
+    /* first try local registration. */
+    if (ncclParamLocalRegister()) {
+      CUDACHECK(cudaPointerGetAttributes(&sattr, info->sendbuff)); // queried on the original pointers, not the NULLed locals
+      CUDACHECK(cudaPointerGetAttributes(&rattr, info->recvbuff));
+      query = true;
+      if (sattr.type == cudaMemoryTypeDevice && rattr.type == cudaMemoryTypeDevice)
+        ncclNvlsLocalRegisterBuffer(comm, sendbuff, recvbuff, info->sendbuffSize, info->recvbuffSize, &regBufUsed, outRegBufSend, outRegBufRecv); // return value not checked; regBufUsed carries the outcome
+    }
 
-  struct HandlePair {
-    cudaIpcMemHandle_t ipc[2]; // {send, recv}
-    size_t offset[2]; // {send, recv}
-  };
-  struct HandlePair handles[NCCL_MAX_LOCAL_RANKS];
+    if (regBufUsed == false && plan->persistent && ncclParamGraphRegister()) { // second attempt: graph registration, persistent plans only
+      if (!query) {
+        CUDACHECK(cudaPointerGetAttributes(&sattr, info->sendbuff));
+        CUDACHECK(cudaPointerGetAttributes(&rattr, info->recvbuff));
+      }
+      if (sattr.type == cudaMemoryTypeDevice && rattr.type == cudaMemoryTypeDevice)
+        ncclNvlsGraphRegisterBuffer(comm, plan, sendbuff, recvbuff, info->sendbuffSize, info->recvbuffSize, &regBufUsed, outRegBufSend, outRegBufRecv); // return value not checked; regBufUsed carries the outcome
+    }
 
-  CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[0], (void*)info->sendbuff), result, fallback);
-  CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[1], (void*)info->recvbuff), result, fallback);
+    if (regBufUsed) {
+      /* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to
+       * saturate bandwidth. */
+      if (info->coll == ncclFuncReduceScatter)
+        info->nChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5)); // 5, clamped to [minCTAs, maxCTAs] with minCTAs winning
+      else
+        info->nChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4)); // 4, clamped the same way
+      *outRegBufType = NCCL_NVLS_REG_BUFFER;
+    }
+  } else if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT &&   // limited to CollNetDirect for now
+    comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other
+    comm->intraRanks < comm->localRanks &&  // only with inter-process & intra-node peers
+    plan->persistent && 0) { // the trailing '&& 0' makes this branch unreachable
+    /* Disable CollnetDirect registration since it does not support cuMem* allocated memory. */
+    int localRank = comm->localRank;
+    cudaPointerAttributes sattr, rattr;
 
-  void *baseSend, *baseRecv;
-  size_t size;
-  CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &size, (CUdeviceptr)info->sendbuff));
-  handles[localRank].offset[0] = (char*)info->sendbuff - (char*)baseSend;
-  CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &size, (CUdeviceptr)info->recvbuff));
-  handles[localRank].offset[1] = (char*)info->recvbuff - (char*)baseRecv;
+    CUDACHECK(cudaPointerGetAttributes(&sattr, info->sendbuff));
+    CUDACHECK(cudaPointerGetAttributes(&rattr, info->recvbuff));
+    if (sattr.type != cudaMemoryTypeDevice || rattr.type != cudaMemoryTypeDevice) return ncclSuccess; // non-device memory: leave unregistered
 
-  NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, handles, sizeof(struct HandlePair)));
+    if (CUPFN(cuMemGetAddressRange) == nullptr) return ncclSuccess; // driver entry point unavailable: leave unregistered
 
-  // Open handles locally
-  for (int i=0; i < comm->localRanks; i++) {
-    if (i == localRank) { // Skip self
-      outRegBufSend[i] = nullptr;
-      outRegBufRecv[i] = nullptr;
-    } else {
-      for (int sr=0; sr < 2; sr++) {
-        // Get base address of mapping
-        void* base;
-        CUDACHECK(cudaIpcOpenMemHandle(&base, handles[i].ipc[sr], cudaIpcMemLazyEnablePeerAccess));
-        // Get real buffer address by adding offset in the mapping
-        (sr==0 ? outRegBufSend : outRegBufRecv)[i] = (char*)base + handles[i].offset[sr];
-        // Enqueue reminder to close memory handle
-        struct ncclPointerList* q = ncclMemoryPoolAlloc<struct ncclPointerList>(&comm->memPool_ncclPointerList, &comm->memPermanent);
-        q->ptr = base;
-        ncclIntruQueueEnqueue(&plan->ipcMemQueue, q);
+    struct HandlePair {
+      cudaIpcMemHandle_t ipc[2]; // {send, recv}
+      size_t offset[2]; // {send, recv}
+    };
+    struct HandlePair handles[NCCL_MAX_LOCAL_RANKS];
+
+    CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[0], (void*)info->sendbuff), result, fallback);
+    CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[1], (void*)info->recvbuff), result, fallback);
+
+    void *baseSend, *baseRecv;
+    size_t size;
+    CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &size, (CUdeviceptr)info->sendbuff));
+    handles[localRank].offset[0] = (char*)info->sendbuff - (char*)baseSend; // offset of the user pointer inside its allocation
+    CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &size, (CUdeviceptr)info->recvbuff));
+    handles[localRank].offset[1] = (char*)info->recvbuff - (char*)baseRecv;
+
+    NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, handles, sizeof(struct HandlePair)));
+
+    // Open handles locally
+    for (int i=0; i < comm->localRanks; i++) {
+      if (i == localRank) { // Skip self
+        outRegBufSend[i] = nullptr;
+        outRegBufRecv[i] = nullptr;
+      } else {
+        for (int sr=0; sr < 2; sr++) {
+          // Get base address of mapping
+          void* base;
+          CUDACHECK(cudaIpcOpenMemHandle(&base, handles[i].ipc[sr], cudaIpcMemLazyEnablePeerAccess));
+          // Get real buffer address by adding offset in the mapping
+          (sr==0 ? outRegBufSend : outRegBufRecv)[i] = (char*)base + handles[i].offset[sr];
+          // Enqueue reminder to close memory handle
+          struct ncclPointerList* q = ncclMemoryPoolAlloc<struct ncclPointerList>(&comm->memPool_ncclPointerList, &comm->memPermanent);
+          q->ptr = base;
+          ncclIntruQueueEnqueue(&plan->ipcMemQueue, q);
+        }
       }
     }
+    *outRegBufType = NCCL_IPC_REG_BUFFER;
   }
-  *outRegBufUsed = true;
-
 fallback:
 #endif
   return result;
 }

-NCCL_PARAM(GraphRegister, "GRAPH_REGISTER", 0);
-
-static ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetTypeSupport);
-static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, int numPipeOps);
+static ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetSupport);
+static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetSupport, int nvlsSupport, int numPipeOps);

 static ncclResult_t scheduleCollTasksToPlan(
    struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget
@@ -517,6 +504,7 @@ static ncclResult_t scheduleCollTasksToPlan(
    int nAggChannels = 0;
    int nAggOps = 1;
    struct ncclTaskColl* aggEnd = head->next;
+    int nvlsSupport = comm->nvlsSupport && ncclNvlsSupported(aggInfo.opFull.op, aggInfo.datatype);
    int collNetSupport = 0;
    NCCLCHECK(getCollNetSupport(&aggInfo, &collNetSupport));

@@ -537,7 +525,7 @@ static ncclResult_t scheduleCollTasksToPlan(
      NCCLCHECK(ncclInfoSetDerived(&aggInfo, comm->nRanks));
      aggInfo.nChannels = std::min(comm->nChannels, nAggChannels);
      int opPerChannel = DIVUP(nAggChannels, aggInfo.nChannels);
-      NCCLCHECK(getAlgoInfo(&aggInfo, collNetSupport, opPerChannel));
+      NCCLCHECK(getAlgoInfo(&aggInfo, collNetSupport, nvlsSupport, opPerChannel));
    }

    while (head != aggEnd) {
@@ -566,23 +554,26 @@ static ncclResult_t scheduleCollTasksToPlan(
      int workFuncIndex;
      struct ncclWorkElem workElem = {};
      struct ncclProxyOp proxyOp = {};
-      NCCLCHECK(computeColl(&info, &workFuncIndex, &workElem, &proxyOp));
+      // Check whether algo and proto have been preset (as in aggregation case)
+      // If so, skip the calculation
+      if (info.nChannels <= 0 || info.nThreads <= 0) {
+        NCCLCHECK(getAlgoInfo(&info, collNetSupport, nvlsSupport, 1));
+      }

      if (*nWorkBudget < info.nChannels) return ncclSuccess; // Ensure room for addCollToPlan()

-      bool regBufUsed = false;
+      /* if possible, start registration  */
+      ncclRegBufferType regBufType = NCCL_REGULAR_BUFFER;
      void* regBufSend[NCCL_MAX_LOCAL_RANKS];
      void* regBufRecv[NCCL_MAX_LOCAL_RANKS];
-      if (plan->persistent && ncclParamGraphRegister() &&
-          info.algorithm == NCCL_ALGO_COLLNET_DIRECT &&   // limited to CollNetDirect for now
-          comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other
-          comm->intraRanks < comm->localRanks) { // only with inter-process & intra-node peers
-        NCCLCHECK(registerIntraNodeBuffers(comm, plan, &info, &regBufUsed, regBufSend, regBufRecv));
-      }
+
+      registerIntraNodeBuffers(comm, plan, &info, regBufSend, regBufRecv, &regBufType);
+
+      NCCLCHECK(computeColl(&info, &workFuncIndex, &workElem, &proxyOp));

      int maxChannels = info.algorithm == NCCL_ALGO_NVLS || aggInfo.algorithm == NCCL_ALGO_NVLS_TREE ? comm->nvlsChannels : comm->nChannels;
      NCCLCHECK(addCollToPlan(comm, plan, nWorkBudget, workFuncIndex, &workElem, &proxyOp,
-        maxChannels, info.nChannels, info.nBytes, regBufUsed, regBufSend, regBufRecv));
+        maxChannels, info.nChannels, info.nBytes, regBufType, regBufSend, regBufRecv));
      tasks->nTasksColl -= 1;
      tasks->collBytesTotal -= info.nBytes;
      ncclIntruQueueDequeue(&tasks->collQueue);
@@ -590,8 +581,8 @@ static ncclResult_t scheduleCollTasksToPlan(

      plan->threadPerBlock = std::max(plan->threadPerBlock, info.nThreads);
      if (!plan->kernelSpecialized) {
-        plan->kernelFn = ncclKerns[workFuncIndex].kernelFn;
-        plan->kernelSpecialized = ncclKerns[workFuncIndex].specialized;
+        plan->kernelFn = ncclDevKernelForFunc[workFuncIndex];
+        plan->kernelSpecialized = ncclDevKernelForFuncIsSpecialized[workFuncIndex];
      }
    }
  }
@@ -619,8 +610,8 @@ static ncclResult_t scheduleP2pTasksToPlan(

  plan->threadPerBlock = std::max(plan->threadPerBlock, NCCL_MAX_NTHREADS);
  if (!plan->kernelSpecialized) {
-    plan->kernelFn = ncclKerns[FUNC_INDEX_P2P].kernelFn;
-    plan->kernelSpecialized = ncclKerns[FUNC_INDEX_P2P].specialized;
+    plan->kernelFn = ncclDevKernelForFunc[ncclDevFuncId_P2p()];
+    plan->kernelSpecialized = ncclDevKernelForFuncIsSpecialized[ncclDevFuncId_P2p()];
  }

  // Compute how much to split operations
@@ -893,6 +884,13 @@ static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback*
      CUDACHECKIGNORE(cudaIpcCloseMemHandle(q->ptr));
      ncclMemoryPoolFree(&comm->memPool_ncclPointerList, q);
    }
+    /* free mcHandle */
+    while (!ncclIntruQueueEmpty(&plan->nvlsMcHandleQueue)) {
+      struct ncclNvlsMcHandleList* obj = ncclIntruQueueDequeue(&plan->nvlsMcHandleQueue);
+      NCCLCHECK(ncclNvlsDeregBuffer(&obj->mcHandle, obj->ptr, obj->dev, obj->size));
+      INFO(NCCL_NVLS, "rank %d - deregistered buffer %p on device %d, size %ld", comm->rank, (void*)obj->ptr, obj->dev, obj->size);
+      ncclMemoryPoolFree(&comm->memPool_ncclNvlsHandleList, obj);
+    }
  }
  ncclMemoryPoolTakeAll(&comm->memPool_ncclProxyOp, &plan->memPool_ncclProxyOp);
  ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan);
@@ -1142,45 +1140,64 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
 /* Enqueueing system : computation of kernel and proxy operations parameters */
 /*****************************************************************************/

-static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetTypeSupport) {
+static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetSupport) {
  // Look up ncclAvg and ops beyond the built-in range (info->op >= ncclNumOps, e.g. PreMulSum) as ncclSum
  ncclRedOp_t netOp = info->op == ncclAvg || info->op >= ncclNumOps ? ncclSum : info->op;
-  *collNetTypeSupport = info->comm->collNetSupportMatrix[netOp][info->datatype];
+  *collNetSupport = info->comm->collNetSupport && info->comm->collNetSupportMatrix[netOp][info->datatype];
  return ncclSuccess;
 }

 // numPipeOps: number of pipelined ops. Can be greater than 1 in aggregation mode. Used to adjust latency.
-static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, int numPipeOps) {
+static ncclResult_t topoGetAlgoInfo(struct ncclInfo* info, int collNetSupport, int nvlsSupport, int numPipeOps) {
  struct ncclComm* comm = info->comm;
  if (comm->nRanks == 1) {
    info->algorithm = NCCL_ALGO_RING;
    info->protocol = NCCL_PROTO_SIMPLE;
  }
-  else {
+  else if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) {
    float minTime = 3600000000.0; // Hopefully no operation will take an hour to complete.
+    float backupMinTime = 3600000000.0;
+    bool backup = false;
+    int backupAlgo = NCCL_ALGO_UNDEF; // back up algo and proto if no algo/proto is picked up.
+    int backupProto = NCCL_PROTO_UNDEF;
    // Find algorithm / protocol.
    info->algorithm = -1;
    info->protocol = -1;
    int nAlgos = NCCL_NUM_ALGORITHMS;
    for (int a=0; a<nAlgos; a++) {
-      if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetTypeSupport != 1) continue;
-      if (a == NCCL_ALGO_NVLS && !NCCL_NVLS_SUPPORTS(info->datatype, info->opFull.op)) continue;
-      if (a == NCCL_ALGO_NVLS && collNetTypeSupport != 1 && comm->nNodes > 1) continue;
-      if (a == NCCL_ALGO_NVLS_TREE && !NCCL_NVLS_SUPPORTS(info->datatype, info->opFull.op)) continue;
+      if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetSupport != 1) continue;
+      if (a == NCCL_ALGO_NVLS && nvlsSupport != 1 && info->coll != ncclFuncAllGather) continue;
+      if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue;
+      /* now we only support single-node NVLS allgather and reducescatter */
+      if (a == NCCL_ALGO_NVLS && (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) && comm->nNodes > 1) continue;
+      if (a == NCCL_ALGO_NVLS_TREE && nvlsSupport != 1) continue;

      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
        float time;
-        NCCLCHECK(ncclTopoGetAlgoTime(info, a, p, numPipeOps, &time));
-        if (time >= 0 && time < minTime) {
-          info->algorithm = a;
-          info->protocol = p;
-          minTime = time;
+        NCCLCHECK(ncclTopoGetAlgoTime(info, a, p, numPipeOps, &time, &backup));
+        if (!backup) {
+          if (time >= 0 && time < minTime) {
+            info->algorithm = a;
+            info->protocol = p;
+            minTime = time;
+          }
+        } else {
+          if (time >= 0 && time < backupMinTime) {
+            backupAlgo = a;
+            backupProto = p;
+            backupMinTime = time;
+          }
        }
      }
    }
-    if (info->algorithm == -1 || info->protocol == -1) {
-      WARN("Error : no algorithm/protocol available");
-      return ncclInternalError;
+
+    if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) {
+      if (backupAlgo == NCCL_ALGO_UNDEF || backupProto == NCCL_PROTO_UNDEF) {
+        WARN("Error : no algorithm/protocol available");
+        return ncclInternalError;
+      }
+      info->algorithm = backupAlgo;
+      info->protocol = backupProto;
    }
    //if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
    TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
@@ -1222,6 +1239,25 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
  return ncclSuccess;
 }

+// Select algorithm, protocol and nChannels for one collective.
+// 1. Reset algo/proto to UNDEF and, if a tuner plugin is loaded, let it set
+//    algo+proto and/or nChannels.
+// 2. If the plugin call is not successful, discard whatever it wrote and use the
+//    default topo-based tuner instead of failing the collective.
+// 3. topoGetAlgoInfo will set algo/proto if not set, then nChannels and nThreads
+//    based on algo/proto.
+// 4. Finally, a positive nChannels returned by the plugin overrides the default.
+// numPipeOps is forwarded unchanged to both the plugin and topoGetAlgoInfo.
+// Returns ncclSuccess, or the error reported by topoGetAlgoInfo.
+static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetSupport, int nvlsSupport, int numPipeOps) {
+  info->algorithm = NCCL_ALGO_UNDEF;
+  info->protocol = NCCL_PROTO_UNDEF;
+  int nChannels = 0;
+  if (info->comm->tuner != NULL) {
+    ncclResult_t tunerRes = info->comm->tuner->getCollInfo(
+          info->coll, info->nBytes,
+          collNetSupport, nvlsSupport, numPipeOps,
+          &info->algorithm, &info->protocol, &nChannels);
+    if (tunerRes != ncclSuccess) {
+      // Plugin was not successful: drop any partial output so the default tuner decides everything.
+      info->algorithm = NCCL_ALGO_UNDEF;
+      info->protocol = NCCL_PROTO_UNDEF;
+      nChannels = 0;
+    }
+  }
+  NCCLCHECK(topoGetAlgoInfo(info, collNetSupport, nvlsSupport, numPipeOps));
+  if (nChannels > 0) info->nChannels = nChannels; // Set by plugin; override default.
+  return ncclSuccess;
+}
+
 static ncclResult_t getPatternInfo(struct ncclInfo* info) {
  switch (info->coll) {
    case ncclFuncBroadcast:
@@ -1275,14 +1311,6 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
 }

 static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */) {
-  int collNetTypeSupport = 0;
-  // Check whether algo and proto have been preset (as in aggregation case)
-  // If so, skip the calculation
-  if (info->nChannels > 0 && info->nThreads > 0) goto comp_next;
-  NCCLCHECK(getCollNetSupport(info, &collNetTypeSupport));
-  NCCLCHECK(getAlgoInfo(info, collNetTypeSupport, 1));
-
-comp_next:
  // Set nstepsPerLoop and nchunksPerLoop
  NCCLCHECK(getPatternInfo(info));
  NCCLCHECK(getLoopInfo(info));
@@ -1295,14 +1323,7 @@ comp_next:
  work->nWarps = info->nThreads / WARP_SIZE;
  work->redOpArg = info->opFull.scalarArg;
  work->redOpArgIsPtr = info->opFull.scalarArgIsPtr;
-
-  if (info->comm->nRanks == 1) {
-    // one-rank reduce index
-    *workFuncIndex = 1 + int(info->datatype);
-    return ncclSuccess;
-  }
-
-  *workFuncIndex = FUNC_INDEX(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol);
+  *workFuncIndex = ncclDevFuncId(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol);

  int stepSize   = info->comm->buffSizes[info->protocol]/NCCL_STEPS;
  int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1;
@@ -1381,7 +1402,7 @@ comp_next:
  proxyOp->protocol = info->protocol;
  proxyOp->dtype = info->datatype;
  proxyOp->redOp = info->opFull.op==ncclDevPreMulSum || info->opFull.op==ncclDevSumPostDiv ? ncclSum : // Network sees avg as sum
-                     info->op;
+                     info->opFull.proxyOp;
  proxyOp->pattern = info->pattern;
  proxyOp->root = info->root;
  // This is used by P2P to reduce the receive buffer size. We don't use it in collectives
@@ -1399,27 +1420,37 @@ static ncclResult_t hostToDevRedOp(
    ncclDevRedOpFull *opFull, ncclRedOp_t op, ncclDataType_t datatype, ncclComm *comm
  ) {
  union {
-    int8_t i8;
-    uint8_t u8;
-    int32_t i32;
-    uint32_t u32;
-    int64_t i64;
-    uint64_t u64;
-    half f16;
+    int8_t   i8; uint8_t   u8;
+    int32_t i32; uint32_t u32;
+    int64_t i64; uint64_t u64;
+    half f16; float f32; double f64;
    #if defined(__CUDA_BF16_TYPES_EXIST__)
      __nv_bfloat16 bf16;
    #endif
-    float f32;
-    double f64;
    void *ptr;
  };
  u64 = 0;
  opFull->scalarArgIsPtr = false;
+  opFull->proxyOp = op;
+
+  int nbits = 8*ncclTypeSize(datatype);
+  uint64_t allBits = uint64_t(-1)>>(64-nbits);
+  uint64_t signBit = allBits^(allBits>>1);
+
  switch (int(op)) {
  case ncclSum:  opFull->op = ncclDevSum;  break;
  case ncclProd: opFull->op = ncclDevProd; break;
-  case ncclMax:  opFull->op = ncclDevMax;  break;
-  case ncclMin:  opFull->op = ncclDevMin;  break;
+  case ncclMin:
+  case ncclMax:
+    opFull->op = ncclDevMinMax;
+    opFull->scalarArg = 0;
+    // The xormask used by ncclFuncMinMax<[u]int> is the XOR of the sign bit
+    // for signed (opposed to unsigned) types and all the bits for max (opposed to min).
+    if (datatype==ncclInt8 || datatype==ncclInt32 || datatype==ncclInt64) {
+      opFull->scalarArg ^= signBit;
+    }
+    opFull->scalarArg ^= (op == ncclMax) ? allBits : 0;
+    break;
  case ncclAvg:
    switch ((int)datatype) {
    case ncclInt8:  case ncclInt32:  case ncclInt64:
@@ -1513,12 +1544,8 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo const* inf
    struct ncclDevRedOpFull opFull;
    NCCLCHECK(hostToDevRedOp(&opFull, info->op, info->datatype, comm));

-    // User-defined reduction ops may need alter the data even for unitary reductions
-    if (comm->nRanks == 1 && opFull.op < ncclDevPreMulSum) {
-      if (info->sendbuff != info->recvbuff) {
-        size_t bytes = info->count*ncclTypeSize(info->datatype);
-        CUDACHECK(cudaMemcpyAsync(info->recvbuff, info->sendbuff, bytes, cudaMemcpyDeviceToDevice, info->stream));
-      }
+    if (comm->nRanks == 1) {
+      NCCLCHECK(ncclLaunchOneRank(info->recvbuff, info->sendbuff, info->count, opFull, info->datatype, info->stream));
      return ncclSuccess;
    } else {
      // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
@@ -370,13 +370,18 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
      treeToParent[c*nNodes+n] = allTopoRanks[r]->treeToParent[c];
      treeToChild0[c*nNodes+n] = allTopoRanks[r]->treeToChild0[c];
      treeToChild1[c*nNodes+n] = allTopoRanks[r]->treeToChild1[c];
-      nvlsHeads[c*nNodes+n] = allTopoRanks[r]->nvlsHeads[c];
    }
    for (int r=0; r<nranks; r++) {
      ringPrev[c*nranks+r] = allTopoRanks[r]->ringPrev[c];
      ringNext[c*nranks+r] = allTopoRanks[r]->ringNext[c];
    }
  }
+  for (int c=0; c<graphs[NCCL_ALGO_NVLS]->nChannels; c++) {
+    for (int n=0; n<nNodes; n++) {
+      int r = firstRanks[n];
+      nvlsHeads[c*nNodes+n] = allTopoRanks[r]->nvlsHeads[c];
+    }
+  }

  // Connect rings and trees. This should also duplicate the channels.
  NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext));
@@ -70,7 +70,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
        if ((remPath->bw == 0 || remPath->count > path->count) && remPath->bw < bw) {
          // Find reverse link
          for (int l=0; l<remNode->nlinks; l++) {
-            if (remNode->links[l].remNode == node) {
+            if (remNode->links[l].remNode == node && remNode->links[l].type == link->type) {
              remPath->list[0] = remNode->links+l;
              break;
            }
@@ -126,7 +126,7 @@ static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* n
      for (int i=0; i<node->paths[t][n].count; i++) {
        struct ncclTopoLink* link = node->paths[t][n].list[i];
        struct ncclTopoNode* remNode = link->remNode;
-        sprintf(line+offset, "--%s->%s/%lX", topoLinkTypeStr[link->type], topoNodeTypeStr[remNode->type], remNode->id);
+        sprintf(line+offset, "--%s(%g)->%s/%lX", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[remNode->type], remNode->id);
        offset = strlen(line);
      }
      INFO(NCCL_GRAPH, "%s (%f)", line, node->paths[t][n].bw);
@@ -212,14 +212,14 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
  if (*level == -1) {
    int l = -1;
    if (disableEnv) {
-      char* str = getenv(disableEnv);
+      const char* str = ncclGetEnv(disableEnv);
      if (str) {
        int disable = strtol(str, NULL, 0);
        if (disable == 1) l = 0;
      }
    }
    if (l == -1) {
-      char* str = getenv(levelEnv);
+      const char* str = ncclGetEnv(levelEnv);
      if (str) {
        for (int i=0; i<=PATH_SYS; i++) {
          if (strcmp(str, topoPathTypeStr[i]) == 0) {
@@ -318,14 +318,15 @@ compare:
        status = ncclNvmlDevicePairs[indexes[i-1]][indexes[i-0]].p2pStatusWrite;
        good &= status == NVML_P2P_STATUS_OK;
        if (!good) {
-          if (ncclParamIgnoreDisabledP2p()) {
-            *p2p = 0;
-          } else if (path->type <= PATH_NVB) {
-            WARN("P2P is disabled between NVLINK connected GPUs %d and %d. This should not be the case given their connectivity, and is probably due to a hardware issue. If you still want to proceed, you can set NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]);
-            return ncclUnhandledCudaError;
-          } else if (path->type < PATH_SYS) {
-            INFO(NCCL_INIT, "P2P is disabled between connected GPUs %d and %d. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]);
+          if (!ncclParamIgnoreDisabledP2p()) {
+            if (path->type <= PATH_NVB) {
+              WARN("P2P is disabled between NVLINK connected GPUs %d and %d. This should not be the case given their connectivity, and is probably due to a hardware issue. If you still want to proceed, you can set NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]);
+              return ncclUnhandledCudaError;
+            } else if (path->type < PATH_SYS) {
+              INFO(NCCL_INIT, "P2P is disabled between connected GPUs %d and %d. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]);
+            }
          }
+          *p2p = 0;
        }
      }
    }
@@ -360,7 +361,8 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
  if (read) { // For reads (sends) only enable under certain conditions
    int gdrReadParam = ncclParamNetGdrRead();
    if (gdrReadParam == 0) return ncclSuccess;
-    if (gdrReadParam < 0) {
+    // Disable GDR Reads pre-Ampere when we have other PCI flows
+    if (gdrReadParam < 0 && gpu->gpu.cudaCompCap < 80) {
      int nvlink = 0;
      // Since we don't know whether there are other communicators,
      // it's better to keep things local if we have a single GPU.
@@ -400,7 +402,7 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
 }

 // Set to 0 to disable the flush on Hopper when using GDR
-NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 1);
+NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 0);

 // Determine whether we need to flush the GDR recv buffers
 ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush) {
@@ -49,10 +49,10 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
  return ncclSuccess;
 }

-static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode* node2, struct ncclTopoLink** revLink) {
+static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode* node2, int type, struct ncclTopoLink** revLink) {
  for (int l=0; l<node2->nlinks; l++) {
    struct ncclTopoLink* link = node2->links+l;
-    if (link->remNode == node1) {
+    if (link->remNode == node1 && link->type == type) {
      *revLink = link;
      return ncclSuccess;
    }
@@ -85,11 +85,11 @@ static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNod
    float fwBw = link->type == LINK_PCI ? pciBw : bw;
    float revBw = 0;
    if (link->remNode->type == GPU && link->remNode->gpu.cudaCompCap < 80 && start->type != GPU) {
-      if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink));
+      if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, link->type, &revLink));
      revBw += fwBw/8;
    }
-    if (link->remNode->type == CPU && link->type == LINK_NVL) {
-      if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink));
+    if (link->remNode->type == CPU && link->remNode->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER && link->type == LINK_NVL) {
+      if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, link->type, &revLink));
      revBw += fwBw;
    }
    if (link->bw < fwBw || (revBw && revLink->bw < revBw)) { *steps = step; return ncclSuccess; }
@@ -267,7 +267,7 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
 ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time);

 // Try to keep all searches within one second
-#define NCCL_SEARCH_GLOBAL_TIMEOUT (5ULL<<16)
+#define NCCL_SEARCH_GLOBAL_TIMEOUT (1ULL<<19)
 #define NCCL_SEARCH_TIMEOUT (1<<14)
 #define NCCL_SEARCH_TIMEOUT_TREE (1<<14)
 #define NCCL_SEARCH_TIMEOUT_SAMECHANNELS (1<<8)
@@ -342,6 +342,7 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop

  if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) { // NVLS channels correspond to GPUs pulling from NVLS. So the more the better.
    if (graph->nChannels > refGraph->nChannels && graph->nChannels <= system->nodes[GPU].count) *copy = 1;
+    if (graph->nChannels*graph->bwInter > refGraph->nChannels*refGraph->bwInter) *copy = 1;
    return ncclSuccess;
  }
  // 2. Try to get better bandwidth
@@ -358,30 +359,27 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
  return ncclSuccess;
 }

-// Build a list of the best NETs to try.
+// Build a sorted list of the NETs to try.
 //
 // "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu
 //  index when trying to get back to the NIC.
 //
 // The list is built the following way:
 // 1. Select NETs starting with those close to GPU(s), based on paths[n].type.
-// 2. For each GPU, once that list of NICs with a given distance is prepared, shuffle the list
-//    based on the GPU NVML index so that e.g. GPU 1 chooses NIC 1 first instead of NIC 0 which
-//    might have been choosen by GPU 0 (case with multiple independent communicators per node)
-// 3. Then add the NETs to the final list if they were not already added by another closer GPU.
+// 2. Add other NETs satisfying typeInter but not already in the list.

 ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) {
  int netCount = 0;
  int localNetCount;
  int* localNets;
-  NCCLCHECK(ncclCalloc(&localNets, system->nodes[NET].count));
+  NCCLCHECK(ncclCalloc(&localNets, MAXCHANNELS));

  // First add the preferred NICs
  for (int g=0; g<system->nodes[GPU].count; g++) {
    if (gpu != -1 && gpu != g) continue;
    localNetCount = 0;
    struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
-    for (int c = 0;; c++) {
+    for (int c = 0; c<MAXCHANNELS; c++) {
      int netId;
      NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId));
      NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount));
@@ -451,11 +449,11 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
      int startNetIndex;
      NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex));
      struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex;
-      int netcount;
+      int netCount;
      int* nets;
      NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
-      NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netcount));
-      for (int i=0; i<netcount; i++) {
+      NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount));
+      for (int i=0; i<netCount; i++) {
        int n = nets[i];
        struct ncclTopoNode* net = system->nodes[NET].nodes+n;
        if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric
@@ -523,12 +521,12 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
  const int bw = graph->bwInter;
  int* nets;
  NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
-  int netcount;
-  NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netcount));
-  for (int i=0; i<netcount; i++) {
-    int n = nets[i];
+  int netCount;
+  NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount));
+  for (int i=0; i<netCount; i++) {
+    if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && i>0) continue;
+    int n = nets[(graph->nChannels+i)%netCount];
    struct ncclTopoNode* net = system->nodes[NET].nodes+n;
-    struct ncclTopoNode* gpu;
    if (graph->collNet && net->net.collSupport == 0) continue;
    if (net->net.bw < bw) continue;

@@ -542,12 +540,12 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
      }
    }

-    // NVLS needs to balance on all NICs
    if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
-      if (graph->nChannels < netcount) {
+      // NVLS search only tries to find NIC:GPU combinations to compute the heads.
+      if (graph->nChannels < netCount) {
        int gpu;
-        NCCLCHECK(ncclTopoGetLocalGpu(system, system->nodes[NET].nodes[nets[graph->nChannels]].id, &gpu));
-        if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, gpu));
+        NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu));
+        if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu));
      }
    } else {
      if (graph->nChannels > 0) {
@@ -577,18 +575,10 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
          }
        }
        if (maxBw >= bw) {
-          // In the first loop, avoid using GPUs in both directions between channels (one channel
-          // sending from that GPU and one channel receiving to that GPU), since that usually leads
-          // to lower BW.
-          for (int tryGpuBidir=0; tryGpuBidir<2; tryGpuBidir++) {
-            for (int g=0; g<system->nodes[GPU].count; g++) {
-              if (paths[g].bw == maxBw && paths[g].count == minHops) {
-                gpu = system->nodes[GPU].nodes+g;
-                int gpuUsed = gpuPciBw(gpu) > 0 ? 0 : 1;
-                if (tryGpuBidir == gpuUsed) {
-                  NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
-                }
-              }
+          for (int i=0; i<system->nodes[GPU].count; i++) {
+            int g = (graph->nChannels+i)%system->nodes[GPU].count;
+            if (paths[g].bw == maxBw && paths[g].count == minHops) {
+              NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
            }
          }
        }
@@ -809,28 +799,30 @@ float speedArrayInter[] = { 48.0, 30.0, 28.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0
 #define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float))
 #define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float))

-float sm90SpeedArrayIntra[] = { 60.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0 };
+float sm90SpeedArrayIntra[] = { 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 15.0, 12.0, 6.0, 3.0 };
 float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
 #define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra)/sizeof(float))
 #define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float))

 ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
  int ngpus = system->nodes[GPU].count;
-  graph->crossNic = ncclParamCrossNic();
-  int crossNic = (system->nodes[NET].count > 1) && graph->crossNic &&
+  int crossNic = (system->nodes[NET].count > 1) &&
 	 (graph->pattern == NCCL_TOPO_PATTERN_RING ||
 	  graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE ||
-	  graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) ? 1 : 0;
+	  graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) ? ncclParamCrossNic() : 0;
+  graph->crossNic = crossNic == 1 ? 1 : 0;
  graph->bwIntra = graph->bwInter = 0;
  graph->latencyInter = 0;
-  if (graph->crossNic == 2) graph->crossNic = 0;
  graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
  graph->typeInter = PATH_PIX;
  graph->nChannels = 0;
  int trySameChannels = graph->pattern == NCCL_TOPO_PATTERN_NVLS ? 0 : 1;
  graph->sameChannels = trySameChannels;

-  char* str = getenv("NCCL_GRAPH_FILE");
+  int cpuArch, cpuVendor, cpuModel;
+  NCCLCHECK(ncclTopoCpuType(system, &cpuArch, &cpuVendor, &cpuModel));
+
+  const char* str = ncclGetEnv("NCCL_GRAPH_FILE");
  if (str) {
    INFO(NCCL_ENV, "NCCL_GRAPH_FILE set by environment to %s", str);
    struct ncclXml* xml;
@@ -846,6 +838,8 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
  int ccMin;
  NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL));
  if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90)) return ncclSuccess;
+  // NVLS search must have ngpus heads at most.
+  if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) graph->maxChannels = system->nodes[GPU].count;

  if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;

@@ -884,7 +878,7 @@ search:

  NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, &time));
 #if 0
-  printf("Pattern %d, crossNic %d, Bw %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.bwInter, tmpGraph.bwIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->bwInter, graph->bwIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : "");
+  printf("Id %d Pattern %d, crossNic %d, Bw %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.id, tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.bwInter, tmpGraph.bwIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->bwInter, graph->bwIntra, time == 0 ? "TIMEOUT" : time == -1 ? "PERFECT" : "");
  for (int c=0; c<graph->nChannels; c++) {
    printf("%2d : ", c);
    for (int g=0; g<ngpus; g++) {
@@ -901,8 +895,9 @@ search:
  if (pass == 1) {
    // First pass, we don't have a solution yet ; try other options

-    // Try having different channels
-    if (tmpGraph.sameChannels == 1) {
+    // Try having different channels (except when going through AMD CPUs)
+    if (tmpGraph.sameChannels == 1 &&
+        !(cpuArch == NCCL_TOPO_CPU_ARCH_X86 && cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD && tmpGraph.typeIntra == PATH_SYS)) {
      tmpGraph.sameChannels = 0;
      goto search;
    }
@@ -932,12 +927,12 @@ search:
    }
    tmpGraph.typeInter = PATH_PIX;

-    if (crossNic && tmpGraph.crossNic == 0) {
+    if (crossNic == 2 && tmpGraph.crossNic == 0) {
      // Try again with crossNic if permitted
-      tmpGraph.crossNic = crossNic;
+      tmpGraph.crossNic = 1;
      goto search;
    }
-    tmpGraph.crossNic = 0;
+    tmpGraph.crossNic = crossNic == 1 ? 1 : 0;

    // Decrease bw until we find a solution
    if ((speedIndex < nspeeds-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->bwInter > .49))) {
@@ -964,7 +959,7 @@ done:

  // 3. See if we can increase bwIntra for trees (2 nodes or collnet)
  if (pass == 2) {
-    if (time != 0 && graph->pattern != NCCL_TOPO_PATTERN_RING &&
+    if (time != 0 && graph->pattern != NCCL_TOPO_PATTERN_RING && graph->pattern != NCCL_TOPO_PATTERN_NVLS &&
        tmpGraph.bwIntra == graph->bwIntra && tmpGraph.bwIntra < tmpGraph.bwInter*2 &&
        speedIndex > 0) {
      tmpGraph.bwIntra = speedArray[--speedIndex];
@@ -972,6 +967,20 @@ done:
    }
    time = -1;
    memcpy(&tmpGraph, graph, sizeof(tmpGraph));
+    pass = 3;
+  }
+
+  // 4. See if we can increase bwInter for nvls+tree
+  if (pass == 3) {
+    if (time != 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS &&
+        tmpGraph.bwInter == graph->bwInter && tmpGraph.bwInter < tmpGraph.bwIntra*2 &&
+        speedIndex > 0) {
+      tmpGraph.minChannels = tmpGraph.maxChannels = graph->nChannels;
+      tmpGraph.bwInter = speedArray[--speedIndex];
+      goto search;
+    }
+    time = -1;
+    memcpy(&tmpGraph, graph, sizeof(tmpGraph));
  }

  if (graph->nChannels == 0 && graph->collNet == 0 && graph->pattern != NCCL_TOPO_PATTERN_NVLS) {
@@ -1023,7 +1032,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
 }

 ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs) {
-  char* str = getenv("NCCL_GRAPH_DUMP_FILE");
+  const char* str = ncclGetEnv("NCCL_GRAPH_DUMP_FILE");
  if (str) {
    INFO(NCCL_ENV, "NCCL_GRAPH_DUMP_FILE set by environment to %s", str);
    struct ncclXml* xml;
@@ -72,6 +72,9 @@ static ncclResult_t ncclTopoGetInterCpuBw(struct ncclTopoNode* cpu, float* bw) {
  if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
    *bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_BW : QPI_BW;
  }
+  if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_AMD) {
+    *bw = AMD_BW;
+  }
  if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
    *bw = cpu->cpu.model ==  NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_BW : ZPI_BW;
  }
@@ -540,6 +543,36 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
  return ncclSuccess;
 }

+ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId) { // Walk the XML subtree under node; for each "c2c" element, link the GPU named by parentBusId to its local CPU
+  if (strcmp(node->name, "c2c") == 0) {
+    struct ncclTopoNode* gpu = NULL;
+    int64_t pBusId;
+    NCCLCHECK(busIdToInt64(parentBusId, &pBusId)); // parentBusId is the "busid" inherited from the closest enclosing node that has one
+    NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
+    if (gpu == NULL) {
+      WARN("Add C2C error : could not find GPU %lx", pBusId);
+      return ncclInternalError;
+    }
+    int count = 0;
+    NCCLCHECK(xmlGetAttrInt(node, "count", &count));
+    int bw = 0;
+    NCCLCHECK(xmlGetAttrInt(node, "bw", &bw));
+    double c2cBw = (bw*count)/1000.0; // per-link "bw" times link "count"; the /1000 presumably converts MB/s to GB/s - TODO confirm
+    struct ncclTopoNode* cpu = NULL;
+    NCCLCHECK(findLocalCpu(gpu, &cpu));
+    if (cpu == NULL) return ncclSuccess; // no local CPU found: succeed without adding any link
+    NCCLCHECK(ncclTopoConnectNodes(gpu, cpu, LINK_NVL, c2cBw)); // connect in both directions, using the LINK_NVL link type
+    NCCLCHECK(ncclTopoConnectNodes(cpu, gpu, LINK_NVL, c2cBw));
+  } else {
+    const char* busId;
+    NCCLCHECK(xmlGetAttr(node, "busid", &busId));
+    for (int s=0; s<node->nSubs; s++) {
+      NCCLCHECK(ncclTopoAddC2c(node->subs[s], system, busId ? busId : parentBusId)); // a node's own busid takes precedence over the inherited one
+    }
+  }
+  return ncclSuccess;
+}
+
 ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem) {
  NCCLCHECK(ncclCalloc(topoSystem, 1));
  struct ncclXmlNode* topNode;
@@ -549,6 +582,7 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem
    if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem));
  }
  NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL));
+  NCCLCHECK(ncclTopoAddC2c(topNode, *topoSystem, NULL));

  NCCLCHECK(ncclTopoFlattenBcmSwitches(*topoSystem));
  NCCLCHECK(ncclTopoConnectCpus(*topoSystem));
@@ -595,7 +629,7 @@ static ncclResult_t xmlInitAttrFloat(struct ncclXmlNode* node, const char* attrN
 ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
  struct ncclXml* xml;
  NCCLCHECK(ncclCalloc(&xml, 1));
-  char* xmlTopoFile = getenv("NCCL_TOPO_FILE");
+  const char* xmlTopoFile = ncclGetEnv("NCCL_TOPO_FILE");
  if (xmlTopoFile) {
    INFO(NCCL_ENV, "NCCL_TOPO_FILE set by environment to %s", xmlTopoFile);
    NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml, 1));
@@ -668,7 +702,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
  // Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
  NCCLCHECK(ncclTopoTrimXml(xml));

-  xmlTopoFile = getenv("NCCL_TOPO_DUMP_FILE");
+  xmlTopoFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
  if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
    INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
    NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml));
@@ -704,7 +738,7 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch
  int* localNets;
  int localNetCount;
  NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL));
-  int* localGpus;
+  int* localGpus = NULL;
  int localGpuCount;
  NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL));
  int net = system->nodes[GPU].nodes[gpu].gpu.dev;
@@ -717,17 +751,25 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch
 }

 ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex) {
+  // Sets *gpuIndex to the index of a GPU from ncclTopoGetLocal(NET -> GPU) whose ncclTopoGetLocalNet() result equals net on some channel, or to -1 if there is none.
+  ncclResult_t ret = ncclSuccess;
+  int netIndex;
+  NCCLCHECK(ncclTopoIdToIndex(system, NET, net, &netIndex));
+  int* localGpus = NULL;
+  int localGpuCount;
+  NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, &localGpus, &localGpuCount, NULL));
+  int foundGpu = -1; // stays -1 when no candidate GPU matches
   for (int c=0; c<MAXCHANNELS; c++) {
-    for (int g=0; g<system->nodes[GPU].count; g++) {
+    for (int lg=0; lg<localGpuCount; lg++) {
+      int g = localGpus[lg];
       struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
       int id;
-      NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id));
-      if (net == id) {
-        *gpuIndex = g;
-        return ncclSuccess;
-      }
+      NCCLCHECKGOTO(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id), ret, exit); // leave through exit so localGpus is freed on failure too
+      if (net == id) { foundGpu = g; goto exit; }
     }
   }
-  *gpuIndex = -1;
-  return ncclSuccess;
+exit:
+  free(localGpus);
+  if (ret == ncclSuccess) *gpuIndex = foundGpu; // on error *gpuIndex is left untouched, as before
+  return ret;
 }
@@ -18,6 +18,7 @@
 #define SM86_NVLINK_BW 12.0
 #define PCI_BW 12.0           // PCI Gen3 x16
 #define QPI_BW 6.0
+#define AMD_BW 16.0
 #define SKL_QPI_BW 10.0
 #define ZPI_BW 6.0
 #define YONGFENG_ZPI_BW 9.0
@@ -5,7 +5,7 @@
 ************************************************************************/

 #include "core.h"
-#include "devcomm.h"
+#include "device.h"
 #include "comm.h"
 #include "topo.h"

@@ -54,9 +54,9 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li
 // Latencies in us, Bandwidths in GB/s
 // Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
 static const float baseLat  [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {
-       {  6.8, 14.0,    0 }, {  6.6, 14.0,  8.4 }, // Tree, Ring
-       {  6.8, 14.0,    0 }, {  6.8, 14.0,    0 },       // Collnet Direct, Chain
-       {    0,    0, 23.0 }, {    0,    0, 23.0 }};     // NVLS, NVLS Tree
+       {  6.8, 14.0,    0 }, {  6.6, 14.0,  8.4 },  // Tree, Ring
+       {    0,    0,    0 }, {    0,    0,    0 },  // Collnet Direct, Chain
+       {    0,    0,    0 }, {    0,    0,    0 }}; // NVLS, NVLS Tree

 // NVLink, PCI, Network
 #define NCCL_HW_NVLINK 0
@@ -64,17 +64,17 @@ static const float baseLat  [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {
 #define NCCL_HW_NET 2
 static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
 { /* NVLINK */
-  { /* Tree (LL/LL128/Simple)*/ { .6, 1.25,  4 }, /* Ring (LL/LL128/Simple)*/ { .6, 1.9, 3.4 },
-    /* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 4.75 },
-    /* NVLS */ { 0, 0, 0 }, /* NVLSTree */ { 0, 0, 0 } },
+  { /* Tree (LL/LL128/Simple)*/ { .6, 1.25, 28 }, /* Ring (LL/LL128/Simple)*/ { .6, 1.9, 3.4 },
+    /* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 },
+    /* NVLS */ { 0, 0, 23 }, /* NVLSTree */ { 0, 0, 23 } },
  /* PCI */
-  { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9,  6 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 },
-    /* CollNetDirect (Simple)*/ { 0, 0, 8.0 }, /* CollNetChain (Simple)*/ { 0, 0, 8.0 },
+  { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 },
+    /* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 },
    /* NVLS */ { 0, 0, 0 }, /* NVLSTree */ { 0, 0, 0 } },
  /* NET */
-  { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 14 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 14.0 },
-    /* CollNetDirect (Simple)*/ { 0, 0, 10.7 }, /* CollNetChain (Simple)*/ { 0, 0, 14 },
-    /* NVLS */ { 0, 0, 18 }, /* NVLSTree */ { 0, 0, 19 } }
+  { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 28 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 14.0 },
+    /* CollNetDirect (Simple)*/ { 0, 0, 31 }, /* CollNetChain (Simple)*/ { 0, 0, 30 },
+    /* NVLS */ { 0, 0, 18 }, /* NVLSTree */ { 0, 0, 14 } }
 };

 /* Array indexes used below */
@@ -165,13 +165,15 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
      if (coll == ncclFuncBroadcast && a != NCCL_ALGO_RING) continue;
      if (coll == ncclFuncReduce && a != NCCL_ALGO_RING) continue;
-      if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING) continue;
-      if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING) continue;
+      if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS) continue;
+      if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS) continue;

      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
        if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && p != NCCL_PROTO_SIMPLE) continue;
        int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
        float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
+        if (a == NCCL_ALGO_NVLS) bw = std::min(graphs[a]->bwIntra, graphs[a]->bwInter);
+        if (a == NCCL_ALGO_NVLS_TREE) bw = std::min(graphs[a]->bwIntra, nNodes <= 2 ? graphs[a]->bwInter : graphs[a]->bwInter/2);
        float busBw = graphs[a]->nChannels * bw;

        // Various model refinements
@@ -194,10 +196,12 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
        // Convert bus BW to algorithm BW
        float ratio;
        if (a == NCCL_ALGO_RING) ratio = (1.0 * nRanks) / nsteps;
-        else if (a == NCCL_ALGO_NVLS) ratio = 5.0/6.0;
-        else if (a == NCCL_ALGO_NVLS_TREE) ratio = .70 * nNodes / (2*(nNodes-1));
+        else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio = 5.0/6.0;
        else ratio = .5;
        comm->bandwidths[coll][a][p] = busBw * ratio;
+        /* Ring bandwidth backup */
+        if (a == NCCL_ALGO_RING)
+          comm->ringbdw[coll][p] = comm->bandwidths[coll][NCCL_ALGO_RING][p];

        comm->latencies[coll][a][p] = baseLat[a][p];
        float intraLat = hwLat[intraHw[a]][a][p];
@@ -229,13 +233,14 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
            2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat);
        } else if (a == NCCL_ALGO_COLLNET_DIRECT) {
          comm->latencies[coll][a][p] +=
-            2 * (std::min(1, (nRanks/nNodes-1)) * intraLat + (nRanks/nNodes-1) * 0.5) + interLat;  // Add 0.5 arity serialization latency
+            2 * (std::min(1, (nRanks/nNodes-1)) * intraLat + (nRanks/nNodes-1) * 0.4) + interLat;  // Add 0.4 us arity serialization latency
        } else if (a == NCCL_ALGO_COLLNET_CHAIN) {
          comm->latencies[coll][a][p] += 2 * (nRanks/nNodes-1) * intraLat + interLat;
        } else if (a == NCCL_ALGO_NVLS) {
-          if (nNodes > 1) comm->latencies[coll][a][p] += hwLat[NCCL_HW_NET][a][p];
+          comm->latencies[coll][a][p] = intraLat;
+          if (nNodes > 1) comm->latencies[coll][a][p] += interLat;
        } else if (a == NCCL_ALGO_NVLS_TREE) {
-          comm->latencies[coll][a][p] += 2*(nNodes-1)*hwLat[NCCL_HW_NET][a][p];
+          comm->latencies[coll][a][p] += intraLat + 2 * log2i(nNodes) * interLat;
        }
      }
    }
@@ -246,12 +251,12 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
  int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
  int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1 };

-  const char *protoStr = getenv("NCCL_PROTO");
+  const char *protoStr = ncclGetEnv("NCCL_PROTO");
  if (protoStr) {
    INFO(NCCL_ENV, "NCCL_PROTO set by environment to %s", protoStr);
    NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable));
  }
-  const char *algoStr = getenv("NCCL_ALGO");
+  const char *algoStr = ncclGetEnv("NCCL_ALGO");
  if (algoStr) {
    INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr);
    NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
@@ -293,11 +298,25 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
      }
    }
    if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
-    // Never disable ring for non-allreduce operations. That allows to run real apps with NCCL_ALGO=TREE.
-    if (a == NCCL_ALGO_RING && c != ncclFuncAllReduce) continue;
    if (algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
  }

+  for (int c = 0; c < NCCL_NUM_FUNCTIONS; c++) {
+    bool available = false;
+    for (int a = 0; a < NCCL_NUM_ALGORITHMS; a++)
+      for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++)
+        if (comm->bandwidths[c][a][p] != 0) {
+          available = true;
+          goto check_avail;
+        }
+  check_avail:
+    if (available == false) {
+      /* at least set ring algo available */
+      for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++)
+        comm->bandwidths[c][NCCL_ALGO_RING][p] = comm->ringbdw[c][p];
+    }
+  }
+
  if (comm->rank == 0) {
    char line[1024];
    for (int block=0; block<2; block++) {
@@ -346,7 +365,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
  comm->threadThresholds[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] = 512;

  // Override defaults with user env
-  char* str = getenv("NCCL_THREAD_THRESHOLDS");
+  const char* str = ncclGetEnv("NCCL_THREAD_THRESHOLDS");
  if (str) {
    INFO(NCCL_ENV, "NCCL_THREAD_THRESHOLDS set by environment to %s", str);
    ssize_t t[2][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2 }};
@@ -378,9 +397,19 @@ static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = {
  {  .9,  .9,  .9,  .9,  .9,  .9,  .9,  .8,  .7,  .6,  .6,  .5,  .5,  .5,  .5,  .6,  .7,  .8,  .7,  .7,  .8,  .9,  .9 }
 };

-ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time) {
-  float bw = info->comm->bandwidths[info->coll][algorithm][protocol];
+ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time, bool* backup) {
+  float bw = info->comm->bandwidths[info->coll][algorithm][protocol]; 
  float lat = info->comm->latencies[info->coll][algorithm][protocol];
+  
+  if (backup) {
+    *backup = false;
+    if (algorithm == NCCL_ALGO_RING && bw == 0.0f) {
+      /* try back up RING algorithm */
+      bw = info->comm->ringbdw[info->coll][protocol];
+      *backup = true;
+    }
+  }
+
  if (bw == 0) {
    *time = -1.0; return ncclSuccess;
  }
@@ -254,9 +254,13 @@ ncclResult_t ncclTopoXmlLoadNvlink(FILE* file, struct ncclXml* xml, struct ncclX
  return ncclSuccess;
 }

+ncclResult_t ncclTopoXmlLoadC2c(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { // Loader registered for "c2c" elements in ncclTopoXmlLoadGpu's handler table
+  NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); // passes no sub-element handlers (NULL table, count 0)
+  return ncclSuccess;
+}
 ncclResult_t ncclTopoXmlLoadGpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
-  struct xmlHandler handlers[] = { { "nvlink", ncclTopoXmlLoadNvlink } };
-  NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1));
+  struct xmlHandler handlers[] = { { "nvlink", ncclTopoXmlLoadNvlink }, { "c2c", ncclTopoXmlLoadC2c } }; // element name paired with its loader function
+  NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 2)); // last argument tracks the number of handlers[] entries (1 before this change, 2 now)
   return ncclSuccess;
 }

@@ -687,6 +691,41 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
      }
    }
  }
+#if CUDART_VERSION >= 11080
+  struct ncclXmlNode* c2cNode = NULL;
+  NCCLCHECK(xmlGetSub(gpuNode, "c2c", &c2cNode));
+  if (c2cNode == NULL) {
+      if (sm >= 90) {
+        int c2cLinksCount = 0;
+        nvmlFieldValue_t fv;
+        fv.fieldId = NVML_FI_DEV_C2C_LINK_COUNT;
+        if ((ncclNvmlDeviceGetFieldValues(nvmlDev, 1, &fv) == ncclSuccess) && (fv.nvmlReturn == NVML_SUCCESS)) {
+          c2cLinksCount = fv.value.uiVal;
+          int bw = 0;
+	  int count = 0;
+          for (int l=0; l<c2cLinksCount; l++) {
+            nvmlFieldValue_t fvs[2];
+            fvs[0].fieldId = NVML_FI_DEV_C2C_LINK_GET_STATUS;
+            fvs[0].scopeId = l;
+            fvs[1].fieldId = NVML_FI_DEV_C2C_LINK_GET_MAX_BW;
+            fvs[1].scopeId = l;
+            if ((ncclNvmlDeviceGetFieldValues(nvmlDev, 2, fvs) == ncclSuccess) &&
+                (fvs[0].nvmlReturn == NVML_SUCCESS) &&
+                (fvs[0].value.uiVal == 1) &&
+                (fvs[1].nvmlReturn == NVML_SUCCESS)) {
+              bw = fvs[1].value.uiVal;
+	      count++;
+            }
+          }
+          if (count > 0) {
+            NCCLCHECK(xmlAddNode(xml, gpuNode, "c2c", &c2cNode));
+            NCCLCHECK(xmlSetAttrInt(c2cNode, "bw", bw));
+            NCCLCHECK(xmlSetAttrInt(c2cNode, "count", count));
+          }
+        }
+      }
+  }
+#endif
  // Fill target classes
  for (int s=0; s<gpuNode->nSubs; s++) {
    struct ncclXmlNode* sub = gpuNode->subs[s];
@@ -22,7 +22,6 @@ __thread int ncclGroupBlocking = -1; /* default mode */
 __thread bool ncclGroupJobAbortFlag = false;

 void* ncclAsyncJobMain(void* arg);
-static ncclResult_t groupJobComplete(struct ncclGroupJob *job);

 ncclResult_t ncclAsyncLaunch(
    struct ncclAsyncJob* job,
@@ -181,9 +180,28 @@ failure:
  return result;
 }

-static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** groupCommPreconnectHeadPtr, struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next>* asyncJobsPtr, ncclResult_t* groupErrorPtr, ncclResult_t error) {
+static inline void groupResetJobState(struct ncclGroupJob* job) {
+  if (job == NULL) return; // no job, nothing to reset
+  // Reset every variable the job points at, skipping pointers that were never set.
+  if (job->groupBlockingPtr != NULL) *job->groupBlockingPtr = -1;
+  if (job->abortFlagPtr != NULL) *job->abortFlagPtr = false;
+  if (job->groupErrorPtr != NULL) *job->groupErrorPtr = ncclSuccess;
+  if (job->groupCommHeadPtr != NULL) *job->groupCommHeadPtr = NULL;
+  if (job->groupCommPreconnectHeadPtr != NULL) *job->groupCommPreconnectHeadPtr = NULL;
+  // Zero the job itself last: this also clears the pointers dereferenced above.
+  memset(job, 0, sizeof(*job));
+}
+
+static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** groupCommPreconnectHeadPtr, struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next>* asyncJobsPtr, ncclResult_t* groupErrorPtr, int* groupBlockingPtr, volatile bool* groupJobAbortFlagPtr, ncclResult_t error) {
  struct ncclComm* comm = *groupCommHeadPtr;

+  /* reset all thread local variables */
+  *groupCommHeadPtr = NULL;
+  *groupCommPreconnectHeadPtr = NULL;
+  *groupErrorPtr = ncclSuccess;
+  *groupBlockingPtr = -1;
+  *groupJobAbortFlagPtr = false;
+
  while (comm != nullptr) {
    struct ncclComm* next = comm->groupNext;
    (void) ncclGroupCommLeave(comm); // overwrites comm->groupNext
@@ -233,16 +251,12 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
  /* reset everything */
  while (!ncclIntruQueueEmpty(asyncJobsPtr)) {
    struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsPtr);
-    *job->abortFlag = 1;
    if (job->comm && !job->comm->config.blocking)
      (void) ncclCommSetAsyncError(job->comm, error);
    if (job->undo) job->undo(job);
    if (job->destructor) job->destructor((void*)job);
  }

-  *groupErrorPtr = ncclSuccess;
-  *groupCommHeadPtr = nullptr;
-  *groupCommPreconnectHeadPtr = nullptr;
  return;
 }

@@ -325,9 +339,6 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
    NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail);
  }

-  /* this atomic must happen before cleanup and setting state of communicators */
-  __atomic_store_n(&gjob->doneFlag, true, __ATOMIC_RELEASE);
-
  while (!ncclIntruQueueEmpty(asyncJobsMain)) {
    struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain);
    if (job->comm && !job->comm->config.blocking)
@@ -345,16 +356,12 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
    groupCommHeadMain = next;
  }

-  *gjob->groupErrorPtr = ncclSuccess;
-  *gjob->groupCommHeadPtr = nullptr;
-  *gjob->groupCommPreconnectHeadPtr = nullptr;
-
  CUDACHECK(cudaSetDevice(savedDev));

 exit:
  return ret;
 fail:
-  groupCleanup(gjob->groupCommHeadPtr, gjob->groupCommPreconnectHeadPtr, gjob->asyncJobsPtr, gjob->groupErrorPtr, ret);
+  groupCleanup(gjob->groupCommHeadPtr, gjob->groupCommPreconnectHeadPtr, gjob->asyncJobsPtr, gjob->groupErrorPtr, gjob->groupBlockingPtr, gjob->abortFlagPtr, ret);
  goto exit;
 }

@@ -377,7 +384,8 @@ ncclResult_t ncclGroupEndInternal() {
    ncclGroupJobMain.groupErrorPtr = &ncclGroupError;
    ncclGroupJobMain.asyncJobsPtr = &ncclAsyncJobs;
    ncclGroupJobMain.abortFlagPtr = &ncclGroupJobAbortFlag;
-    ncclGroupJobMain.doneFlag = false;
+    ncclGroupJobMain.groupBlockingPtr = &ncclGroupBlocking;
+    ncclGroupJobMain.initialized = true;
    ncclGroupJobMainPtr = &ncclGroupJobMain;
    /* make sure ncclGroupBlocking has been set. */
    assert(ncclGroupBlocking == 0 || ncclGroupBlocking == 1);
@@ -387,6 +395,7 @@ ncclResult_t ncclGroupEndInternal() {
        ncclAsyncJob* job = ncclIntruQueueHead(&ncclAsyncJobs);
        do {
          NCCLCHECKGOTO(ncclCommSetAsyncError(job->comm, ncclInProgress), ret, fail);
+          job->comm->groupJob = ncclGroupJobMainPtr;
          job = job->next;
        } while (job);
      }
@@ -395,30 +404,42 @@ ncclResult_t ncclGroupEndInternal() {
        ncclComm_t comm = ncclGroupCommHead;
        do {
          NCCLCHECKGOTO(ncclCommSetAsyncError(comm, ncclInProgress), ret, fail);
+          /* link group job to communicators. */
+          comm->groupJob = ncclGroupJobMainPtr;
          comm = comm->groupNext;
        } while (comm);
      }
+
      ncclGroupJobMainPtr->base.func = groupLaunch;
      SYSCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), ret, fail);
      ret = ncclInProgress;
    } else {
      /* blocking group */
      NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base), ret, fail);
-      groupResetJobState();
+      groupResetJobState(ncclGroupJobMainPtr);
    }
  }

 exit:
  return ret;
 fail:
-  groupCleanup(&ncclGroupCommHead, &ncclGroupCommPreconnectHead, &ncclAsyncJobs, &ncclGroupError, ret);
-  groupResetJobState();
+  groupCleanup(&ncclGroupCommHead, &ncclGroupCommPreconnectHead, &ncclAsyncJobs, &ncclGroupError, &ncclGroupBlocking, &ncclGroupJobAbortFlag, ret);
  goto exit;
 }

-void ncclGroupJobAbort() {
-  ncclGroupJobAbortFlag = true;
-  (void) groupJobComplete(ncclGroupJobMainPtr);
-  /* reset group abort flag */
-  ncclGroupJobAbortFlag = false;
+ncclResult_t ncclGroupJobComplete(struct ncclGroupJob* groupJob) {
+  // A missing or uninitialized group job is reported as success without doing anything.
+  if (groupJob == NULL || !groupJob->initialized) return ncclSuccess;
+  // Complete the underlying async job, then reset the group state whatever the outcome.
+  ncclResult_t result = ncclAsyncJobComplete(&groupJob->base);
+  groupResetJobState(groupJob);
+  return result;
+}
+
+ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob) {
+  if (groupJob == NULL || !groupJob->initialized) return ncclSuccess;
+  // Raise the abort flag first, then complete the job (which also resets its state).
+  *groupJob->abortFlagPtr = true;
+  NCCLCHECK(ncclGroupJobComplete(groupJob));
+  return ncclSuccess;
 }
@@ -101,7 +101,7 @@ static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHand
  /* Allocate the physical memory on the device */
  CUCHECK(cuMemCreate(&handle, size, &prop, 0));
  /* Reserve a virtual address range */
-  CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0));
+  CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0));
  /* Map the virtual address range to the physical allocation */
  CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
  /* Now allow RW access to the newly mapped memory */
@@ -7,108 +7,7 @@
 #ifndef NCCL_COLLECTIVES_H_
 #define NCCL_COLLECTIVES_H_

-enum ncclDevRedOp_t {
-  ncclDevSum, ncclDevProd, ncclDevMax, ncclDevMin,
-  ncclDevPreMulSum, ncclDevSumPostDiv,
-  ncclNumDevRedOps
-};
-struct ncclDevRedOpFull {
-  ncclDevRedOp_t op;
-  bool scalarArgIsPtr;
-  uint64_t scalarArg;
-};
-
-#define FUNC_INDEX_P2P 0
-#define FUNC_INDEX(func, devredop, ncclType, al, pr) (1+ncclNumTypes+(((((func)*ncclNumDevRedOps + (devredop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr))
-
-#define NCCL_FUNC_NAME(func, algo, proto, devredop, type) \
-  ncclFunction_##func##_##algo##_##proto##_##devredop##_##type
-
-#define NCCL_ONERANK_REDUCE_NAME(devredop, type) \
-  ncclFunction_OneRankReduce_##devredop##_##type
-
-#define NCCL_KERN_NAME(func, algo, proto, devredop, type) \
-  ncclKernel_##func##_##algo##_##proto##_##devredop##_##type
-
-#define NCCL_IMPL_NAME(func, algo, proto) \
-  nccl##func##algo##proto
-
-/* Declare all collective operations */
-#define DECL5(func, algo, proto, devredop, type) \
-  extern __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \
-  extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
-
-#define SINGLE_ARG(...) __VA_ARGS__
-#define CONCAT(a,b) a##b
-#define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(SINGLE_ARG(t), SINGLE_ARG(f))
-#define MACRO_IF_0(t, f) f
-#define MACRO_IF_1(t, f) t
-
-#define DECL4(func, algo, devredop, type, undef) \
-  MACRO_IF(undef, /*undefined*/, DECL5(func, algo, SIMPLE, devredop, type)) \
-  MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL,     devredop, type)) \
-  MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL128,  devredop, type))
-
-#define DECL3(func, devredop, type, undef) \
-  DECL4(func, RING,           devredop, type, undef) \
-  DECL4(func, TREE,           devredop, type, undef) \
-  DECL4(func, COLLNET_DIRECT, devredop, type, undef) \
-  DECL4(func, COLLNET_CHAIN,  devredop, type, undef) \
-  DECL4(func, NVLS,           devredop, type, undef) \
-  DECL4(func, NVLS_TREE,      devredop, type, undef)
-
-#if defined(__CUDA_BF16_TYPES_EXIST__)
-#define DECL2(func, devredop, undefForFloat) \
-  DECL3(func, devredop, int8_t, /*undef=*/0) \
-  DECL3(func, devredop, uint8_t, /*undef=*/0) \
-  DECL3(func, devredop, int32_t, /*undef=*/0) \
-  DECL3(func, devredop, uint32_t, /*undef=*/0) \
-  DECL3(func, devredop, int64_t, /*undef=*/0) \
-  DECL3(func, devredop, uint64_t, /*undef=*/0) \
-  DECL3(func, devredop, half, /*undef=*/undefForFloat) \
-  DECL3(func, devredop, float, /*undef=*/undefForFloat) \
-  DECL3(func, devredop, double, /*undef=*/undefForFloat) \
-  DECL3(func, devredop, __nv_bfloat16, /*undef=*/undefForFloat)
-#else
-#define DECL2(func, devredop, undefForFloat) \
-  DECL3(func, devredop, int8_t, /*undef=*/0) \
-  DECL3(func, devredop, uint8_t, /*undef=*/0) \
-  DECL3(func, devredop, int32_t, /*undef=*/0) \
-  DECL3(func, devredop, uint32_t, /*undef=*/0) \
-  DECL3(func, devredop, int64_t, /*undef=*/0) \
-  DECL3(func, devredop, uint64_t, /*undef=*/0) \
-  DECL3(func, devredop, half, /*undef=*/undefForFloat) \
-  DECL3(func, devredop, float, /*undef=*/undefForFloat) \
-  DECL3(func, devredop, double, /*undef=*/undefForFloat)
-#endif
-
-#define DECL(func) \
-  DECL2(func, Sum, /*undefForFloat=*/0) \
-  DECL2(func, Prod, /*undefForFloat=*/0) \
-  DECL2(func, Min, /*undefForFloat=*/0) \
-  DECL2(func, Max, /*undefForFloat=*/0) \
-  DECL2(func, PreMulSum, /*undefForFloat=*/0) \
-  DECL2(func, SumPostDiv, /*undefForFloat=*/1)
-
-DECL2(Broadcast, Sum, /*undefForFloat=*/0)
-DECL(Reduce)
-DECL2(AllGather, Sum, /*undefForFloat=*/0)
-DECL(ReduceScatter)
-DECL(AllReduce)
-DECL5(SendRecv, RING, SIMPLE, Sum, int8_t)
-
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t)();
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t)();
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t)();
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint32_t)();
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int64_t)();
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint64_t)();
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, half)();
-#if defined(__CUDA_BF16_TYPES_EXIST__)
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, __nv_bfloat16)();
-#endif
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, float)();
-extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)();
+#include "nccl.h"

 // CHUNKSIZE must be a multiple of SLICESIZE
 #define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
@@ -123,13 +22,27 @@ extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)();
 #define REDUCE_CHUNKSTEPS 1
 #define NCCL_MAX_SLICE_PER_CHUNK 2  // max value for CHUNKSTEPS/SLICESTEPS, must accord with above

-// We can't use the enum identifiers like ncclSum, ncclFloat, etc since this
-// macro will be used in preprocessor conditionals where enums have no meaning.
-#define NCCL_NVLS_SUPPORTS(/*ncclDataType_t*/ type, /*ncclDevRedOp_t*/ red) \
-  (((type==2 || type==3) && (red==0 || red==2 || red==3)) || \
-   ((type==4 || type==5) && (red==0 || red==2 || red==3)) || \
-   ((type==6 || type==9) && (red==0 || red==2 || red==3)) || \
-   (type==7 && red==0) || \
-   (type==8 && red==0))
+inline int ncclTypeSize(ncclDataType_t type) {
+  // Size in bytes of one element of the given type; -1 for a type not listed below.
+  int bytes = -1;
+  switch (type) {
+  case ncclInt8: case ncclUint8:
+    bytes = 1; break;
+  case ncclFloat16:
+  #if defined(__CUDA_BF16_TYPES_EXIST__)
+  case ncclBfloat16:
+  #endif
+    bytes = 2; break;
+  case ncclInt32: case ncclUint32:
+  case ncclFloat32:
+    bytes = 4; break;
+  case ncclInt64: case ncclUint64:
+  case ncclFloat64:
+    bytes = 8; break;
+  default:
+    break;
+  }
+  return bytes;
+}

 #endif
@@ -10,8 +10,10 @@
 #include "transport.h"
 #include "p2p.h"
 #include "collectives.h"
+#include "nccl_tuner.h"
 #include "proxy.h"
 #include "strongstream.h"
+#include "nccl_net.h"

 #if CUDART_VERSION < 9000
 struct cudaLaunchParams {
@@ -153,6 +155,14 @@ struct ncclPointerList {
  void *ptr;
 };

+struct ncclNvlsMcHandleList { // Element type of ncclKernelPlan::nvlsMcHandleQueue
+  struct ncclNvlsMcHandleList *next; // link member named in the ncclIntruQueue declaration
+  CUmemGenericAllocationHandle mcHandle; // presumably an NVLS multicast allocation handle - TODO confirm
+  CUdeviceptr ptr; // presumably the device address associated with mcHandle - TODO confirm
+  int dev;
+  size_t size;
+};
+
 struct ncclKernelPlan {
  // A kernel plan is also a callback that reclaims itself. Hence this must
  // be the first member.
@@ -176,6 +186,7 @@ struct ncclKernelPlan {
  int collOpCount; // zero based for this plan

  struct ncclIntruQueue<struct ncclPointerList, &ncclPointerList::next> ipcMemQueue;
+  struct ncclIntruQueue<struct ncclNvlsMcHandleList, &ncclNvlsMcHandleList::next> nvlsMcHandleQueue;

  struct Channel {
    int nWork;
@@ -189,6 +200,23 @@ struct ncclKernelPlan {
  } channels[MAXCHANNELS];
 };

+struct ncclRegRequest { // Element type of ncclComm::regRequestQueue
+  uintptr_t buff; // presumably the address of the buffer to register - TODO confirm
+  size_t size; // presumably that buffer's length in bytes - TODO confirm
+  struct ncclRegRequest *next; // link member named in the ncclIntruQueue declaration
+};
+
+struct ncclRegRecord { // Element type of ncclComm::regRecordQueue
+  uintptr_t buff; // presumably the user buffer address as requested - TODO confirm
+  size_t size; // presumably the requested length in bytes - TODO confirm
+  CUdeviceptr regAddr; // NOTE(review): looks like the address actually registered, which may differ from buff - confirm
+  size_t regSize; // NOTE(review): looks like the size actually registered - confirm
+  int dev;
+  CUmemGenericAllocationHandle mcHandle;
+  uintptr_t *addrs; /* use to check if NVLS buffers match among intra-node ranks */
+  struct ncclRegRecord *next; // link member named in the ncclIntruQueue declaration
+};
+
 struct ncclComm {
  struct ncclMemoryStack memPermanent, memScoped;
  // List of destructors to run when comm is destructed
@@ -259,6 +287,7 @@ struct ncclComm {
  ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float ringbdw[NCCL_NUM_FUNCTIONS][NCCL_NUM_PROTOCOLS];
  int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];

  /* This attribute can indicate the states of communicators and return code of
@@ -268,7 +297,7 @@ struct ncclComm {
  // Flag to ask NCCL kernels to abort
  volatile uint32_t *abortFlag;
  volatile uint32_t *childAbortFlag;
-  uint32_t *abortFlagRefCount;
+  volatile uint32_t *abortFlagRefCount;

  // Device side of the communicator (for cudaFree's)
  struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm
@@ -308,15 +337,19 @@ struct ncclComm {

  // NVLink SHARP (NVLS) support
  int nvlsSupport;
+  int nvlsRegSupport;
  /* sharable NVLS resource. */
  struct ncclNvlsSharedRes* nvlsResources;
+  struct ncclShmemCollBuff nvlsShmem;
+  void *nvlsShmemHandle;

-  size_t channelSize; // User requested work size (bytes) for channel partitions
+  ssize_t channelSize; // User requested work size (bytes) for channel partitions

  // pools backed by comm->memPermanent
  struct ncclMemoryPool memPool_ncclProxyOp;
  struct ncclMemoryPool memPool_ncclKernelPlan;
  struct ncclMemoryPool memPool_ncclPointerList;
+  struct ncclMemoryPool memPool_ncclNvlsHandleList;
  // Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when
  // this comm is not yet in a group.
  struct ncclComm* groupNext;
@@ -344,6 +377,16 @@ struct ncclComm {
  bool finalizeCalled;
  // shared structures for finalization
  int finalizeRankCnt;
+  // group job to support multi-thread FT
+  struct ncclGroupJob *groupJob;
+
+  /* stores buffer registration requests */
+  struct ncclIntruQueue<struct ncclRegRequest, &ncclRegRequest::next> regRequestQueue;
+  /* stores registered buffers */
+  struct ncclIntruQueue<struct ncclRegRecord, &ncclRegRecord::next> regRecordQueue;
+
+  // Tuning plugin
+  ncclTuner_t* tuner;
 };

 enum ncclLaunchMode {
@@ -30,29 +30,6 @@
    ret func(args)
 #endif // end PROFAPI

-static __inline__ int ncclTypeSize(ncclDataType_t type) {
-  switch (type) {
-    case ncclInt8:
-    case ncclUint8:
-      return 1;
-    case ncclFloat16:
-#if defined(__CUDA_BF16_TYPES_EXIST__)
-    case ncclBfloat16:
-#endif
-      return 2;
-    case ncclInt32:
-    case ncclUint32:
-    case ncclFloat32:
-      return 4;
-    case ncclInt64:
-    case ncclUint64:
-    case ncclFloat64:
-      return 8;
-    default:
-      return -1;
-  }
-}
-
 #include "debug.h"
 #include "checks.h"
 #include "cudawrap.h"
@@ -30,7 +30,7 @@ typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void
    if( err != CUDA_SUCCESS ) {				      \
      const char *errStr;				      \
      (void) pfn_cuGetErrorString(err, &errStr);	      \
-      WARN("Cuda failure '%s'", errStr);		      \
+      WARN("Cuda failure %d '%s'", err, errStr);	      \
      return ncclUnhandledCudaError;			      \
    }							      \
 } while(false)
@@ -40,7 +40,7 @@ typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void
    if( err != CUDA_SUCCESS ) {				      \
      const char *errStr;				      \
      (void) pfn_cuGetErrorString(err, &errStr);	      \
-      WARN("Cuda failure '%s'", errStr);		      \
+      WARN("Cuda failure %d '%s'", err, errStr);	      \
      res = ncclUnhandledCudaError;			      \
      goto label;					      \
    }							      \
@@ -52,7 +52,7 @@ typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void
    if( err != CUDA_SUCCESS ) {						\
      const char *errStr;						\
      (void) pfn_cuGetErrorString(err, &errStr);			\
-      INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, errStr);	\
+      INFO(NCCL_ALL,"%s:%d Cuda failure %d '%s'", __FILE__, __LINE__, err, errStr); \
    }									\
 } while(false)

@@ -79,6 +79,7 @@ DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000);
 DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent, 4000);
 DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000);
 DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice, 2000);
+DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute, 4000);
 // cuMem API support
 DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve, 10020);
 DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree, 10020);
@@ -4,10 +4,11 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#ifndef NCCL_DEBUG_H_
-#define NCCL_DEBUG_H_
+#ifndef NCCL_INT_DEBUG_H_
+#define NCCL_INT_DEBUG_H_

-#include "nccl_net.h"
+#include "nccl.h"
+#include "nccl_common.h"
 #include <stdio.h>
 #include <chrono>
 #include <type_traits>
@@ -8,31 +8,33 @@
 #define NCCL_DEVICE_H_

 #include "nccl.h"
+#include "nccl_common.h"
 #include "align.h"
 #include <stdint.h>

-#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
-typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t;
 extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];

-#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
-#define NCCL_ALGO_TREE 0
-#define NCCL_ALGO_RING 1
-#define NCCL_ALGO_COLLNET_DIRECT 2
-#define NCCL_ALGO_COLLNET_CHAIN 3
-#define NCCL_ALGO_NVLS 4
-#define NCCL_ALGO_NVLS_TREE 5
 extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];

-#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
-#define NCCL_PROTO_LL 0
-#define NCCL_PROTO_LL128 1
-#define NCCL_PROTO_SIMPLE 2
 extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS];

 #define NCCL_MAX_OPS 2048
 #define NCCL_STEPS 8

+#include "net_device.h"
+
+enum ncclDevRedOp_t {
+  ncclDevSum, ncclDevProd, ncclDevMinMax,
+  ncclDevPreMulSum, ncclDevSumPostDiv,
+  ncclNumDevRedOps
+};
+struct ncclDevRedOpFull {
+  ncclDevRedOp_t op;
+  ncclRedOp_t proxyOp;
+  bool scalarArgIsPtr;
+  uint64_t scalarArg;
+};
+
 union ncclLLFifoLine {
  /* Flags have to be *after* data, because otherwise, an incomplete receive
     from the network may receive the flag but not the data.
@@ -85,6 +87,7 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
 struct ncclConnInfo {
  // Regular comm mechanism
  char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
+  void* mhandles[NCCL_NUM_PROTOCOLS];
  uint64_t *tail;     // Local for recv, remote for send
  uint64_t *head;     // Local for send, remote for recv

@@ -98,6 +101,7 @@ struct ncclConnInfo {

  uint64_t step;      // Keep where we are
  uint64_t llLastCleaning;
+  ncclNetDeviceHandle_t netDeviceHandle;
 };

 struct ncclProxyConnector {
@@ -105,6 +109,7 @@ struct ncclProxyConnector {
  int tpLocalRank;
  int sameProcess;
  struct ncclProxyConnection* connection;
+  ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*); // Copied from transport if necessary
 };

 struct ncclConnector {
@@ -292,6 +297,7 @@ struct ncclDevComm {
  int rank;
  int nRanks;
  int buffSizes[NCCL_NUM_PROTOCOLS];
+  int p2pChunkSize;

  // Operation list for aggregation
  int workFifoDepth;
@@ -370,4 +376,88 @@ __host__ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_
  return cudaArch < 700 ? 0 : ncclShmemScratchWarpSize(cudaArch)*(NCCL_MAX_NTHREADS/WARP_SIZE);
 }

+// Host-side table of kernel function pointers.
+extern int const ncclDevKernelCount;
+extern void* const ncclDevKernelList[/*ncclDevKernelCount*/];
+
+// Table of most specialized kernel function to run given func index.
+extern int const ncclDevFuncRowToId[];
+extern void* const ncclDevKernelForFunc[/*funcIndex*/];
+extern bool const ncclDevKernelForFuncIsSpecialized[/*funcIndex*/];
+
+// Launch a one-rank reduction on stream.
+ncclResult_t ncclLaunchOneRank(void* dst, void const* src, size_t nElts, struct ncclDevRedOpFull redOp, ncclDataType_t type, cudaStream_t stream);
+
+// `ncclNvlsSupported()` needs to be in sync with "func_valid" in "src/device/generate.py"
+inline bool ncclNvlsSupported(int devRedOp, int type) {
+  switch (type) {
+  case ncclInt32:
+  case ncclUint32:
+  case ncclInt64:
+  case ncclUint64:
+  case ncclFloat16:
+  #if defined(__CUDA_BF16_TYPES_EXIST__)
+  case ncclBfloat16:
+  #endif
+    return devRedOp == ncclDevSum || devRedOp == ncclDevMinMax;
+  case ncclFloat:
+  case ncclDouble:
+    return devRedOp == ncclDevSum;
+  default:
+    return false;
+  }
+}
+
+// `ncclDevFuncId()` needs to be in sync with "all_functions()" in "src/device/generate.py"
+inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto) {
+  #if defined(__CUDA_BF16_TYPES_EXIST__)
+  constexpr int NumTypes = ncclNumTypes;
+  #else
+  constexpr int NumTypes = ncclNumTypes + 1;
+  #endif
+
+  int row = 0; // ncclDevFuncIndex_P2p
+  if (coll == ncclFuncSendRecv) goto have_row;
+  row += 1;
+
+  if (coll == ncclFuncAllGather) {
+    int algo1 = algo == NCCL_ALGO_RING ? 0 :
+              /*algo == NCCL_ALGO_NVLS*/ 1;
+    row += algo1*NCCL_NUM_PROTOCOLS + proto;
+    goto have_row;
+  }
+  row += (/*NumAlgos=*/2)*NCCL_NUM_PROTOCOLS;
+
+  if (coll == ncclFuncBroadcast) {
+    row += proto;
+    goto have_row;
+  }
+  row += (/*NumAlgos=*/1)*NCCL_NUM_PROTOCOLS;
+
+  if (coll == ncclFuncAllReduce) {
+    row += ((devRedOp*NumTypes + type)*NCCL_NUM_ALGORITHMS + algo)*NCCL_NUM_PROTOCOLS + proto;
+    goto have_row;
+  }
+  row += ncclNumDevRedOps*NumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS;
+
+  if (coll == ncclFuncReduce) {
+    row += (devRedOp*NumTypes + type)*NCCL_NUM_PROTOCOLS + proto;
+    goto have_row;
+  }
+  row += ncclNumDevRedOps*NumTypes*(/*NumAlgos=*/1)*NCCL_NUM_PROTOCOLS;
+
+  if (coll == ncclFuncReduceScatter) {
+    int algo1 = algo == NCCL_ALGO_RING ? 0 :
+              /*algo == NCCL_ALGO_NVLS*/ 1;
+    row += ((devRedOp*NumTypes + type)*2 + algo1)*NCCL_NUM_PROTOCOLS + proto;
+    goto have_row;
+  }
+  row += ncclNumDevRedOps*NumTypes*(/*NumAlgos=*/2)*NCCL_NUM_PROTOCOLS;
+
+have_row:
+  return ncclDevFuncRowToId[row];
+}
+
+inline int ncclDevFuncId_P2p() { return ncclDevFuncRowToId[0]; }
+
 #endif
@@ -8,7 +8,7 @@
 #define NCCL_GRAPH_H_

 #include "nccl.h"
-#include "devcomm.h"
+#include "device.h"
 #include <limits.h>
 #include <stdlib.h>
 #include <ctype.h>
@@ -112,6 +112,6 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa

 ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs);
 #include "info.h"
-ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time);
+ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time, bool* backup = NULL);

 #endif
@@ -14,7 +14,8 @@ ncclResult_t ncclGroupErrCheck(ncclResult_t ret);
 void ncclGroupCommJoin(struct ncclComm* comm);
 void ncclGroupCommPreconnect(struct ncclComm* comm);
 ncclResult_t ncclGroupCommLeave(struct ncclComm* comm);
-void ncclGroupJobAbort();
+ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob);
+ncclResult_t ncclGroupJobComplete(struct ncclGroupJob *groupJob);

 typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);

@@ -52,8 +53,9 @@ struct ncclGroupJob {
  struct ncclComm **groupCommPreconnectHeadPtr;
  ncclResult_t *groupErrorPtr;
  volatile bool *abortFlagPtr;
+  int *groupBlockingPtr;
  struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsPtr;
-  bool doneFlag;
+  bool initialized;
 };

 ncclResult_t ncclGroupStartInternal();
@@ -87,14 +89,6 @@ static inline ncclResult_t groupJobComplete(struct ncclGroupJob* job) {
 }

 inline ncclResult_t ncclGroupStartInternal() {
-  /* if previous group launch does not complete, don't launch this one. */
-  if (ncclGroupJobMainPtr != NULL) {
-    if (__atomic_load_n(&ncclGroupJobMainPtr->doneFlag, __ATOMIC_ACQUIRE) == false) {
-      return ncclInvalidUsage;
-    } else {
-      NCCLCHECK(groupJobComplete(ncclGroupJobMainPtr));
-    }
-  }
  ncclGroupDepth++;
  return ncclSuccess;
 }
@@ -1040,4 +1040,19 @@ static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struc
  return qp->context->ops.post_send(qp, wr, bad_wr);
 }

+struct ibv_ece {
+	/*
+	 * Unique identifier of the provider vendor on the network.
+	 * The providers will set IEEE OUI here to distinguish
+	 * itself in non-homogenius network.
+	 */
+	uint32_t vendor_id;
+	/*
+	 * Provider specific attributes which are supported or
+	 * needed to be enabled by ECE users.
+	 */
+	uint32_t options;
+	uint32_t comp_mask;
+};
+
 #endif  // NCCL_IBV_CORE_H_
@@ -36,6 +36,8 @@ struct ncclIbvSymbols {
  int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
  int (*ibv_internal_destroy_qp)(struct ibv_qp *qp);
  const char * (*ibv_internal_event_type_str)(enum ibv_event_type event);
+  int (*ibv_internal_query_ece)(struct ibv_qp *qp, struct ibv_ece *ece);
+  int (*ibv_internal_set_ece)(struct ibv_qp *qp, struct ibv_ece *ece);
 };

 /* Constructs IB verbs symbols per rdma-core linking or dynamic loading mode */
@@ -66,6 +66,8 @@ static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries,
 ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
 ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
 ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp);
+ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported);
+ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported);

 static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) {
  int ret = qp->context->ops.post_send(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
@@ -8,7 +8,7 @@
 #define NCCL_INFO_H_

 #include "nccl.h"
-#include "devcomm.h"
+#include "device.h"
 #include "collectives.h"
 #include "core.h"
 #include "utils.h"
@@ -54,6 +54,8 @@ struct ncclInfo {
  int nChannels;
  int nThreads;
  size_t nBytes;
+  size_t sendbuffSize;
+  size_t recvbuffSize;
  int nstepsPerLoop;
  int nchunksPerLoop;
  int chunkSize;
@@ -67,6 +69,17 @@ inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) {
    info->datatype = ncclInt8;
  }
  if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= nRanks; // count is per rank
+
+  /* compute buffer size for NVLS buffer registration */
+  if (info->coll == ncclFuncAllGather) {
+    info->sendbuffSize = info->count * ncclTypeSize(info->datatype);
+    info->recvbuffSize = info->sendbuffSize * nRanks;
+  } else if (info->coll == ncclFuncReduceScatter) {
+    info->recvbuffSize = info->count * ncclTypeSize(info->datatype);
+    info->sendbuffSize = info->recvbuffSize * nRanks;
+  } else {
+    info->sendbuffSize = info->recvbuffSize = info->count * ncclTypeSize(info->datatype);
+  }
  return ncclSuccess;
 }

@@ -30,6 +30,7 @@ struct ncclIpcSocket {

 ncclResult_t ncclIpcSocketInit(struct ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag);
 ncclResult_t ncclIpcSocketClose(struct ncclIpcSocket *handle);
+ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd);

 ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd);
 ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash);
@@ -0,0 +1,33 @@
+/*************************************************************************
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_DEBUG_H_
+#define NCCL_DEBUG_H_
+
+typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
+typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;
+
+typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
+
+#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
+typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t;
+
+#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*/NVLS*
+#define NCCL_ALGO_UNDEF -1
+#define NCCL_ALGO_TREE 0
+#define NCCL_ALGO_RING 1
+#define NCCL_ALGO_COLLNET_DIRECT 2
+#define NCCL_ALGO_COLLNET_CHAIN 3
+#define NCCL_ALGO_NVLS 4
+#define NCCL_ALGO_NVLS_TREE 5
+
+#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
+#define NCCL_PROTO_UNDEF -1
+#define NCCL_PROTO_LL 0
+#define NCCL_PROTO_LL128 1
+#define NCCL_PROTO_SIMPLE 2
+
+#endif
@@ -8,6 +8,8 @@
 #define NCCL_NET_H_

 #include "nccl.h"
+#include "nccl_common.h"
+#include "net_device.h"
 #include <stdint.h>

 #define NCCL_NET_HANDLE_MAXSIZE 128
@@ -17,13 +19,89 @@
 #define NCCL_PTR_DMABUF 0x4

 // Maximum number of requests per comm object
-#define NCCL_NET_MAX_REQUESTS 8
+#define NCCL_NET_MAX_REQUESTS 32

-typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
-typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;
+typedef struct {
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+} ncclNetProperties_v7_t;

-typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
+typedef ncclNetProperties_v7_t ncclNetProperties_t;

+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+} ncclNet_v7_t;
+
+typedef ncclNet_v7_t ncclNet_t;
+
+#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v7
+
+#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v7
+
+#define NCCL_NET_MAX_REQUESTS_V6 8
+
+// v6 struct for backwards compatibility
 typedef struct {
  char* name;     // Used mostly for logging.
  char* pciPath;  // Path to the PCI device in /sys.
@@ -35,9 +113,7 @@ typedef struct {
  float latency;  // Network latency
  int maxComms;   // Maximum number of comms we can create
  int maxRecvs;   // Maximum number of grouped receives.
-}ncclNetProperties_v6_t;
-
-typedef ncclNetProperties_v6_t ncclNetProperties_t;
+} ncclNetProperties_v6_t;

 typedef struct {
  // Name of the network (mainly for logs)
@@ -86,10 +162,49 @@ typedef struct {
  ncclResult_t (*closeListen)(void* listenComm);
 } ncclNet_v6_t;

-typedef ncclNet_v6_t ncclNet_t;
+typedef struct {
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. handles have been created
+  // using listen() above. rank indicates caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Returns whether a reduction operation on a data type is supported.
+  // 1 for supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Performs an asynchronous allreduce operation on the collective group.
+  // May return request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free collective comm objects
+  ncclResult_t (*closeColl)(void* collComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclCollNet_v7_t;

-#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v6
+typedef ncclCollNet_v7_t ncclCollNet_t;

+// v6 struct for backwards compatibility
 typedef struct {
  // Name of the collective network (mainly for logs)
  const char* name;
@@ -130,10 +245,6 @@ typedef struct {
  ncclResult_t (*closeListen)(void* listenComm);
 } ncclCollNet_v6_t;

-typedef ncclCollNet_v6_t ncclCollNet_t;
-
-#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v6
-
 // v5 struct for backwards compatibility
 typedef struct {
  // Name of the network (mainly for logs)
@@ -219,95 +330,4 @@ typedef struct {
  ncclResult_t (*closeListen)(void* listenComm);
 } ncclCollNet_v5_t;

-// v4 struct for backwards compatibility
-typedef struct {
-  char* name;     // Used mostly for logging.
-  char* pciPath;  // Path to the PCI device in /sys.
-  uint64_t guid;  // Unique identifier for the NIC chip. Important for
-                  // cards with multiple PCI functions (Physical or virtual).
-  int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA
-  int speed;      // Port speed in Mbps.
-  int port;       // Port number.
-  int maxComms;   // Maximum number of comms we can create
-} ncclNetProperties_v4_t;
-
-// v4 struct for backwards compatibility
-typedef struct {
-  // Name of the network (mainly for logs)
-  const char* name;
-  // Initialize the network.
-  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
-  // Return the number of adapters.
-  ncclResult_t (*devices)(int* ndev);
-  // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
-  // Create a receiving object and provide a handle to connect to it. The
-  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
-  // between ranks to create a connection.
-  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
-  // Connect to a handle and return a sending comm object for that peer.
-  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
-  // Finalize connection establishment after remote peer has called connectHandle
-  ncclResult_t (*accept)(void* listenComm, void** recvComm);
-  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
-  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
-  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
-  ncclResult_t (*deregMr)(void* comm, void* mhandle);
-  // Asynchronous send to a peer.
-  // May return request == NULL if the call cannot be performed (or would block)
-  ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
-  // Asynchronous recv from a peer.
-  // May return request == NULL if the call cannot be performed (or would block)
-  ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
-  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
-  // visible to the GPU
-  ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request);
-  // Test whether a request is complete. If size is not NULL, it returns the
-  // number of bytes sent/received.
-  ncclResult_t (*test)(void* request, int* done, int* size);
-  // Close and free send/recv comm objects
-  ncclResult_t (*closeSend)(void* sendComm);
-  ncclResult_t (*closeRecv)(void* recvComm);
-  ncclResult_t (*closeListen)(void* listenComm);
-} ncclNet_v4_t;
-
-// v4 struct for backwards compatibility
-typedef struct {
-  // Name of the collective network (mainly for logs)
-  const char* name;
-  // Initialize the collective network.
-  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
-  // Return the number of adapters capable of doing collective operations.
-  // If ndev returns 0, all other functions might be set to NULL.
-  ncclResult_t (*devices)(int* ndev);
-  // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
-  // Create a receiving object and provide a handle to connect to it. The
-  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
-  // between ranks to create connections.
-  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
-  // Create a group for collective operations. handles have been created
-  // using listen() above. rank indicates caller's rank in the collective network.
-  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
-  // Returns whether a reduction operation on a data type is supported.
-  // 1 for supported, 0 otherwise.
-  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
-  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
-  ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
-  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
-  // Performs an asynchronous allreduce operation on the collective group.
-  // May return request == NULL if the call cannot be performed (or would block).
-  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
-      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
-  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
-  // visible to the GPU
-  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
-  // Test whether a request is complete. If size is not NULL, it returns the
-  // number of bytes sent/received.
-  ncclResult_t (*test)(void* request, int* done, int* size);
-  // Close and free collective comm objects
-  ncclResult_t (*closeColl)(void* collComm);
-  ncclResult_t (*closeListen)(void* listenComm);
-} ncclCollNet_v4_t;
-
 #endif // end include guard
@@ -0,0 +1,55 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_TUNER_H_
+#define NCCL_TUNER_H_
+
+#include "nccl.h"
+#include "nccl_common.h"
+
+// API to be implemented by external tuner
+typedef struct {
+  // Name of the tuner
+  const char* name;
+
+  // Initializes tuner states.
+  // nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
+  // nNodes: number of nodes in current communicator.
+  // logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction);
+
+  // Gets info (algorithm, protocol, number of channels) for a given collective.
+  // Inputs:
+  //   - collType: collective type, e.g., allreduce, allgather...
+  //   - nBytes: collective size in bytes
+  //   - collNetSupport: whether collnet supports this type
+  //   - nvlsSupport: whether nvlink sharp supports this type
+  //   - numPipeOps: number of operations in the group
+  //
+  // Outputs:
+  //   - algorithm: selected algorithm to be used for the given collective
+  //   - protocol: selected protocol to be used for the given collective
+  //   - nChannels: number of channels (hence SMs) to be used.
+  //
+  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+  // default tuning for the given collective.
+  // Also, the plugin is allowed to not set any output, or set only the
+  // algorithm and protocol, but not only the algorithm or only the protocol.
+  // Unset fields will be set automatically by NCCL.
+  ncclResult_t (*getCollInfo)(ncclFunc_t collType, size_t nBytes,
+                              int collNetSupport, int nvlsSupport, int numPipeOps,
+                              int *algorithm, int *protocol, int* nChannels);
+
+  // Terminates the plugin and cleans up any resources that the plugin allocated.
+  ncclResult_t (*destroy)();
+} ncclTuner_v1_t;
+
+typedef ncclTuner_v1_t ncclTuner_t;
+
+#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v1"
+
+#endif
@@ -0,0 +1,29 @@
+/*************************************************************************
+ * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_NET_DEVICE_H_
+#define NCCL_NET_DEVICE_H_
+
+#define NCCL_NET_DEVICE_INVALID_VERSION      0x0
+#define NCCL_NET_MTU_SIZE                    4096
+
+// Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin
+// version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
+#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7  
+
+typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType;
+
+typedef struct {
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+  void* handle;                    // Opaque pointer; presumably plugin-defined device handle data -- TODO confirm
+  size_t size;                     // Presumably the size in bytes of the data behind handle -- TODO confirm
+  int needsProxyProgress;          // Presumably non-zero if the proxy must still drive progress -- TODO confirm
+} ncclNetDeviceHandle_v7_t;
+
+typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t; // Unversioned name, currently bound to the v7 layout
+
+#endif
@@ -160,7 +160,12 @@ typedef union nvmlValue_st
 #define NVML_FI_DEV_NVLINK_GET_SPEED                  164
 #define NVML_FI_DEV_NVLINK_GET_STATE                  165
 #define NVML_FI_DEV_NVLINK_GET_VERSION                166
-#define NVML_FI_MAX 167 //!< One greater than the largest field ID defined above
+
+#define NVML_FI_DEV_C2C_LINK_COUNT                    170 //!< Number of C2C Links present on the device
+#define NVML_FI_DEV_C2C_LINK_GET_STATUS               171 //!< C2C Link Status 0=INACTIVE 1=ACTIVE
+#define NVML_FI_DEV_C2C_LINK_GET_MAX_BW               172 //!< C2C Link Speed in MBps for active links
+
+#define NVML_FI_MAX 173 //!< One greater than the largest field ID defined above

 /**
 * Information for a Field Value Sample
@@ -12,7 +12,7 @@
 #define NCCL_P2P_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR

 typedef struct {
-  int data; // Currently only support an fd based descriptor
+  uint64_t data; // Needs to hold a CUmemGenericAllocationHandle for UDS fd support
 } ncclCuDesc;

 typedef union {
@@ -12,6 +12,7 @@
 const char* userHomeDir();
 void setEnvFile(const char* fileName);
 void initEnv();
+const char *ncclGetEnv(const char *name);

 void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache);

@@ -7,10 +7,11 @@
 #ifndef NCCL_PROXY_H_
 #define NCCL_PROXY_H_

-#include "devcomm.h"
+#include "device.h"
 #include "info.h"
 #include "socket.h"
 #include "ipcsocket.h"
+#include "nccl_net.h"
 #include <pthread.h>
 #include "shm.h"
 #include "p2p.h"
@@ -65,6 +66,8 @@ struct ncclProxySubArgs {
  uint64_t end;
  void* requests[NCCL_STEPS];
  void* profilingEvents[NCCL_STEPS];
+  void* recvRequestsCache[NCCL_STEPS];
+  int recvRequestsSubCount;
 };

 struct ncclProxyArgs {
@@ -146,7 +149,7 @@ struct ncclProxyProgressState {
  char opsPoolShmSuffix[6];

  pthread_t thread;
-  bool stop;
+  volatile int stop;
  struct ncclProxyPeer** localPeers;
  struct ncclSharedNetComms* netComms[NCCL_MAX_NETDEVS];
  struct ncclProxyArgs* active;
@@ -157,11 +160,12 @@ struct ncclProxyProgressState {

 // Expected proxy response fifo
 struct ncclExpectedProxyResponse {
-  void*    opId;
-  int      respSize;
-  bool     done;
-  void*    respBuff;
-  struct   ncclExpectedProxyResponse* next;
+  void*                             opId;
+  int                               respSize;
+  bool                              done;
+  void*                             respBuff;
+  ncclResult_t                      res;
+  struct ncclExpectedProxyResponse* next;
 };

 struct ncclProxyAsyncOp {
@@ -181,7 +185,16 @@ struct ncclProxyLocalPeer {
  int asyncOpCounter;
 };

+// Common response header for all proxyOps
+// We pack this into a struct to reduce the number of blocking send and recv calls
+struct ncclProxyRpcResponseHeader {
+  void* opId;       // Presumably matches the opId of the request being answered -- TODO confirm
+  ncclResult_t res; // Presumably the result of the proxy operation -- TODO confirm
+  int respSize;     // Presumably the size in bytes of the response payload -- TODO confirm
+};
+
 struct ncclProxyState {
+  int internalRefCount;
  int refCount;
  int tpRank;
  int tpnRanks;
@@ -196,11 +209,13 @@ struct ncclProxyState {
  ncclNet_t* ncclNet;
  ncclCollNet_t* ncclCollNet;
  volatile uint32_t* abortFlag;
+  volatile uint32_t* abortFlagRefCount;
  // Service thread
  pthread_t thread;
  struct ncclSocket* listenSock;
-  int stop;
+  volatile int stop;
  CUcontext cudaCtx;
+  ncclResult_t asyncResult;

  // Used by main thread
  union ncclSocketAddress* peerAddresses;
@@ -233,8 +248,11 @@ struct ncclProxyConnection {
  struct ncclProxyArgs *proxyAppend;
  struct ncclProxyArgs **proxyAppendPtr;
  void* transportResources;
+  ncclNetDeviceHandle_t* netDeviceHandle;
+  void* mhandles[NCCL_NUM_PROTOCOLS];
  proxyConnectState state;
  struct ncclCollNetSharedRes* collNet;
+  int needsProxyProgress;
 };

 typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
@@ -260,7 +278,7 @@ enum ncclProxyMsgType {
  ncclProxyMsgClose = 6,
  ncclProxyMsgAbort = 7,
  ncclProxyMsgStop = 8,
-  ncclProxyMsgConvertFd = 9, // cuMem API support (UDS)
+  ncclProxyMsgGetFd = 9, // cuMem API support (UDS)
 };

 // This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types
@@ -272,9 +290,10 @@ ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector
 ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
 ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId);

-ncclResult_t ncclProxyClientConvertFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int fd, int* convertedFd);
+ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void *handle, int* convertedFd);

 ncclResult_t ncclProxyStop(struct ncclComm* comm);
 ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm);
-ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
+ncclResult_t ncclProxyDestroy(struct ncclProxyState *proxyState);
+ncclResult_t ncclProxyTryDetach(struct ncclProxyState *proxyState);
 #endif
@@ -14,4 +14,12 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
 ncclResult_t ncclShmClose(ncclShmHandle_t handle);
 ncclResult_t ncclShmUnlink(ncclShmHandle_t handle);

+struct ncclShmemCollBuff {
+  volatile size_t *cnt[2]; // NOTE(review): two counters, presumably one per alternating round -- confirm in ncclShmemAllgather
+  volatile void *ptr[2];   // NOTE(review): two data buffers, presumably paired with cnt[] -- confirm
+  int round;               // NOTE(review): presumably selects which of the two entries is in use -- confirm
+};
+
+ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize);
+
 #endif
@@ -7,7 +7,7 @@
 #ifndef NCCL_TRANSPORT_H_
 #define NCCL_TRANSPORT_H_

-#include "devcomm.h"
+#include "device.h"
 #include "graph.h"
 #include "nvmlwrap.h"
 #include "core.h"
@@ -65,6 +65,7 @@ struct ncclNvlsSharedRes {
  CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer
  char* ucBuff; // Unicast NVLS buffer address
  char shareableHandle[NVLS_HANDLE_SIZE];
+  size_t ucGran;
  int nChannels;
 };

@@ -102,8 +103,20 @@ struct ncclTransport {
 ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
 ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL);

+// Currently we only support POSIX_FILE_DESCRIPTOR handle exchange
+#define USE_POSIX_FD 1
+
+#if USE_POSIX_FD
+#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
+#else
+#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_NONE
+#endif
+
 ncclResult_t ncclNvlsInit(struct ncclComm* comm);
 ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent);
+ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv);
+ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv);
+ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size);
 ncclResult_t ncclNvlsFree(struct ncclComm* comm);

 enum { collNetRecv=0, collNetSend=1 };
@@ -0,0 +1,22 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_INT_TUNER_H_
+#define NCCL_INT_TUNER_H_
+
+#include "nccl_tuner.h"
+
+// Tuning plugin to override NCCL's default algorithm/protocol tuning.
+
+// Attempts to load NCCL tuner from environment variable.
+// Returns ncclSuccess if the correct tuner symbol has been found and
+// successfully loaded.  Otherwise returns an error and also logs the error.
+ncclResult_t ncclLoadTunerPlugin(ncclTuner_t** tuner);
+
+// Cleans up NCCL tuner plugin.
+ncclResult_t ncclCloseTunerPlugin(ncclTuner_t** tuner);
+#endif
@@ -13,6 +13,7 @@
 #include <stdint.h>
 #include <time.h>
 #include <sched.h>
+#include <algorithm>
 #include <new>

 int ncclCudaCompCap();
@@ -259,11 +260,6 @@ struct ncclMemoryPool {
  struct Cell {
    Cell *next;
  };
-  template<int Size, int Align>
-  union CellSized {
-    Cell cell;
-    alignas(Align) char space[Size];
-  };
  struct Cell* head;
  struct Cell* tail; // meaningful only when head != nullptr
 };
@@ -275,14 +271,15 @@ inline void ncclMemoryPoolConstruct(struct ncclMemoryPool* me) {
 template<typename T>
 inline T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing) {
  using Cell = ncclMemoryPool::Cell;
-  using CellSized = ncclMemoryPool::CellSized<sizeof(T), alignof(T)>;
  Cell* cell;
  if (__builtin_expect(me->head != nullptr, true)) {
    cell = me->head;
    me->head = cell->next;
  } else {
    // Use the internal allocate() since it doesn't memset to 0 yet.
-    cell = (Cell*)ncclMemoryStack::allocate(backing, sizeof(CellSized), alignof(CellSized));
+    size_t cellSize = std::max(sizeof(Cell), sizeof(T));
+    size_t cellAlign = std::max(alignof(Cell), alignof(T));
+    cell = (Cell*)ncclMemoryStack::allocate(backing, cellSize, cellAlign);
  }
  memset(cell, 0, sizeof(T));
  return reinterpret_cast<T*>(cell);
@@ -349,6 +346,32 @@ inline T* ncclIntruQueueDequeue(ncclIntruQueue<T,next> *me) {
  return ans;
 }

+// Removes element x from the intrusive queue, wherever it sits.
+// The queue is walked from the head while remembering the element that
+// precedes the current one, so x can be bypassed once it is reached.
+// Returns true if x was found and unlinked, false if x is not in the queue.
+template<typename T, T *T::*next>
+inline bool ncclIntruQueueDelete(ncclIntruQueue<T,next> *me, T *x) {
+  T *before = nullptr; // predecessor of `node`; stays nullptr while at the head
+
+  for (T *node = me->head; node != nullptr; node = node->*next) {
+    if (node != x) {
+      before = node;
+      continue;
+    }
+
+    // Bypass the node: patch the predecessor's link, or the head itself.
+    if (before != nullptr) before->*next = node->*next;
+    else me->head = node->*next;
+
+    // Dropping the last element moves the tail back to its predecessor.
+    if (me->tail == node) me->tail = before;
+    return true;
+  }
+
+  return false;
+}
+
 template<typename T, T *T::*next>
 inline T* ncclIntruQueueTryDequeue(ncclIntruQueue<T,next> *me) {
  T *ans = me->head;
@@ -16,6 +16,7 @@
 #include "enqueue.h"
 #include "graph.h"
 #include "argcheck.h"
+#include "tuner.h"
 #include <fcntl.h>
 #include <string.h>
 #include <errno.h>
@@ -24,6 +25,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <unistd.h>
+#include "param.h"

 #define STR2(v) #v
 #define STR(v) STR2(v)
@@ -177,7 +179,13 @@ static ncclResult_t commFree(ncclComm_t comm) {
   * free all intra-process communicators; therefore, we only need to focus on local
   * resource cleanup in commFree(). */
  if (comm->proxyState && comm->proxyRefCountOld == 0 && comm->proxyState->thread) {
-    pthread_join(comm->proxyState->thread, nullptr);
+    if (*comm->abortFlag == 0) {
+      /* regular thread join */
+      pthread_join(comm->proxyState->thread, nullptr);
+    } else {
+      /* try to detach thread due to abort */
+      ncclProxyTryDetach(comm->proxyState);
+    }
  }

  delete[] comm->userRedOps;
@@ -211,7 +219,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
      free(comm->sharedRes->tpRankToLocalRank);
      NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->hostStream));
      NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->deviceStream));
-      NCCLCHECK(ncclProxyDestroy(comm));
+      NCCLCHECK(ncclProxyDestroy(comm->sharedRes->proxyState));
      free(comm->sharedRes);
    }
  }
@@ -229,13 +237,25 @@ static ncclResult_t commFree(ncclComm_t comm) {

  if (ncclAtomicRefCountDecrement(comm->abortFlagRefCount) == 0) {
    NCCLCHECK(ncclCudaHostFree((void *)comm->abortFlag));
-    free(comm->abortFlagRefCount);
+    free((void*)comm->abortFlagRefCount);
  }
  free((void*)comm->config.netName);

  free(comm->topParentRanks);
  free(comm->topParentLocalRanks);

+  while (!ncclIntruQueueEmpty(&comm->regRecordQueue)) {
+    struct ncclRegRecord* rec = ncclIntruQueueDequeue(&comm->regRecordQueue);
+    NCCLCHECK(ncclNvlsDeregBuffer(&rec->mcHandle, rec->regAddr, rec->dev, rec->regSize));
+    free(rec->addrs);
+    free(rec);
+  }
+
+  while (!ncclIntruQueueEmpty(&comm->regRequestQueue)) {
+    struct ncclRegRequest* req = ncclIntruQueueDequeue(&comm->regRequestQueue);
+    free(req);
+  }
+
  commPoison(comm); // poison comm before free to avoid comm reuse.
  free(comm);

@@ -275,7 +295,7 @@ ncclResult_t ncclCommEnsureReady(ncclComm_t comm) {
  ncclResult_t ret = ncclSuccess;

  if (*comm->abortFlag) {
-    ncclGroupJobAbort();
+    ncclGroupJobAbort(comm->groupJob);
  } else {
    NCCLCHECK(ncclCommGetAsyncError(comm, &ret));
    if (ret != ncclSuccess) {
@@ -284,6 +304,11 @@ ncclResult_t ncclCommEnsureReady(ncclComm_t comm) {
      if (ret == ncclInProgress) ret = ncclInvalidArgument;
      goto exit;
    }
+    /* if there is linked group job, we should complete it. */
+    if (comm->groupJob) {
+      NCCLCHECK(ncclGroupJobComplete(comm->groupJob));
+      comm->groupJob = NULL;
+    }
  }

 exit:
@@ -338,6 +363,7 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
  ncclMemoryPoolConstruct(&comm->memPool_ncclKernelPlan);
  ncclMemoryPoolConstruct(&comm->memPool_ncclProxyOp);
  ncclMemoryPoolConstruct(&comm->memPool_ncclPointerList);
+  ncclMemoryPoolConstruct(&comm->memPool_ncclNvlsHandleList);

  comm->groupNext = reinterpret_cast<struct ncclComm*>(0x1);
  comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
@@ -373,6 +399,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
      comm->topParentRanks[i] = i;
  }

+  ncclIntruQueueConstruct(&comm->regRequestQueue);
+  ncclIntruQueueConstruct(&comm->regRecordQueue);
  ncclIntruQueueMpscConstruct(&comm->callbackQueue);
  return ncclSuccess;
 }
@@ -393,6 +421,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
  for (int p=0; p < NCCL_NUM_PROTOCOLS; p++) {
    tmpCommAndChans.comm.buffSizes[p] = comm->buffSizes[p];
  }
+  tmpCommAndChans.comm.p2pChunkSize = comm->p2pChunkSize;
  tmpCommAndChans.comm.channels = &devCommAndChans->channels[0];

  comm->workFifoDepth = ncclParamWorkFifoDepth();
@@ -500,7 +529,6 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank,
 #define DEFAULT_LL_BUFFSIZE (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_STEPS*sizeof(union ncclLLFifoLine))
 #define DEFAULT_LL128_BUFFSIZE (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS*NCCL_STEPS*sizeof(uint64_t))
 #define DEFAULT_BUFFSIZE (1 << 22) /* 4MiB */
-#define DEFAULT_BUFFSIZE_ARM (1 << 20) /* 1MiB */
 NCCL_PARAM(BuffSize, "BUFFSIZE", -2);
 NCCL_PARAM(LlBuffSize, "LL_BUFFSIZE", -2);
 NCCL_PARAM(Ll128BuffSize, "LL128_BUFFSIZE", -2);
@@ -516,8 +544,6 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
  int64_t envs[NCCL_NUM_PROTOCOLS] = { ncclParamLlBuffSize(), ncclParamLl128BuffSize(), ncclParamBuffSize() };
  int defaults[NCCL_NUM_PROTOCOLS] = { DEFAULT_LL_BUFFSIZE, DEFAULT_LL128_BUFFSIZE, DEFAULT_BUFFSIZE };

-  if (cpuArch == NCCL_TOPO_CPU_ARCH_ARM) defaults[NCCL_PROTO_SIMPLE] = DEFAULT_BUFFSIZE_ARM;
-
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
    comm->buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p];
  }
@@ -525,6 +551,10 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
  if (comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize();
  else if (ncclTopoPathAllNVLink(comm->topo)) comm->p2pChunkSize = ncclParamP2pNvlChunkSize();
  else comm->p2pChunkSize = ncclParamP2pPciChunkSize();
+
+  // Make sure P2P chunksize is not larger than coll chunksize.
+  if (comm->p2pChunkSize * NCCL_STEPS > comm->buffSizes[NCCL_PROTO_SIMPLE]) comm->p2pChunkSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
+
  if (comm->sharedRes->owner != comm) {
    /* make sure split comm p2pChunkSize won't exceed shared p2pChunkSize. */
    comm->p2pChunkSize = std::min(comm->p2pChunkSize, comm->sharedRes->tpP2pChunkSize);
@@ -606,7 +636,7 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n
      if (share) {
        if (myinfo->isMaster) {
          comm->collNetSharedRes = parent->collNetSharedRes;
-          comm->collNetChannels = std::min(std::max(comm->nChannels, comm->nvlsChannels), parent->collNetSharedRes->nChannels);
+          comm->collNetChannels = std::min(comm->nChannels, parent->collNetSharedRes->nChannels);
          for (int c = 0; c < comm->collNetChannels; ++c)
            NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, true), ret, fail);
        }
@@ -625,8 +655,7 @@ static ncclResult_t collNetTrySetup(ncclComm_t comm, ncclComm_t parent, struct n
  } else {
    /* this allocated buffer will be freed on proxy side */
    NCCLCHECK(ncclCalloc(&comm->collNetSharedRes, 1));
-    /* TODO: min or max? */
-    comm->collNetChannels = comm->collNetSharedRes->nChannels = std::max(comm->nChannels, comm->nvlsChannels);
+    comm->collNetChannels = comm->collNetSharedRes->nChannels = comm->nChannels;
    comm->collNetSharedRes->buffSize = comm->buffSizes[NCCL_PROTO_SIMPLE];
    for (int c = 0; c < comm->collNetChannels; c++) {
      struct ncclChannel* channel = comm->channels + c;
@@ -804,6 +833,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
    int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
    for (int i = 0; i < nranks; i++) comm->minCompCap = std::min(comm->minCompCap, comm->peerInfo[rank].cudaCompCap);
    for (int i = 0; i < nranks; i++) comm->maxCompCap = std::max(comm->maxCompCap, comm->peerInfo[rank].cudaCompCap);
+
+    comm->nvlsRegSupport = 1;
    for (int i = 0; i < nranks; i++) {
      if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash)
          && (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) {
@@ -816,6 +847,16 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
          comm->intraNext = comm->peerInfo[i].comm;
        }
      }
+
+      if (comm->nvlsRegSupport) {
+        for (int j = i + 1; j < nranks; j++) {
+          if (comm->peerInfo[i].hostHash == comm->peerInfo[j].hostHash &&
+            comm->peerInfo[i].pidHash == comm->peerInfo[j].pidHash) {
+            comm->nvlsRegSupport = 0;
+            break;
+          }
+        }
+      }
    }
    TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
        rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0);
@@ -859,7 +900,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p

  // Determine local CollNet support
  if (collNetSupport(comm)) {
-    char *collNetEnable = getenv("NCCL_COLLNET_ENABLE");
+    const char *collNetEnable = ncclGetEnv("NCCL_COLLNET_ENABLE");
    if (collNetEnable != NULL) {
      INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable);
      if (strcmp(collNetEnable, "1") == 0) {
@@ -872,22 +913,23 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
  NCCLCHECK(ncclNvlsInit(comm));

  // Get rings and trees
+  memset(&ringGraph, 0, sizeof(struct ncclTopoGraph));
  ringGraph.id = 0;
  ringGraph.pattern = NCCL_TOPO_PATTERN_RING;
-  ringGraph.collNet = 0;
  ringGraph.minChannels = 1;
  ringGraph.maxChannels = MAXCHANNELS/2;
  NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &ringGraph), ret, fail);
  NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &ringGraph), ret, fail);

+  memset(&treeGraph, 0, sizeof(struct ncclTopoGraph));
  treeGraph.id = 1;
  treeGraph.pattern = NCCL_TOPO_PATTERN_BALANCED_TREE;
-  treeGraph.collNet = 0;
  treeGraph.minChannels = ringGraph.nChannels;
  treeGraph.maxChannels = ringGraph.nChannels;
  NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &treeGraph), ret, fail);
  NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &treeGraph), ret, fail);

+  memset(&collNetGraph, 0, sizeof(struct ncclTopoGraph));
  collNetGraph.id = 2;
  collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE;
  collNetGraph.collNet = 1;
@@ -895,20 +937,16 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
  if (comm->collNetSupport) {
    NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &collNetGraph), ret, fail);
    NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &collNetGraph), ret, fail);
-  } else {
-    collNetGraph.nChannels = 0;
  }

+  memset(&nvlsGraph, 0, sizeof(struct ncclTopoGraph));
  nvlsGraph.id = 3;
  nvlsGraph.pattern = NCCL_TOPO_PATTERN_NVLS;
-  nvlsGraph.collNet = 0;
  nvlsGraph.minChannels = 1;
  nvlsGraph.maxChannels = MAXCHANNELS;
  if (comm->nvlsSupport) {
    NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &nvlsGraph), ret, fail);
    NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &nvlsGraph), ret, fail);
-  } else {
-    nvlsGraph.nChannels = 0;
  }

  // Initialize num P2P LL buffers for this communicator
@@ -1197,7 +1235,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
  }

  if (comm->intraRank == 0) { // Load ncclParamLaunchMode
-    char* str = getenv("NCCL_LAUNCH_MODE");
+    const char* str = ncclGetEnv("NCCL_LAUNCH_MODE");
    enum ncclLaunchMode mode, modeOld;
    if (str && strcasecmp(str, "GROUP") == 0) {
      mode = ncclLaunchModeGroup;
@@ -1357,6 +1395,11 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {

  NCCLCHECKGOTO(initTransportsRank(comm, job->parent), res, fail);

+  NCCLCHECKGOTO(ncclLoadTunerPlugin(&comm->tuner), res, fail);
+  if (comm->tuner) {
+    NCCLCHECK(comm->tuner->init(comm->nRanks, comm->nNodes, ncclDebugLog));
+  }
+
  // update communicator state
  comm->initState = ncclSuccess;

@@ -1425,7 +1468,7 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) {
    comm->config.maxCTAs = maxCTAsEnv;
  }

-  envNetName = getenv("NCCL_NET");
+  envNetName = ncclGetEnv("NCCL_NET");
  if (envNetName)
    tmpNetName = envNetName;
  if (tmpNetName != NULL) {
@@ -1560,7 +1603,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni
  ncclResult_t res = ncclSuccess;
  ncclComm_t comm = NULL;
  struct ncclCommInitRankAsyncJob *job = NULL;
-  char* env = getenv("NCCL_COMM_ID");
+  const char* env = ncclGetEnv("NCCL_COMM_ID");
  if (env && myrank == 0) {
    INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
    NCCLCHECKGOTO(bootstrapCreateRoot((struct ncclBootstrapHandle*)&commId, true), res, fail);
@@ -1602,7 +1645,7 @@ exit:
 fail:
  if (comm) {
    if (comm->abortFlag) ncclCudaHostFree((void *)comm->abortFlag);
-    if (comm->abortFlagRefCount) free(comm->abortFlagRefCount);
+    if (comm->abortFlagRefCount) free((void*)comm->abortFlagRefCount);
    free(comm);
  }
  if (newcomm) *newcomm = NULL;
@@ -1777,6 +1820,11 @@ static ncclResult_t commCleanup(ncclComm_t comm) {
    CUDACHECK(cudaSetDevice(commDevice));
  }

+  if (comm->tuner != NULL) {
+    NCCLCHECK(comm->tuner->destroy());
+    NCCLCHECK(ncclCloseTunerPlugin(&comm->tuner));
+  }
+
  NCCLCHECK(commFree(comm));

  if (savedDevice != commDevice) {
@@ -1991,6 +2039,7 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc
  NCCLCHECK(ncclGroupStartInternal());
  NCCLCHECKGOTO(PtrCheck(comm, "CommSplit", "comm"), res, fail);
  NCCLCHECKGOTO(PtrCheck(newcomm, "CommSplit", "newcomm"), res, fail);
+  NCCLCHECKGOTO(ncclCommEnsureReady(comm), res, fail);

  /* *newcomm should be NCCL_COMM_NULL until comm split fully complete. */
  *newcomm = NCCL_COMM_NULL;
@@ -2037,7 +2086,7 @@ fail:
  if (childComm) {
    if (comm && !comm->config.splitShare) {
      if (childComm->abortFlag) ncclCudaHostFree((void*)childComm->abortFlag);
-      if (childComm->abortFlagRefCount) free(childComm->abortFlagRefCount);
+      if (childComm->abortFlagRefCount) free((void*)childComm->abortFlagRefCount);
    }
    free(childComm);
  }
@@ -2074,6 +2123,7 @@ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
  NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError"));

  *asyncError = __atomic_load_n(&comm->asyncResult, __ATOMIC_ACQUIRE);
+  if (*asyncError == ncclSuccess && comm->proxyState) *asyncError = __atomic_load_n(&comm->proxyState->asyncResult, __ATOMIC_ACQUIRE);
  return ncclSuccess;
 }

@@ -2116,3 +2166,208 @@ ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
  *rank = comm->rank;
  return ncclSuccess;
 }
+
+NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1);
+
+NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle);
+// Records a request to register the user buffer [buff, buff+size) with comm for NVLS and
+// returns it through *handle for a later ncclCommDeregister(). The range is taken as given
+// when its address and size are aligned; otherwise the whole allocation containing buff is
+// tried. *handle is NULL whenever no request was recorded. Returns ncclInvalidArgument for
+// bad arguments or when neither range is suitably aligned.
+ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) {
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  ncclResult_t ret = ncclSuccess;
+
+  /* Never leave the caller's handle indeterminate on the paths that register nothing. */
+  if (handle != NULL) *handle = NULL;
+
+#if CUDART_VERSION >= 12010
+  size_t granularity;
+  if (ncclParamLocalRegister()) {
+    if (comm == NCCL_COMM_NULL || buff == NULL || handle == NULL || size == 0) {
+      WARN("Invalid arguments comm %p, buff %p, size %zu, handle %p", comm, buff, size, handle);
+      ret = ncclInvalidArgument;
+    } else if (comm->nvlsSupport) {
+      CUmulticastObjectProp prop = comm->nvlsResources->properties;
+      void* base = buff;
+      size_t baseSize = size;
+
+      prop.size = size;
+      CUCHECK(cuMulticastGetGranularity(&granularity, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
+
+      if ((uintptr_t)base % comm->nvlsResources->ucGran != 0 || baseSize % granularity != 0) {
+        /* ncclMemAlloc does not report the size it actually allocated, so fall back to the
+         * full range of the allocation containing buff (cuMemGetAddressRange). */
+        CUCHECK(cuMemGetAddressRange((CUdeviceptr*)&base, &baseSize, (CUdeviceptr)buff));
+      }
+
+      if ((uintptr_t)base % comm->nvlsResources->ucGran == 0 && baseSize % granularity == 0) {
+        struct ncclRegRequest* req;
+        NCCLCHECK(ncclCalloc(&req, 1));
+        req->buff = (uintptr_t)base;
+        req->size = baseSize;
+        ncclIntruQueueEnqueue(&comm->regRequestQueue, req);
+        *handle = (void*)req;
+      } else {
+        WARN("register fails, buffer %p (aligned %s, granularity %zu) and size %zu (aligned %s, granularity %zu) for registration", buff, (uintptr_t)buff % comm->nvlsResources->ucGran == 0 ? "TRUE" : "FALSE", comm->nvlsResources->ucGran, size, size % granularity == 0 ? "TRUE" : "FALSE", granularity);
+        ret = ncclInvalidArgument;
+      }
+    }
+  }
+#endif
+
+  return ret;
+}
+
+NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle);
+// Undoes ncclCommRegister(): unlinks and frees the request behind handle and releases the
+// register record matching its buffer and size, if one exists. Returns ncclInvalidArgument
+// when comm or handle is NULL or when handle is not a request currently queued on comm.
+// Does nothing when local registration is disabled.
+ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) {
+  ncclResult_t ret = ncclSuccess;
+
+#if CUDART_VERSION >= 12010
+  struct ncclRegRequest* dreq = (struct ncclRegRequest*)handle;
+  if (!ncclParamLocalRegister()) return ret;
+  if (comm == NCCL_COMM_NULL || handle == NULL) {
+    WARN("Invalid arguments comm %p, handle %p", comm, handle);
+    return ncclInvalidArgument;
+  }
+
+  /* Validate the handle before reading through it: only a pointer still linked in the
+   * request queue is known to come from ncclCommRegister(). */
+  if (ncclIntruQueueDelete(&comm->regRequestQueue, dreq) == false) {
+    WARN("Invalid handle %p", handle);
+    return ncclInvalidArgument;
+  }
+
+  /* Release the register record created for this request, if any. */
+  for (struct ncclRegRecord* rec = ncclIntruQueueHead(&comm->regRecordQueue); rec; rec = rec->next) {
+    if (rec->buff != dreq->buff || rec->size != dreq->size) continue;
+    NCCLCHECKGOTO(ncclNvlsDeregBuffer(&rec->mcHandle, rec->regAddr, rec->dev, rec->regSize), ret, done);
+    ncclIntruQueueDelete(&comm->regRecordQueue, rec);
+    free(rec->addrs);
+    free(rec);
+    break;
+  }
+done:
+  /* The request was unlinked above, so this is the only place left to free it. */
+  free(dreq);
+#endif
+  return ret;
+}
+
+NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size);
+// Allocates size bytes on the current CUDA device and stores the address in *ptr. Devices
+// with multicast support get cuMem memory (size rounded up to the multicast granularity,
+// read/write access for the device and its P2P-capable peers); otherwise cudaMalloc() is used.
+// A cuMem failure after cuMemCreate() releases what was built and sets *ptr to NULL.
+ncclResult_t  ncclMemAlloc(void **ptr, size_t size) {
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  ncclResult_t ret = ncclSuccess;
+
+#if CUDART_VERSION >= 12010
+  size_t memGran = 0, mcGran = 0;
+  CUdevice currentDev;
+  CUmemAllocationProp memprop = {};
+  CUmulticastObjectProp mcprop = {};
+  CUmemAccessDesc accessDesc = {};
+  CUmemGenericAllocationHandle handle;
+  int cudaDev, dcnt;
+  int flag = 0, mcSupport = 0;
+
+  if (ptr == NULL || size == 0) goto fallback;
+  if (ncclCudaLibraryInit() != ncclSuccess) goto fallback;
+
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  CUCHECK(cuDeviceGet(&currentDev, cudaDev));
+  if (CUPFN(cuMulticastCreate) != NULL)
+    CUCHECK(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, currentDev));
+
+  if (mcSupport) {
+    memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+    memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    memprop.requestedHandleTypes = NVLS_CU_MEM_HANDLE_TYPE;
+    memprop.location.id = currentDev;
+    // Query device to see if RDMA support is available
+    CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev));
+    if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1;
+    CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
+    /* mc property */
+    CUDACHECK(cudaGetDeviceCount(&dcnt));
+    mcprop.size = size;
+    /* device cnt is a dummy value right now, it might affect mc granularity in the future. */
+    mcprop.numDevices = dcnt;
+    mcprop.handleTypes = NVLS_CU_MEM_HANDLE_TYPE;
+    mcprop.flags = 0;
+    CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
+    /* only size needs to be aligned to mcGran */
+    ALIGN_SIZE(size, mcGran);
+    /* Create the physical memory, reserve a virtual range and map it; later failures unwind. */
+    CUCHECK(cuMemCreate(&handle, size, &memprop, 0));
+    CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)ptr, size, memGran, 0, 0), ret, release);
+    CUCHECKGOTO(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0), ret, unreserve);
+    /* Now allow RW access to the newly mapped memory */
+    for (int i = 0; i < dcnt; ++i) {
+      int p2p = 0;
+      if (i == cudaDev || ((cudaDeviceCanAccessPeer(&p2p, cudaDev, i) == cudaSuccess) && p2p)) {
+        accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+        accessDesc.location.id = i;
+        accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+        CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1), ret, unmap);
+      }
+    }
+    goto exit;
+unmap: /* best-effort unwinding, in reverse order of construction; ret keeps the first error */
+    (void)CUPFN(cuMemUnmap)((CUdeviceptr)*ptr, size);
+unreserve:
+    (void)CUPFN(cuMemAddressFree)((CUdeviceptr)*ptr, size);
+release:
+    (void)CUPFN(cuMemRelease)(handle);
+    *ptr = NULL;
+    goto exit;
+  }
+fallback:
+#endif
+  CUDACHECKGOTO(cudaMalloc(ptr, size), ret, exit);
+exit:
+  return ret;
+}
+
+NCCL_API(ncclResult_t, ncclMemFree, void *ptr); // Frees device memory through ncclCuMemFree() or cudaFree()
+ncclResult_t  ncclMemFree(void *ptr) {
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  ncclResult_t ret = ncclSuccess;
+  int saveDevice; // device that is current on entry; switched back to at exit
+
+  CUDACHECK(cudaGetDevice(&saveDevice));
+#if CUDART_VERSION >= 12010
+  CUdevice ptrDev = 0;
+  int mcSupport = 0;
+
+  if (ptr == NULL) goto fallback; // a NULL pointer goes straight to cudaFree()
+
+  if (ncclCudaLibraryInit() != ncclSuccess) goto fallback; // CUDA library init failed: skip the driver-API path
+  // Look up the device ordinal that owns ptr, then ask whether that device supports multicast.
+  CUCHECKGOTO(cuPointerGetAttribute((void*)&ptrDev, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)ptr), ret, fail);
+  if (CUPFN(cuMulticastCreate) != NULL)
+    CUCHECKGOTO(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, ptrDev), ret, fail);
+  // Make the owning device current before freeing.
+  CUDACHECKGOTO(cudaSetDevice((int)ptrDev), ret, fail);
+  if (mcSupport) { // NOTE(review): presumably mirrors the cuMem path of ncclMemAlloc() -- confirm
+    NCCLCHECKGOTO(ncclCuMemFree(ptr), ret, fail);
+    goto exit;
+  }
+
+fallback:
+#endif
+  CUDACHECKGOTO(cudaFree(ptr), ret, fail);
+
+exit:
+  cudaSetDevice(saveDevice); // best effort: the result of restoring the device is not checked
+  return ret;
+fail:
+  goto exit;
+}
@@ -12,7 +12,7 @@
 #include <dlfcn.h>

 // This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage
-NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", 0);
+NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", -2);

 static int ncclCuMemSupported = 0;

@@ -43,7 +43,9 @@ error:
 }

 int ncclCuMemEnable() {
-  return ((ncclParamCuMemEnable() == -2 && ncclCuMemSupported) || ncclParamCuMemEnable());
+  // NCCL_CUMEM_ENABLE=-2 means auto-detect CUMEM support
+  int param = ncclParamCuMemEnable();
+  return  param >= 0 ? param : (param == -2 && ncclCuMemSupported);
 }

 #define DECLARE_CUDA_PFN(symbol,version) PFN_##symbol##_v##version pfn_##symbol = nullptr
@@ -74,6 +76,8 @@ DECLARE_CUDA_PFN(cuMemRelease, 10020);
 DECLARE_CUDA_PFN(cuMemRetainAllocationHandle, 11000);
 DECLARE_CUDA_PFN(cuMemSetAccess, 10020);
 DECLARE_CUDA_PFN(cuMemUnmap, 10020);
+/* ncclMemAlloc/Free */
+DECLARE_CUDA_PFN(cuPointerGetAttribute, 4000);
 #if CUDA_VERSION >= 11070
 /* transport/collNet.cc/net.cc*/
 DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support
@@ -137,6 +141,8 @@ static ncclResult_t cudaPfnFuncLoader(void) {
  LOAD_SYM(cuMemRetainAllocationHandle, 11000, 1);
  LOAD_SYM(cuMemSetAccess, 10020, 1);
  LOAD_SYM(cuMemUnmap, 10020, 1);
+/* ncclMemAlloc/Free */
+  LOAD_SYM(cuPointerGetAttribute, 4000, 1);
 #if CUDA_VERSION >= 11070
  LOAD_SYM(cuMemGetHandleForAddressRange, 11070, 1); // DMA-BUF support
 #endif
@@ -158,7 +164,7 @@ static ncclResult_t initResult;

 static void initOnceFunc() {
  do {
-    char* val = getenv("CUDA_LAUNCH_BLOCKING");
+    const char* val = ncclGetEnv("CUDA_LAUNCH_BLOCKING");
    ncclCudaLaunchBlocking = val!=nullptr && val[0]!=0 && !(val[0]=='0' && val[1]==0);
  } while (0);

@@ -167,7 +173,7 @@ static void initOnceFunc() {
   * Load CUDA driver library
   */
  char path[1024];
-  char *ncclCudaPath = getenv("NCCL_CUDA_PATH");
+  const char *ncclCudaPath = ncclGetEnv("NCCL_CUDA_PATH");
  if (ncclCudaPath == NULL)
    snprintf(path, 1024, "%s", "libcuda.so");
  else
@@ -50,6 +50,9 @@ ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols) {
  ASSIGN_SYM(ibvSymbols, ibv_destroy_qp, ibv_internal_destroy_qp);
  ASSIGN_SYM(ibvSymbols, ibv_fork_init, ibv_internal_fork_init);
  ASSIGN_SYM(ibvSymbols, ibv_event_type_str, ibv_internal_event_type_str);
+  
+  ASSIGN_SYM(ibvSymbols, ibv_query_ece, ibv_internal_query_ece);
+  ASSIGN_SYM(ibvSymbols, ibv_set_ece, ibv_internal_set_ece);

  ibvSymbols->ibv_internal_reg_mr = &ibv_internal_reg_mr;
  ibvSymbols->ibv_internal_query_port = &ibv_internal_query_port;
@@ -123,6 +126,9 @@ ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols) {
  LOAD_SYM(ibvhandle, "ibv_fork_init", ibvSymbols->ibv_internal_fork_init);
  LOAD_SYM(ibvhandle, "ibv_event_type_str", ibvSymbols->ibv_internal_event_type_str);

+  LOAD_SYM_VERSION(ibvhandle, "ibv_query_ece", ibvSymbols->ibv_internal_query_ece, "IBVERBS_1.10");
+  LOAD_SYM_VERSION(ibvhandle, "ibv_set_ece",   ibvSymbols->ibv_internal_set_ece, "IBVERBS_1.10");
+
  return ncclSuccess;

 teardown:
@@ -150,6 +156,8 @@ teardown:
  ibvSymbols->ibv_internal_destroy_qp = NULL;
  ibvSymbols->ibv_internal_fork_init = NULL;
  ibvSymbols->ibv_internal_event_type_str = NULL;
+  ibvSymbols->ibv_internal_query_ece = NULL;
+  ibvSymbols->ibv_internal_set_ece = NULL;

  if (ibvhandle != NULL) dlclose(ibvhandle);
  return ncclSystemError;
@@ -45,11 +45,30 @@ ncclResult_t wrap_ibv_symbols(void) {
  } \
  return ncclSuccess;

+/* Body of a wrapper around an OPTIONAL ibverbs entry point. Expands to the
+ * complete function body (it returns on every path):
+ *  - container.internal_name is NULL (symbol not loaded): *supported = 0, ncclSuccess
+ *  - call returns ENOTSUP or EOPNOTSUPP:                  *supported = 0, ncclSuccess
+ *  - call returns any other value != success_retval:      *supported = 1, ncclSystemError
+ *  - call returns success_retval:                         *supported = 1, ncclSuccess
+ * internal_name is stringified with '#' so the skip message names the missing
+ * symbol instead of printing the literal text "internal_name". */
+#define IBV_INT_CHECK_RET_ERRNO_OPTIONAL(container, internal_name, call, success_retval, name, supported) \
+  if (container.internal_name == NULL) { \
+    INFO(NCCL_NET, "Call to " name " skipped, " #internal_name " doesn't exist"); \
+    *supported = 0; \
+    return ncclSuccess; \
+  } \
+  int ret = container.call; \
+  if (ret == ENOTSUP || ret == EOPNOTSUPP) { \
+    INFO(NCCL_NET, "Call to " name " failed with error %s errno %d", strerror(ret), ret); \
+    *supported = 0; \
+    return ncclSuccess; \
+  } else if (ret != success_retval) { \
+    WARN("Call to " name " failed with error %s errno %d", strerror(ret), ret); \
+    *supported = 1; \
+    return ncclSystemError; \
+  } \
+  *supported = 1; \
+  return ncclSuccess;
+
 #define IBV_INT_CHECK_RET_ERRNO(container, internal_name, call, success_retval, name) \
  CHECK_NOT_NULL(container, internal_name); \
  int ret = container.call; \
  if (ret != success_retval) { \
-    WARN("Call to " name " failed with error %s", strerror(ret)); \
+    WARN("Call to " name " failed with error %s errno %d", strerror(ret), ret); \
    return ncclSystemError; \
  } \
  return ncclSuccess;
@@ -187,6 +206,14 @@ ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int
  IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_modify_qp, ibv_internal_modify_qp(qp, attr, attr_mask), 0, "ibv_modify_qp");
 }

+// Wrapper for the optional ibv_query_ece() entry point, implemented entirely by
+// IBV_INT_CHECK_RET_ERRNO_OPTIONAL: *supported is set to 0 (and ncclSuccess is
+// returned) when the symbol was not loaded or the call reports
+// ENOTSUP/EOPNOTSUPP; any other non-zero result yields ncclSystemError.
+// The trailing comment on the signature describes ibv_query_ece() itself.
+ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
+  IBV_INT_CHECK_RET_ERRNO_OPTIONAL(ibvSymbols, ibv_internal_query_ece, ibv_internal_query_ece(qp, ece), 0, "ibv_query_ece", supported);
+}
+
+// Wrapper for the optional ibv_set_ece() entry point, implemented entirely by
+// IBV_INT_CHECK_RET_ERRNO_OPTIONAL: *supported is set to 0 (and ncclSuccess is
+// returned) when the symbol was not loaded or the call reports
+// ENOTSUP/EOPNOTSUPP; any other non-zero result yields ncclSystemError.
+// The trailing comment on the signature describes ibv_set_ece() itself.
+ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
+  IBV_INT_CHECK_RET_ERRNO_OPTIONAL(ibvSymbols, ibv_internal_set_ece, ibv_internal_set_ece(qp, ece), 0, "ibv_set_ece", supported);
+}
+
 ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event) {
  *ret = (char *) ibvSymbols.ibv_internal_event_type_str(event);
  return ncclSuccess;
@@ -30,7 +30,7 @@ ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, v
  handle->fd = -1;
  handle->socketName[0] = '\0';
  if ((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0) {
-    WARN("UDS: Socket creation error : %d", errno);
+    WARN("UDS: Socket creation error : %s (%d)", strerror(errno), errno);
    return ncclSystemError;
  }

@@ -54,7 +54,7 @@ ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, v
  cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
 #endif
  if (bind(fd, (struct sockaddr *)&cliaddr, sizeof(cliaddr)) < 0) {
-    WARN("UDS: Binding to socket %s failed : %d", temp, errno);
+    WARN("UDS: Binding to socket %s failed : %s (%d)", temp, strerror(errno), errno);
    close(fd);
    return ncclSystemError;
  }
@@ -73,6 +73,15 @@ ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, v
  return ncclSuccess;
 }

+// Report the file descriptor stored in `handle` through `*fd`.
+// Returns ncclInvalidArgument when `handle` is NULL; a NULL `fd` is accepted
+// and simply not written. The warning names this function (the original text
+// referred to ncclSocketGetFd, which is a different API).
+ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd) {
+  if (handle == NULL) {
+    WARN("ncclIpcSocketGetFd: pass NULL socket");
+    return ncclInvalidArgument;
+  }
+  if (fd) *fd = handle->fd;
+  return ncclSuccess;
+}
+
 ncclResult_t ncclIpcSocketClose(ncclIpcSocket *handle) {
  if (handle == NULL) {
    return ncclInternalError;
@@ -90,7 +99,7 @@ ncclResult_t ncclIpcSocketClose(ncclIpcSocket *handle) {
  return ncclSuccess;
 }

-ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) {
+ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, int *recvFd) {
  struct msghdr msg = {0, 0, 0, 0, 0, 0, 0};
  struct iovec iov[1];

@@ -107,8 +116,13 @@ ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) {
  msg.msg_control = control_un.control;
  msg.msg_controllen = sizeof(control_un.control);

-  iov[0].iov_base = (void *)dummy_buffer;
-  iov[0].iov_len = sizeof(dummy_buffer);
+  if (hdr == NULL) {
+    iov[0].iov_base = (void *)dummy_buffer;
+    iov[0].iov_len = sizeof(dummy_buffer);
+  } else {
+    iov[0].iov_base = hdr;
+    iov[0].iov_len = hdrLen;
+  }

  msg.msg_iov = iov;
  msg.msg_iovlen = 1;
@@ -121,25 +135,30 @@ ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) {
    if (handle->abortFlag && *handle->abortFlag) return ncclInternalError;
  }

-  if (((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) && (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) {
-    if ((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) {
-      WARN("UDS: Receiving data over socket failed");
+  if (recvFd != NULL) {
+    if (((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) && (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) {
+      if ((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) {
+        WARN("UDS: Receiving data over socket failed");
+      return ncclSystemError;
+      }
+
+      memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd));
+    } else {
+      WARN("UDS: Receiving data over socket %s failed", handle->socketName);
      return ncclSystemError;
    }
-
-    memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd));
-  } else {
-    WARN("UDS: Receiving data over socket %s failed", handle->socketName);
-    return ncclSystemError;
+    TRACE(NCCL_INIT|NCCL_P2P, "UDS: Got recvFd %d from socket %s", *recvFd, handle->socketName);
  }

-  TRACE(NCCL_INIT|NCCL_P2P, "UDS: Got recvFd %d from socket %s", *recvFd, handle->socketName);
-
  return ncclSuccess;
 }

-ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int rank, uint64_t hash) {
-  struct msghdr msg;
+// Fd-only receive: delegates to ncclIpcSocketRecvMsg() without a header
+// buffer, so only the passed file descriptor is returned in *recvFd.
+ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) {
+  void *noHeader = NULL;
+  const int noHeaderLen = 0;
+  return ncclIpcSocketRecvMsg(handle, noHeader, noHeaderLen, recvFd);
+}
+
+ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, const int sendFd, int rank, uint64_t hash) {
+  struct msghdr msg = {0, 0, 0, 0, 0, 0, 0};
  struct iovec iov[1];
  char temp[NCCL_IPC_SOCKNAME_LEN];

@@ -149,6 +168,7 @@ ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int ra
  } control_un;

  struct cmsghdr *cmptr;
+  char dummy_buffer[1];
  struct sockaddr_un cliaddr;

  // Construct client address to send this shareable handle to
@@ -162,35 +182,43 @@ ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int ra
  }
  (void) strncpy(cliaddr.sun_path, temp, len);

-  TRACE(NCCL_INIT, "UDS: Sending fd %d to UDS socket %s", sendFd, temp);
-
 #ifdef USE_ABSTRACT_SOCKET
  cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
 #endif

-  msg.msg_control = control_un.control;
-  msg.msg_controllen = sizeof(control_un.control);
+  TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d to UDS socket %s", hdr, hdrLen, temp);

-  cmptr = CMSG_FIRSTHDR(&msg);
-  cmptr->cmsg_len = CMSG_LEN(sizeof(int));
-  cmptr->cmsg_level = SOL_SOCKET;
-  cmptr->cmsg_type = SCM_RIGHTS;
+  if (sendFd != -1) {
+    TRACE(NCCL_INIT, "UDS: Sending fd %d to UDS socket %s", sendFd, temp);

-  memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
+    msg.msg_control = control_un.control;
+    msg.msg_controllen = sizeof(control_un.control);
+
+    cmptr = CMSG_FIRSTHDR(&msg);
+    cmptr->cmsg_len = CMSG_LEN(sizeof(int));
+    cmptr->cmsg_level = SOL_SOCKET;
+    cmptr->cmsg_type = SCM_RIGHTS;
+    memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
+  }

  msg.msg_name = (void *)&cliaddr;
  msg.msg_namelen = sizeof(struct sockaddr_un);

-  iov[0].iov_base = (void *)"";
-  iov[0].iov_len = 1;
+  if (hdr == NULL) {
+    iov[0].iov_base = (void *)dummy_buffer;
+    iov[0].iov_len = sizeof(dummy_buffer);
+  } else {
+    iov[0].iov_base = hdr;
+    iov[0].iov_len = hdrLen;
+  }
  msg.msg_iov = iov;
  msg.msg_iovlen = 1;
  msg.msg_flags = 0;

  ssize_t sendResult;
-  while ((sendResult = sendmsg(handle->fd, &msg, 0)) <= 0) {
+  while ((sendResult = sendmsg(handle->fd, &msg, 0)) < 0) {
    if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
-      WARN("UDS: Sending data over socket %s failed : %d", temp, errno);
+      WARN("UDS: Sending data over socket %s failed : %s (%d)", temp, strerror(errno), errno);
      return ncclSystemError;
    }
    if (handle->abortFlag && *handle->abortFlag) return ncclInternalError;
@@ -198,3 +226,7 @@ ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int ra

  return ncclSuccess;
 }
+
+// Fd-only send: delegates to ncclIpcSocketSendMsg() without a header buffer,
+// addressing the peer identified by (rank, hash).
+ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int rank, uint64_t hash) {
+  void *noHeader = NULL;
+  const int noHeaderLen = 0;
+  return ncclIpcSocketSendMsg(handle, noHeader, noHeaderLen, sendFd, rank, hash);
+}
@@ -63,7 +63,7 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6
  static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
  pthread_mutex_lock(&mutex);
  if (__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) {
-    char* str = getenv(env);
+    const char* str = ncclGetEnv(env);
    int64_t value = deftVal;
    if (str && strlen(str) > 0) {
      errno = 0;
@@ -79,3 +79,9 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6
  }
  pthread_mutex_unlock(&mutex);
 }
+
+// Environment lookup used in place of getenv(): runs initEnv exactly once
+// (via pthread_once) before the first lookup, then returns getenv(name).
+const char *ncclGetEnv(const char *name) {
+  static pthread_once_t envInitGuard = PTHREAD_ONCE_INIT;
+  pthread_once(&envInitGuard, initEnv);
+  const char *value = getenv(name);
+  return value;
+}
@@ -61,7 +61,7 @@ void ncclProfilingDump() {
  static int dumpDone = 0;
  if (dumpDone) return;
  dumpDone = 1;
-  const char* str = getenv("NCCL_PROXY_PROFILE");
+  const char* str = ncclGetEnv("NCCL_PROXY_PROFILE");
  if (!str) { free(profilingEvents); return; }
  FILE* f = fopen(str, "w");
  fprintf(f, "[\n");
@@ -5,6 +5,7 @@
 ************************************************************************/

 #include "shm.h"
+#include "comm.h"
 #include "checks.h"
 #include <sys/types.h>
 #include <sys/mman.h>
@@ -67,7 +68,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
      SYSCHECKGOTO(fd = open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), ret, fail);
    }

-    if (ftruncate(fd, realShmSize) != 0) {
+    if (fallocate(fd, 0, 0, realShmSize) != 0) {
      WARN("Error: failed to extend %s to %ld bytes", shmPath, realShmSize);
      ret = ncclSystemError;
      goto fail;
@@ -162,3 +163,37 @@ ncclResult_t ncclShmUnlink(ncclShmHandle_t handle) {
  }
  return ret;
 }
+
+// Allgather of one typeSize-byte element per local rank through the buffers
+// referenced by `shmem`.
+//   comm     - supplies localRank, localRanks and abortFlag
+//   shmem    - per-round data pointers (ptr[]), arrival counters (cnt[]) and
+//              the current round index (round), which is flipped on success
+//   sendbuff - this rank's element (typeSize bytes)
+//   recvbuff - receives localRanks * typeSize bytes
+// Returns ncclInvalidArgument if any pointer argument is NULL, and
+// ncclInternalError if *comm->abortFlag becomes 1 while waiting.
+ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize) {
+  ncclResult_t ret = ncclSuccess;
+  // Declared without an initializer: shmem must be validated before it is
+  // dereferenced, and `goto exit` must not jump over an initialization.
+  int curRound;
+  size_t mycnt;
+
+  if (comm == NULL || shmem == NULL || sendbuff == NULL || recvbuff == NULL) {
+    ret = ncclInvalidArgument;
+    goto exit;
+  }
+
+  curRound = shmem->round;
+
+  // Publish this rank's element into its slot of the current round's buffer.
+  memcpy((char*)shmem->ptr[curRound] + comm->localRank * typeSize, sendbuff, typeSize);
+  /* sync among local ranks */
+  mycnt = __atomic_add_fetch(shmem->cnt[curRound], 1, __ATOMIC_ACQ_REL);
+  if (mycnt == comm->localRanks) {
+    // Last rank to arrive: reset the other round's counter, then release the
+    // waiters by storing the sentinel value localRanks + 1.
+    *shmem->cnt[curRound ^ 1] = 0; /* prepare next round */
+    __atomic_store_n(shmem->cnt[curRound], comm->localRanks + 1, __ATOMIC_RELEASE); /* release everyone */
+  } else {
+    uint64_t t0 = clockNano();
+    while(__atomic_load_n(shmem->cnt[curRound], __ATOMIC_ACQUIRE) != comm->localRanks + 1) {
+      // Busy-wait first; start yielding once 5 * 1000 clockNano() units have elapsed.
+      if (clockNano() - t0 >= 5 * 1000) sched_yield();
+      if (*comm->abortFlag == 1) {
+        ret = ncclInternalError;
+        goto exit;
+      }
+    }
+  }
+
+  // All slots are filled: copy the gathered elements out and switch rounds.
+  memcpy(recvbuff, (const void*)shmem->ptr[curRound], comm->localRanks * typeSize);
+  shmem->round ^= 1;
+
+exit:
+  return ret;
+}
@@ -11,6 +11,7 @@
 #include <unistd.h>
 #include <ifaddrs.h>
 #include <net/if.h>
+#include "param.h"

 static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) {
  int bytes = 0;
@@ -84,7 +85,7 @@ static uint16_t socketToPort(union ncclSocketAddress *addr) {
 /* Allow the user to force the IPv4/IPv6 interface selection */
 static int envSocketFamily(void) {
  int family = -1; // Family selection is not forced, will use first one found
-  char* env = getenv("NCCL_SOCKET_FAMILY");
+  const char* env = ncclGetEnv("NCCL_SOCKET_FAMILY");
  if (env == NULL)
    return family;

@@ -325,7 +326,7 @@ int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNa
  // Allow user to force the INET socket family selection
  int sock_family = envSocketFamily();
  // User specified interface
-  char* env = getenv("NCCL_SOCKET_IFNAME");
+  const char* env = ncclGetEnv("NCCL_SOCKET_IFNAME");
  if (env && strlen(env) > 1) {
    INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME set by environment to %s", env);
    // Specified by user : find or fail
@@ -337,10 +338,10 @@ int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNa
    nIfs = findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
    // else see if we can get some hint from COMM ID
    if (nIfs == 0) {
-      char* commId = getenv("NCCL_COMM_ID");
+      const char* commId = ncclGetEnv("NCCL_COMM_ID");
      if (commId && strlen(commId) > 1) {
-	INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId);
-	// Try to find interface that is in the same subnet as the IP in comm id
+        INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId);
+        // Try to find interface that is in the same subnet as the IP in comm id
        union ncclSocketAddress idAddr;
        ncclSocketGetAddrFromString(&idAddr, commId);
        nIfs = ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs);
@@ -0,0 +1,82 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <dlfcn.h>
+#include <errno.h>
+#include <stdlib.h>
+
+#include "debug.h"
+#include "nccl_tuner.h"
+
+// Serializes loading and unloading of the tuner plugin.
+pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER;
+// Load state / reference count:
+//   -1  : plugin load not attempted yet (initial value, and after the last close)
+//   -2  : load attempted and no plugin available; do not try again
+//   >=0 : plugin loaded; value is the number of outstanding users
+static int tunerPluginRefCount = -1;
+// dlopen() handle of the loaded plugin library, or nullptr when none is loaded.
+static void* tunerPluginLib = nullptr;
+// Address of the plugin's NCCL_TUNER_PLUGIN_SYMBOL, or nullptr when none is loaded.
+ncclTuner_t* tunerSymbol = nullptr;
+
+// Load (at most once) the tuner plugin named by the NCCL_TUNER_PLUGIN
+// environment variable and hand it out through *tuner.
+// *tuner is left as nullptr when no plugin could be loaded. Each successful
+// hand-out increments tunerPluginRefCount. Always returns ncclSuccess.
+ncclResult_t ncclLoadTunerPlugin(ncclTuner_t** tuner) {
+  // Initialize to nullptr by default if plugin tuner cannot be loaded.
+  *tuner = nullptr;
+  // A previous attempt already established that there is no plugin.
+  if (tunerPluginRefCount == -2) return ncclSuccess;
+
+  pthread_mutex_lock(&tunerPluginLock);
+  if (tunerPluginRefCount == -1) {
+    tunerPluginRefCount = -2; // Default: no plugin, don't try again later
+
+    const char* name = getenv("NCCL_TUNER_PLUGIN");
+    if (name == nullptr) {
+      // Nothing to dlopen: never pass a NULL name to "%s" and never report an
+      // errno/dlerror() value that does not belong to a dlopen() call.
+      INFO(NCCL_TUNING, "Tuner: NCCL_TUNER_PLUGIN not set, using default tuner instead.");
+    } else {
+      INFO(NCCL_TUNING, "NCCL_TUNER_PLUGIN set to %s", name);
+      // Clear errno so the ENOENT test below reflects this dlopen() only.
+      errno = 0;
+      tunerPluginLib = dlopen(name, RTLD_LAZY | RTLD_LOCAL);
+      if (tunerPluginLib == nullptr) {
+        // dlopen does not guarantee to set errno, but dlerror only gives us a
+        // string, so checking errno doesn't hurt to try to provide a better
+        // error message
+        if (errno == ENOENT) {
+          INFO(NCCL_TUNING, "Tuner: no plugin found '%s', using default tuner instead.", name);
+        } else {
+          INFO(NCCL_TUNING, "Tuner: plugin load '%s' returned error (%d : %s), using default tuner instead.", name, errno, dlerror());
+        }
+      } else {
+        tunerSymbol = (ncclTuner_t*)dlsym(tunerPluginLib, NCCL_TUNER_PLUGIN_SYMBOL);
+        if (tunerSymbol == nullptr) {
+          INFO(NCCL_TUNING, "Tuner: failed to find " NCCL_TUNER_PLUGIN_SYMBOL " in plugin (%s), using default tuner instead.", name);
+          dlclose(tunerPluginLib);
+          tunerPluginLib = nullptr;
+        } else {
+          INFO(NCCL_TUNING, "Opened tuner: '%s'", tunerSymbol->name);
+          tunerPluginRefCount = 0;
+        }
+      }
+    }
+  }
+
+  // A non-negative count means the plugin is loaded: share it and count the user.
+  if (tunerPluginRefCount >= 0) {
+    *tuner = tunerSymbol;
+    INFO(NCCL_INIT, "Using tuner plugin: '%s'", tunerSymbol->name);
+    tunerPluginRefCount++;
+  }
+  pthread_mutex_unlock(&tunerPluginLock);
+  return ncclSuccess;
+}
+
+// Drop one reference to the tuner plugin. When the count reaches zero the
+// library is dlclose()d, the globals and *tuner are cleared, and the state is
+// reset to -1 so a later load can try again. A nullptr *tuner is a no-op.
+ncclResult_t ncclCloseTunerPlugin(ncclTuner_t** tuner) {
+  if (*tuner == nullptr) return ncclSuccess;
+
+  pthread_mutex_lock(&tunerPluginLock);
+  tunerPluginRefCount -= 1;
+  if (tunerPluginRefCount == 0) {
+    if (tunerPluginLib != nullptr) {
+      INFO(NCCL_TUNING, "Closing tuner: '%s'", tunerSymbol->name);
+      dlclose(tunerPluginLib);
+    } else {
+      WARN("Tuner plugin refcount is 0, yet tunerPluginLib ptr is NULL\n");
+    }
+    // Back to the "not attempted" state.
+    tunerPluginLib = nullptr;
+    tunerSymbol = nullptr;
+    *tuner = nullptr;
+    tunerPluginRefCount = -1;
+  }
+  pthread_mutex_unlock(&tunerPluginLock);
+
+  return ncclSuccess;
+}
@@ -85,13 +85,13 @@ uint64_t getHash(const char* string, int n) {
 #define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
 uint64_t getHostHash(void) {
  char hostHash[1024];
-  char *hostId;
+  const char *hostId;

  // Fall back is the full hostname if something fails
  (void) getHostName(hostHash, sizeof(hostHash), '\0');
  int offset = strlen(hostHash);

-  if ((hostId = getenv("NCCL_HOSTID")) != NULL) {
+  if ((hostId = ncclGetEnv("NCCL_HOSTID")) != NULL) {
    INFO(NCCL_ENV, "NCCL_HOSTID set by environment to %s", hostId);
    strncpy(hostHash, hostId, sizeof(hostHash));
  } else {
@@ -78,6 +78,15 @@ typedef struct ncclConfig_v21700 {
  NCCL_CONFIG_UNDEF_INT                     /* splitShare */            \
 }

+/* NCCL malloc and free function for all types of NCCL optimizations
+ * (e.g. user buffer registration). The actual allocated size might
+ * be larger than requested due to granularity requirement. */
+ncclResult_t  ncclMemAlloc(void** ptr, size_t size);
+ncclResult_t pncclMemAlloc(void** ptr, size_t size);
+
+ncclResult_t  ncclMemFree(void *ptr);
+ncclResult_t pncclMemFree(void *ptr);
+
 /* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
 * This integer is coded with the MAJOR, MINOR and PATCH level of the
 * NCCL library
@@ -417,6 +426,14 @@ ncclResult_t pncclGroupStart();
 ncclResult_t  ncclGroupEnd();
 ncclResult_t pncclGroupEnd();

+/* Register CUDA buffer for zero-copy operation */
+ncclResult_t  ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
+ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
+
+/* Deregister CUDA buffer */
+ncclResult_t  ncclCommDeregister(const ncclComm_t comm, void* handle);
+ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
+
 #ifdef __cplusplus
 } // end extern "C"
 #endif
@@ -1,3 +1,9 @@
+/*************************************************************************
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
 #include "net.h"
 #include "bootstrap.h"
 #include "checks.h"
@@ -9,148 +15,190 @@
 //#include <sys/stat.h>
 //#include <unistd.h>

-static ncclNet_v6_t ncclNet_v4_as_v6;
-static ncclNet_v6_t ncclNet_v5_as_v6;
-static ncclNet_v4_t *ncclNet_v4;
+static ncclNet_v7_t ncclNet_v5_as_v7;
+static ncclNet_v7_t ncclNet_v6_as_v7;
 static ncclNet_v5_t *ncclNet_v5;
-static ncclCollNet_v6_t ncclCollNet_v4_as_v6;
-static ncclCollNet_v6_t ncclCollNet_v5_as_v6;
-static ncclCollNet_v4_t *ncclCollNet_v4;
+static ncclNet_v6_t *ncclNet_v6;
+static ncclCollNet_v7_t ncclCollNet_v5_as_v7;
+static ncclCollNet_v7_t ncclCollNet_v6_as_v7;
 static ncclCollNet_v5_t *ncclCollNet_v5;
+static ncclCollNet_v6_t *ncclCollNet_v6;

-static ncclResult_t ncclNet_v4_as_v6_getProperties(int dev, ncclNetProperties_v6_t* props) {
-  ncclNetProperties_v4_t p4;
-  ncclResult_t ans = ncclNet_v4->getProperties(dev, &p4);
+static ncclResult_t ncclNet_v6_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) {
+  ncclNetProperties_v6_t p6;
+  ncclResult_t ans = ncclNet_v6->getProperties(dev, &p6);
  if (ans != ncclSuccess) return ans;
-  props->name = p4.name;
-  props->pciPath = p4.pciPath;
-  props->guid = p4.guid;
-  props->ptrSupport = p4.ptrSupport;
-  props->speed = p4.speed;
-  props->port = p4.port;
-  props->maxComms = p4.maxComms;
-  props->maxRecvs = 1;
-  props->latency = 0;
+  props->name = p6.name;
+  props->pciPath = p6.pciPath;
+  props->guid = p6.guid;
+  props->ptrSupport = p6.ptrSupport;
+  props->speed = p6.speed;
+  props->port = p6.port;
+  props->maxComms = p6.maxComms;
+  props->maxRecvs = p6.maxRecvs;
+  props->latency = p6.latency;
+  props->netDeviceType = NCCL_NET_DEVICE_HOST;
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
  return ncclSuccess;
 }

-static ncclResult_t ncclNet_v4_as_v6_isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
-  return ncclNet_v4->isend(sendComm, data, size, mhandle, request);
+static ncclResult_t ncclNet_v6_as_v7_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
+  return ncclNet_v6->connect(dev, handle, sendComm);
 }

-static ncclResult_t ncclNet_v4_as_v6_irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
-  if (n == 0) return ncclSuccess;
-  if (n != 1) return ncclInvalidArgument;
-  return ncclNet_v4->irecv(recvComm, data[0], sizes[0], mhandles[0], request);
+static ncclResult_t ncclNet_v6_as_v7_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
+  return ncclNet_v6->accept(listenComm, recvComm);
 }

-static ncclResult_t ncclNet_v4_as_v6_iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
-  if (n == 0) return ncclSuccess;
-  if (n != 1) return ncclInvalidArgument;
-  return ncclNet_v4->iflush(recvComm, data[0], sizes[0], mhandles[0], request);
-}
-
-// We use a wrapper around the v4 init to copy over the struct contents
-// post-init since they may not be initialized before hand.
-static ncclResult_t ncclNet_v4_as_v6_init(ncclDebugLogger_t logfn) {
-  NCCLCHECK(ncclNet_v4->init(logfn));
-  ncclNet_v4_as_v6.name = ncclNet_v4->name;
-  ncclNet_v4_as_v6.devices = ncclNet_v4->devices;
-  ncclNet_v4_as_v6.getProperties = ncclNet_v4_as_v6_getProperties;
-  ncclNet_v4_as_v6.listen = ncclNet_v4->listen;
-  ncclNet_v4_as_v6.connect = ncclNet_v4->connect;
-  ncclNet_v4_as_v6.accept = ncclNet_v4->accept;
-  ncclNet_v4_as_v6.regMr = ncclNet_v4->regMr;
-  ncclNet_v4_as_v6.regMrDmaBuf = NULL;
-  ncclNet_v4_as_v6.deregMr = ncclNet_v4->deregMr;
-  ncclNet_v4_as_v6.isend = ncclNet_v4_as_v6_isend;
-  ncclNet_v4_as_v6.irecv = ncclNet_v4_as_v6_irecv;
-  ncclNet_v4_as_v6.iflush = ncclNet_v4_as_v6_iflush;
-  ncclNet_v4_as_v6.test = ncclNet_v4->test;
-  ncclNet_v4_as_v6.closeSend = ncclNet_v4->closeSend;
-  ncclNet_v4_as_v6.closeRecv = ncclNet_v4->closeRecv;
-  ncclNet_v4_as_v6.closeListen = ncclNet_v4->closeListen;
+static ncclResult_t ncclNet_v6_as_v7_init(ncclDebugLogger_t logfn) {
+  NCCLCHECK(ncclNet_v6->init(logfn));
+  ncclNet_v6_as_v7.name = ncclNet_v6->name;
+  ncclNet_v6_as_v7.devices = ncclNet_v6->devices;
+  ncclNet_v6_as_v7.getProperties = ncclNet_v6_as_v7_getProperties; // ncclNet_v5->getProperties;
+  ncclNet_v6_as_v7.listen = ncclNet_v6->listen;
+  ncclNet_v6_as_v7.connect = ncclNet_v6_as_v7_connect;
+  ncclNet_v6_as_v7.accept =  ncclNet_v6_as_v7_accept;
+  ncclNet_v6_as_v7.regMr = ncclNet_v6->regMr;
+  ncclNet_v6_as_v7.regMrDmaBuf = ncclNet_v6->regMrDmaBuf;
+  ncclNet_v6_as_v7.deregMr = ncclNet_v6->deregMr;
+  ncclNet_v6_as_v7.isend = ncclNet_v6->isend;
+  ncclNet_v6_as_v7.irecv = ncclNet_v6->irecv;
+  ncclNet_v6_as_v7.iflush = ncclNet_v6->iflush;
+  ncclNet_v6_as_v7.test = ncclNet_v6->test;
+  ncclNet_v6_as_v7.closeSend = ncclNet_v6->closeSend;
+  ncclNet_v6_as_v7.closeRecv = ncclNet_v6->closeRecv;
+  ncclNet_v6_as_v7.closeListen = ncclNet_v6->closeListen;
+  ncclNet_v6_as_v7.getDeviceMr = NULL;
+  ncclNet_v6_as_v7.irecvConsumed = NULL;
  return ncclSuccess;
 }

+// v7-shaped getProperties() for a v5 plugin: queries the plugin into a
+// v6-layout struct, copies the shared fields, and reports the device type as
+// host with an invalid device version.
+static ncclResult_t ncclNet_v5_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) {
+  ncclNetProperties_v6_t legacy;
+  const ncclResult_t status = ncclNet_v5->getProperties(dev, &legacy);
+  if (status != ncclSuccess) {
+    return status;
+  }
+
+  // Fields present in the older layout are forwarded unchanged.
+  props->name = legacy.name;
+  props->pciPath = legacy.pciPath;
+  props->guid = legacy.guid;
+  props->ptrSupport = legacy.ptrSupport;
+  props->speed = legacy.speed;
+  props->port = legacy.port;
+  props->maxComms = legacy.maxComms;
+  props->maxRecvs = legacy.maxRecvs;
+  props->latency = legacy.latency;
+
+  // Fields that only exist in v7.
+  props->netDeviceType = NCCL_NET_DEVICE_HOST;
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  return ncclSuccess;
+}
+
+// v7-shaped connect() for a v5 plugin: forwards to the plugin's connect() and
+// ignores the extra device-handle output parameter, which is never written.
+static ncclResult_t ncclNet_v5_as_v7_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
+  return ncclNet_v5->connect(dev, handle, sendComm);
+}
+
+// v7-shaped accept() for a v5 plugin: forwards to the plugin's accept() and
+// ignores the extra device-handle output parameter, which is never written.
+static ncclResult_t ncclNet_v5_as_v7_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
+  return ncclNet_v5->accept(listenComm, recvComm);
+}
+
 // We use a wrapper around the v5 init to copy over the struct contents
 // post-init since they may not be initialized before hand.
-static ncclResult_t ncclNet_v5_as_v6_init(ncclDebugLogger_t logfn) {
+static ncclResult_t ncclNet_v5_as_v7_init(ncclDebugLogger_t logfn) {
  NCCLCHECK(ncclNet_v5->init(logfn));
-  ncclNet_v5_as_v6.name = ncclNet_v5->name;
-  ncclNet_v5_as_v6.devices = ncclNet_v5->devices;
-  ncclNet_v5_as_v6.getProperties = ncclNet_v5->getProperties;
-  ncclNet_v5_as_v6.listen = ncclNet_v5->listen;
-  ncclNet_v5_as_v6.connect = ncclNet_v5->connect;
-  ncclNet_v5_as_v6.accept = ncclNet_v5->accept;
-  ncclNet_v5_as_v6.regMr = ncclNet_v5->regMr;
-  ncclNet_v5_as_v6.regMrDmaBuf = NULL;
-  ncclNet_v5_as_v6.deregMr = ncclNet_v5->deregMr;
-  ncclNet_v5_as_v6.isend = ncclNet_v5->isend;
-  ncclNet_v5_as_v6.irecv = ncclNet_v5->irecv;
-  ncclNet_v5_as_v6.iflush = ncclNet_v5->iflush;
-  ncclNet_v5_as_v6.test = ncclNet_v5->test;
-  ncclNet_v5_as_v6.closeSend = ncclNet_v5->closeSend;
-  ncclNet_v5_as_v6.closeRecv = ncclNet_v5->closeRecv;
-  ncclNet_v5_as_v6.closeListen = ncclNet_v5->closeListen;
+  ncclNet_v5_as_v7.name = ncclNet_v5->name;
+  ncclNet_v5_as_v7.devices = ncclNet_v5->devices;
+  ncclNet_v5_as_v7.getProperties = ncclNet_v5_as_v7_getProperties;
+  ncclNet_v5_as_v7.listen = ncclNet_v5->listen;
+  ncclNet_v5_as_v7.connect = ncclNet_v5_as_v7_connect;
+  ncclNet_v5_as_v7.accept =  ncclNet_v5_as_v7_accept;
+  ncclNet_v5_as_v7.regMr = ncclNet_v5->regMr;
+  ncclNet_v5_as_v7.regMrDmaBuf = NULL;
+  ncclNet_v5_as_v7.deregMr = ncclNet_v5->deregMr;
+  ncclNet_v5_as_v7.isend = ncclNet_v5->isend;
+  ncclNet_v5_as_v7.irecv = ncclNet_v5->irecv;
+  ncclNet_v5_as_v7.iflush = ncclNet_v5->iflush;
+  ncclNet_v5_as_v7.test = ncclNet_v5->test;
+  ncclNet_v5_as_v7.closeSend = ncclNet_v5->closeSend;
+  ncclNet_v5_as_v7.closeRecv = ncclNet_v5->closeRecv;
+  ncclNet_v5_as_v7.closeListen = ncclNet_v5->closeListen;
+  ncclNet_v5_as_v7.getDeviceMr = NULL;
+  ncclNet_v5_as_v7.irecvConsumed = NULL;
  return ncclSuccess;
 }

-static ncclResult_t ncclCollNet_v4_as_v6_getProperties(int dev, ncclNetProperties_v6_t* props) {
-  ncclNetProperties_v4_t p4;
-  ncclResult_t ans = ncclCollNet_v4->getProperties(dev, &p4);
+static ncclResult_t ncclCollNet_v5_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) {
+  ncclNetProperties_v6_t p6;
+  ncclResult_t ans = ncclCollNet_v5->getProperties(dev, &p6);
  if (ans != ncclSuccess) return ans;
-  props->name = p4.name;
-  props->pciPath = p4.pciPath;
-  props->guid = p4.guid;
-  props->ptrSupport = p4.ptrSupport;
-  props->speed = p4.speed;
-  props->port = p4.port;
-  props->maxComms = p4.maxComms;
-  props->maxRecvs = 1;
-  props->latency = 0;
-  return ncclSuccess;
-}
-
-// We use a wrapper around the v4 init to copy over the struct contents
-// post-init since they may not be initialized before hand.
-static ncclResult_t ncclCollNet_v4_as_v6_init(ncclDebugLogger_t logfn) {
-  NCCLCHECK(ncclCollNet_v4->init(logfn));
-  ncclCollNet_v4_as_v6.name = ncclCollNet_v4->name;
-  ncclCollNet_v4_as_v6.devices = ncclCollNet_v4->devices;
-  ncclCollNet_v4_as_v6.getProperties = ncclCollNet_v4_as_v6_getProperties;
-  ncclCollNet_v4_as_v6.listen = ncclCollNet_v4->listen;
-  ncclCollNet_v4_as_v6.connect = ncclCollNet_v4->connect;
-  ncclCollNet_v4_as_v6.reduceSupport = ncclCollNet_v4->reduceSupport;
-  ncclCollNet_v4_as_v6.regMr = ncclCollNet_v4->regMr;
-  ncclCollNet_v4_as_v6.regMrDmaBuf = NULL;
-  ncclCollNet_v4_as_v6.deregMr = ncclCollNet_v4->deregMr;
-  ncclCollNet_v4_as_v6.iallreduce = ncclCollNet_v4->iallreduce;
-  ncclCollNet_v4_as_v6.iflush = ncclCollNet_v4->iflush;
-  ncclCollNet_v4_as_v6.test = ncclCollNet_v4->test;
-  ncclCollNet_v4_as_v6.closeColl = ncclCollNet_v4->closeColl;
-  ncclCollNet_v4_as_v6.closeListen = ncclCollNet_v4->closeListen;
+  props->name = p6.name;
+  props->pciPath = p6.pciPath;
+  props->guid = p6.guid;
+  props->ptrSupport = p6.ptrSupport;
+  props->speed = p6.speed;
+  props->port = p6.port;
+  props->maxComms = p6.maxComms;
+  props->maxRecvs = p6.maxRecvs;
+  props->latency = p6.latency;
+  props->netDeviceType    = NCCL_NET_DEVICE_HOST;
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
  return ncclSuccess;
 }

 // We use a wrapper around the v5 init to copy over the struct contents
 // post-init since they may not be initialized before hand.
-static ncclResult_t ncclCollNet_v5_as_v6_init(ncclDebugLogger_t logfn) {
+static ncclResult_t ncclCollNet_v5_as_v7_init(ncclDebugLogger_t logfn) {
  NCCLCHECK(ncclCollNet_v5->init(logfn));
-  ncclCollNet_v5_as_v6.name = ncclCollNet_v5->name;
-  ncclCollNet_v5_as_v6.devices = ncclCollNet_v5->devices;
-  ncclCollNet_v5_as_v6.getProperties = ncclCollNet_v5->getProperties;
-  ncclCollNet_v5_as_v6.listen = ncclCollNet_v5->listen;
-  ncclCollNet_v5_as_v6.connect = ncclCollNet_v5->connect;
-  ncclCollNet_v5_as_v6.reduceSupport = ncclCollNet_v5->reduceSupport;
-  ncclCollNet_v5_as_v6.regMr = ncclCollNet_v5->regMr;
-  ncclCollNet_v5_as_v6.regMrDmaBuf = NULL;
-  ncclCollNet_v5_as_v6.deregMr = ncclCollNet_v5->deregMr;
-  ncclCollNet_v5_as_v6.iallreduce = ncclCollNet_v5->iallreduce;
-  ncclCollNet_v5_as_v6.iflush = ncclCollNet_v5->iflush;
-  ncclCollNet_v5_as_v6.test = ncclCollNet_v5->test;
-  ncclCollNet_v5_as_v6.closeColl = ncclCollNet_v5->closeColl;
-  ncclCollNet_v5_as_v6.closeListen = ncclCollNet_v5->closeListen;
+  ncclCollNet_v5_as_v7.name = ncclCollNet_v5->name;
+  ncclCollNet_v5_as_v7.devices = ncclCollNet_v5->devices;
+  ncclCollNet_v5_as_v7.getProperties = ncclCollNet_v5_as_v7_getProperties;
+  ncclCollNet_v5_as_v7.listen = ncclCollNet_v5->listen;
+  ncclCollNet_v5_as_v7.connect = ncclCollNet_v5->connect;
+  ncclCollNet_v5_as_v7.reduceSupport = ncclCollNet_v5->reduceSupport;
+  ncclCollNet_v5_as_v7.regMr = ncclCollNet_v5->regMr;
+  ncclCollNet_v5_as_v7.regMrDmaBuf = NULL;
+  ncclCollNet_v5_as_v7.deregMr = ncclCollNet_v5->deregMr;
+  ncclCollNet_v5_as_v7.iallreduce = ncclCollNet_v5->iallreduce;
+  ncclCollNet_v5_as_v7.iflush = ncclCollNet_v5->iflush;
+  ncclCollNet_v5_as_v7.test = ncclCollNet_v5->test;
+  ncclCollNet_v5_as_v7.closeColl = ncclCollNet_v5->closeColl;
+  ncclCollNet_v5_as_v7.closeListen = ncclCollNet_v5->closeListen;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclCollNet_v6_as_v7_getProperties(int dev, ncclNetProperties_v7_t* props) { // Fills a v7 properties struct from a v6 collnet plugin's getProperties result
+  ncclNetProperties_v6_t p6;
+  ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6);
+  if (ans != ncclSuccess) return ans; // Plugin error is returned as-is; props has not been written at this point
+  props->name = p6.name;
+  props->pciPath = p6.pciPath;
+  props->guid = p6.guid;
+  props->ptrSupport = p6.ptrSupport;
+  props->speed = p6.speed;
+  props->port = p6.port;
+  props->maxComms = p6.maxComms;
+  props->maxRecvs = p6.maxRecvs;
+  props->latency = p6.latency;
+  props->netDeviceType    = NCCL_NET_DEVICE_HOST; // Not read from p6: always reported as host-side networking
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; // Not read from p6: always the invalid-version constant
+  return ncclSuccess;
+}
+
+// We use a wrapper around the v6 init to copy over the struct contents
+// post-init since they may not be initialized before hand.
+static ncclResult_t ncclCollNet_v6_as_v7_init(ncclDebugLogger_t logfn) {
+  NCCLCHECK(ncclCollNet_v6->init(logfn)); // Run the plugin's own init before reading its function pointers
+  ncclCollNet_v6_as_v7.name = ncclCollNet_v6->name;
+  ncclCollNet_v6_as_v7.devices = ncclCollNet_v6->devices;
+  ncclCollNet_v6_as_v7.getProperties = ncclCollNet_v6_as_v7_getProperties; // The only entry routed through a v6->v7 wrapper; the rest are forwarded as-is
+  ncclCollNet_v6_as_v7.listen = ncclCollNet_v6->listen;
+  ncclCollNet_v6_as_v7.connect = ncclCollNet_v6->connect;
+  ncclCollNet_v6_as_v7.reduceSupport = ncclCollNet_v6->reduceSupport;
+  ncclCollNet_v6_as_v7.regMr = ncclCollNet_v6->regMr;
+  ncclCollNet_v6_as_v7.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf;
+  ncclCollNet_v6_as_v7.deregMr = ncclCollNet_v6->deregMr;
+  ncclCollNet_v6_as_v7.iallreduce = ncclCollNet_v6->iallreduce;
+  ncclCollNet_v6_as_v7.iflush = ncclCollNet_v6->iflush;
+  ncclCollNet_v6_as_v7.test = ncclCollNet_v6->test;
+  ncclCollNet_v6_as_v7.closeColl = ncclCollNet_v6->closeColl;
+  ncclCollNet_v6_as_v7.closeListen = ncclCollNet_v6->closeListen;
  return ncclSuccess;
 }

@@ -167,7 +215,7 @@ enum ncclNetState ncclCollNetStates[3] = { ncclNetStateInit, ncclNetStateInit, n

 ncclResult_t ncclNetPluginInit() {
  char ncclNetPluginName[128];
-  const char* envPluginName = getenv("NCCL_NET_PLUGIN");
+  const char* envPluginName = ncclGetEnv("NCCL_NET_PLUGIN");
  if (envPluginName && strlen(envPluginName)) {
    snprintf(ncclNetPluginName, 128, "libnccl-net-%s.so", envPluginName);
    INFO(NCCL_INIT, "Plugin name set by env to %s", ncclNetPluginName);
@@ -176,62 +224,97 @@ ncclResult_t ncclNetPluginInit() {
  }
  void* netPluginLib = dlopen(ncclNetPluginName, RTLD_NOW | RTLD_LOCAL);
  if (netPluginLib == nullptr) {
-    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load (%s) returned %d : %s", ncclNetPluginName, errno, dlerror());
-    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found, using internal implementation");
+    // dlopen does not guarantee to set errno, but dlerror only gives us a
+    // string, so checking errno doesn't hurt to try to provide a better
+    // error message
+    if (errno == ENOENT) {
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : dlerror=%s No plugin found (%s), using internal implementation", dlerror(), ncclNetPluginName);
+      // exit(-1);
+    } else {
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
+    }
    return ncclSuccess;
  }

-  ncclNets[0] = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6");
+  ncclNets[0] = (ncclNet_v7_t*)dlsym(netPluginLib, "ncclNetPlugin_v7");
  if (ncclNets[0] == nullptr) {
-    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v6 symbol.");
-    // Try v5 plugin
-    ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
-    if (ncclNet_v5 == nullptr) {
-      ncclNet_v4 = (ncclNet_v4_t*)dlsym(netPluginLib, "ncclNetPlugin_v4");
-      if (ncclNet_v4 == nullptr) {
-        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (v4 or v5).");
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v7 symbol.");
+    // Try v6 plugin
+    ncclNet_v6 = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6");
+    if (ncclNet_v6 == nullptr) {
+      // Try v5 plugin
+      ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
+      if (ncclNet_v5 == nullptr) {
+        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported.");
        if (netPluginLib != nullptr) dlclose(netPluginLib);
        return ncclSuccess;
+      } else {
+        ncclNets[0] = &ncclNet_v5_as_v7;
+        ncclNet_v5_as_v7.init = ncclNet_v5_as_v7_init;
+        // Set the name right away to allow for NCCL_NET=... to work
+        ncclNet_v5_as_v7.name = ncclNet_v5->name;
+        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name);
      }
-      ncclNets[0] = &ncclNet_v4_as_v6;
-      ncclNet_v4_as_v6.init = ncclNet_v4_as_v6_init;
-      // Set the name right away to allow for NCCL_NET=... to work
-      ncclNet_v4_as_v6.name = ncclNet_v4->name;
-      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v4)", ncclNets[0]->name);
    } else {
-      ncclNets[0] = &ncclNet_v5_as_v6;
-      ncclNet_v5_as_v6.init = ncclNet_v5_as_v6_init;
+      ncclNets[0] = &ncclNet_v6_as_v7;
+      ncclNet_v6_as_v7.init = ncclNet_v6_as_v7_init;
      // Set the name right away to allow for NCCL_NET=... to work
-      ncclNet_v5_as_v6.name = ncclNet_v5->name;
-      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name);
+      ncclNet_v6_as_v7.name = ncclNet_v6->name;
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNets[0]->name);
    }
  }

  // Check for CollNet
-  ncclCollNets[0] = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6");
+  ncclCollNets[0] = (ncclCollNet_v7_t*) dlsym(netPluginLib, "ncclCollNetPlugin_v7");
  if (ncclCollNets[0] == nullptr) {
-    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol.");
-    ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
-    if (ncclCollNet_v5 == nullptr) {
-      ncclCollNet_v4 = (ncclCollNet_v4_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v4");
-      if (ncclCollNet_v4 == nullptr) {
-        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5).");
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v7 symbol.");
+    ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6");
+    if (ncclCollNet_v6 == nullptr) {
+      ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
+      if (ncclCollNet_v5 == nullptr) {
+        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). ncclCollNetPlugin symbols v4 and lower are not supported.");
      } else {
-        ncclCollNets[0] = &ncclCollNet_v4_as_v6;
-        ncclCollNet_v4_as_v6.init = ncclCollNet_v4_as_v6_init;
-        ncclCollNet_v4_as_v6.name = ncclCollNet_v4->name;
-        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v4)", ncclCollNets[0]->name);
+        ncclCollNets[0] = &ncclCollNet_v5_as_v7;
+        ncclCollNet_v5_as_v7.init = ncclCollNet_v5_as_v7_init;
+        ncclCollNet_v5_as_v7.name = ncclCollNet_v5->name;
+        INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v5)", ncclCollNets[0]->name);
      }
    } else {
-      ncclCollNets[0] = &ncclCollNet_v5_as_v6;
-      ncclCollNet_v5_as_v6.init = ncclCollNet_v5_as_v6_init;
-      ncclCollNet_v5_as_v6.name = ncclCollNet_v5->name;
-      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v5)", ncclCollNets[0]->name);
+      ncclCollNets[0] = &ncclCollNet_v6_as_v7;
+      ncclCollNet_v6_as_v7.init = ncclCollNet_v6_as_v7_init;
+      ncclCollNet_v6_as_v7.name = ncclCollNet_v6->name;
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v6)", ncclCollNets[0]->name);
    }
  }
  return ncclSuccess;
 }

+ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, int dev) { // Validates the device-offload type/version reported by net for device dev; comm is not used in this body
+  ncclNetProperties_t props;
+
+  NCCLCHECK(net->getProperties(dev, &props));
+  ncclNetDeviceType type = props.netDeviceType;
+  if (type) switch (type) { // type 0 (NCCL_NET_DEVICE_HOST) skips the switch and takes the host-plugin path at the bottom
+    case NCCL_NET_DEVICE_UNPACK:
+      if (props.netDeviceVersion == NCCL_NET_DEVICE_UNPACK_VERSION) { // Only an exact version match is accepted
+        INFO(NCCL_INIT, "Using NCCL_NET_DEVICE_UNPACK net plugin version %d",
+          props.netDeviceVersion);
+        return ncclSuccess;
+      } else {
+        WARN("NCCL_DEVICE_UNPACK plugin has incompatible version %d, this NCCL build is compatible with %d, not using it",
+          props.netDeviceVersion, NCCL_NET_DEVICE_UNPACK_VERSION);
+        return ncclInternalError;
+      }
+    default: // Any other non-zero device type is rejected
+      WARN("Unknown device code index");
+      return ncclInternalError;
+  }
+
+  INFO(NCCL_INIT, "Using non-device net plugin version %d",
+    props.netDeviceVersion);
+  return ncclSuccess;
+}
+
 static ncclResult_t netGetState(int i, enum ncclNetState* state) {
  pthread_mutex_lock(&netLock);
  if (ncclNetStates[i] == ncclNetStateInit) {
@@ -268,6 +351,10 @@ ncclResult_t ncclNetInit(struct ncclComm* comm) {
    NCCLCHECK(netGetState(i, &state));
    if (state != ncclNetStateEnabled) continue;
    if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue;
+    if (ncclSuccess != ncclNetCheckDeviceVersion(comm, ncclNets[i], 0)) {
+      // Mismatched device plugin version
+      continue;
+    }

    comm->ncclNet = ncclNets[i];
    ok = true;
@@ -334,10 +421,10 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
      }

      if (sComm == NULL)
-        NCCLCHECKGOTO(comm->ncclNet->connect(dev, &handle, &sComm), ret, cleanup2);
+        NCCLCHECKGOTO(comm->ncclNet->connect(dev, &handle, &sComm, NULL), ret, cleanup2);

      if (rComm == NULL)
-        NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm), ret, cleanup2);
+        NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm, NULL), ret, cleanup2);

      connected = (rComm != NULL) && (sComm != NULL);
    }
@@ -366,5 +453,11 @@ cleanup1:
 }

 int ncclNetVersion(struct ncclComm* comm) {
-  return (comm->ncclNet == &ncclNet_v4_as_v6) ? 4 : ((comm->ncclNet == &ncclNet_v5_as_v6) ? 5 : 6);
+  if (comm->ncclNet == &ncclNet_v5_as_v7) {
+    return 5;
+  } else if (comm->ncclNet == &ncclNet_v6_as_v7) {
+    return 6;
+  } else {
+    return 7;
+  }
 }
@@ -15,6 +15,16 @@

 #include <sys/syscall.h>
 #include <assert.h>
+#include <unistd.h>
+#include <sys/time.h>
+
+#define PROGRESS_RUNNING 0
+#define PROGRESS_REQUEST_STOP 1
+#define PROGRESS_ABORT 2
+#define PROGRESS_COMPLETE 3
+
+#define SERVICE_RUNNING 0
+#define SERVICE_COMPLETE 1

 enum { proxyRecv=0, proxySend=1 };

@@ -50,7 +60,7 @@ static void expectedProxyResponseFree(struct ncclProxyState* state) {
  }
 }

-static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, void* opId, void* respBuff, int respSize) {
+static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, void* opId, void* respBuff, int respSize, ncclResult_t res) {
  struct ncclExpectedProxyResponse* elem = state->expectedResponses;
  while (elem) {
    if (elem->opId == opId) {
@@ -67,6 +77,7 @@ static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, voi
      memcpy(elem->respBuff, respBuff, respSize);
      free(respBuff);
      elem->done = true;
+      elem->res  = res;
      return ncclSuccess;
    }
    elem = elem->next;
@@ -84,6 +95,7 @@ static ncclResult_t expectedProxyResponseEnqueue(struct ncclProxyState* state, v
  // Pre-alloc response buffer
  ex->respBuff = malloc(respSize);
  ex->respSize = respSize;
+  ex->res      = ncclInternalError;
  ex->done     = false;

  // Enqueue
@@ -109,10 +121,11 @@ static ncclResult_t expectedProxyResponseDequeue(struct ncclProxyState* state, v
        prev->next = elem->next;
      }
      memcpy(respBuff, elem->respBuff, elem->respSize);
+      ncclResult_t res = elem->res;
      free(elem->respBuff);
      free(elem);
      *found = 1;
-      return ncclSuccess;
+      return res;
    }
    prev = elem;
    elem = elem->next;
@@ -509,7 +522,7 @@ static ncclResult_t SaveProxy(struct ncclComm* comm, struct ncclChannel* channel
        type == proxyRecv ? "recv" : "send", peer, channel->id, connIndex);
    return ncclInternalError;
  }
-  if (connector->transportComm->proxyProgress == NULL) return ncclSuccess;
+  if (connector->proxyConn.proxyProgress == NULL) return ncclSuccess;

  if (justInquire) *justInquire = true;
  else {
@@ -707,13 +720,13 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int

  if (state->active == NULL) {
    pthread_mutex_lock(&pool->mutex);
-    while (pool->nextOps == -1 && !state->stop) {
+    while (pool->nextOps == -1 && state->stop == PROGRESS_RUNNING) {
      struct ncclProxyArgs profArgs; // Only used for profiling purposes
      ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileSleep);
      pthread_cond_wait(&pool->cond, &pool->mutex);
      ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileWakeup);
    }
-    if (state->stop) { // We might have been woken up to stop.
+    if (state->stop != PROGRESS_RUNNING) { // We might have been woken up to stop.
      pthread_mutex_unlock(&pool->mutex);
      return ncclSuccess;
    }
@@ -851,12 +864,13 @@ void* ncclProxyProgress(void *proxyState_) {
   * frequency of calling ncclProxyGetPostedOps() and reduce the perf impact. */
  int proxyOpAppendCounter = 0;
  struct ncclProxyArgs profArgs; // Only used for profiling purposes
-  while ((state->stop == false || (state->stop == true && state->active)) && *proxyState->abortFlag == 0) {
+  while (state->stop == PROGRESS_RUNNING || (state->stop == PROGRESS_REQUEST_STOP && state->active)) {
    int idle = 1;
    ncclResult_t ret = progressOps(proxyState, state, state->active, &idle);
    if (ret != ncclSuccess) {
-      INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
-      return NULL;
+      __atomic_store_n(&proxyState->asyncResult, ret, __ATOMIC_RELEASE);
+      INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret);
+      continue;
    }
    if (lastIdle == 0 && idle == 1) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileIdle);
    if (lastIdle == 1 && idle == 0) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileActive);
@@ -864,11 +878,12 @@ void* ncclProxyProgress(void *proxyState_) {
      int added = 0;
      proxyOpAppendCounter = 0;
      TIME_START(3);
-      if (state->stop == false)
+      if (state->stop == PROGRESS_RUNNING)
        ret = ncclProxyGetPostedOps(proxyState, &added);
      if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); }
      if (ret != ncclSuccess) {
-        INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
+        __atomic_store_n(&proxyState->asyncResult, ret, __ATOMIC_RELEASE);
+        INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret);
      }
      if (added == 0) {
        sched_yield(); // No request progressed. Let others run.
@@ -876,6 +891,9 @@ void* ncclProxyProgress(void *proxyState_) {
    }
    lastIdle = idle;
  }
+
+  /* progress service thread should be waiting for me, I need to notify it. */
+  __atomic_store_n(&state->stop, PROGRESS_COMPLETE, __ATOMIC_RELEASE);
  return NULL;
 }

@@ -898,7 +916,11 @@ ncclResult_t ncclProxyStart(struct ncclComm* comm) {
 static ncclResult_t ncclProxyProgressCreate(struct ncclProxyState* proxyState) {
  struct ncclProxyProgressState* state = &proxyState->progressState;
  if (!state->thread) {
-    pthread_create(&state->thread, NULL, ncclProxyProgress, proxyState);
+    pthread_attr_t attr;
+    SYSCHECK(pthread_attr_init(&attr), "pthread_attr_init");
+    SYSCHECK(pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED), "pthread_attr_setdetachstate");
+    SYSCHECK(pthread_create(&state->thread, &attr, ncclProxyProgress, proxyState), "pthread_create");
+    SYSCHECK(pthread_attr_destroy(&attr), "pthread_attr_destroy");
    ncclSetThreadName(state->thread, "NCCL Progress%2d", proxyState->tpLocalnRanks);
  }
  return ncclSuccess;
@@ -910,10 +932,17 @@ ncclResult_t ncclProxyProgressDestroy(struct ncclProxyState* proxyState) {
  // Request the proxy to stop and then wake it
  if (state->opsPool) {
    pthread_mutex_lock(&state->opsPool->mutex);
-    state->stop = true;
+    if (*proxyState->abortFlag == 0) 
+      state->stop = PROGRESS_REQUEST_STOP;
+    else
+      state->stop = PROGRESS_ABORT;
    pthread_cond_signal(&state->opsPool->cond);
    pthread_mutex_unlock(&state->opsPool->mutex);
-    pthread_join(state->thread, NULL);
+    /* progress thread is always detached, wait for it to exit. */
+    uint64_t t0 = clockNano();
+    while (__atomic_load_n(&state->stop, __ATOMIC_ACQUIRE) != PROGRESS_COMPLETE) {
+      if (clockNano() - t0 >= 1000) sched_yield();
+    }
  }

  // Free off any memory allocated for the proxy arg pools
@@ -1005,7 +1034,7 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
  int ready, proxyRank = -1;
  struct ncclProxyState* sharedProxyState = comm->proxyState;

-  // Keep one connection per mlocal rank
+  // Keep one connection per local rank
  for (int i = 0; i < comm->localRanks; ++i) {
    /* find the proxy rank in comm. */
    if (comm->topParentRanks[comm->localRankToRank[i]] == tpProxyRank) {
@@ -1058,42 +1087,43 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
      proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1;
    }
  }
-  INFO(NCCL_NET|NCCL_PROXY, "Connection to proxy localRank %d -> connection %p", proxyConn->tpLocalRank, proxyConn->connection);
+  INFO(NCCL_NET|NCCL_PROXY, "Connected to proxy localRank %d -> connection %p", proxyConn->tpLocalRank, proxyConn->connection);
  return ncclSuccess;
 }

 // cuMem API support
 // The response is sent out-of-band using ncclIpcSocket for this specific command
-ncclResult_t ncclProxyClientConvertFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int fd, int* convertedFd) {
+ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void *handle, int* convertedFd) {
  ncclResult_t ret = ncclSuccess;
  ncclResult_t res = ncclInProgress;
  struct ncclIpcSocket ipcSock = { 0 };
-  void* opId = malloc(1);
+  void *opId = (void*)((((uintptr_t)random()) << 32) | random());
+
  // Create a UDS socket to receive the converted fd
  NCCLCHECK(ncclIpcSocketInit(&ipcSock, comm->topParentLocalRanks[comm->localRank], (uint64_t)opId, comm->abortFlag));

-  // Request the conversion of the fd over sockets
-  NCCLCHECKGOTO(ncclProxyCallAsync(comm, proxyConn, ncclProxyMsgConvertFd, &fd, sizeof(int), 0, opId), ret, error);
+  // Request the allocation of a UDS fd for the handle over sockets
+  NCCLCHECKGOTO(ncclProxyCallAsync(comm, proxyConn, ncclProxyMsgGetFd, handle, sizeof(CUmemGenericAllocationHandle), 0, opId), ret, error);

-  // Receive converted fd over UDS
-  NCCLCHECK(ncclIpcSocketRecvFd(&ipcSock, convertedFd));
-  TRACE(NCCL_PROXY, "UDS: ConvertFd rank %d returned %p %d", proxyConn->tpLocalRank, convertedFd, *convertedFd);
-  NCCLCHECK(ncclIpcSocketClose(&ipcSock));
+  // Receive the converted fd over UDS
+  NCCLCHECKGOTO(ncclIpcSocketRecvFd(&ipcSock, convertedFd), ret, error);
+  TRACE(NCCL_PROXY, "UDS: ClientGetFd handle 0x%lx rank %d returned fd %d", *(uint64_t*)handle, proxyConn->tpLocalRank, *convertedFd);
+  NCCLCHECKGOTO(ncclIpcSocketClose(&ipcSock), ret, error);

+  // Wait for proxy response (sockets)
  while (res == ncclInProgress) {
    res = ncclPollProxyResponse(comm, proxyConn, NULL, opId);
  }

-  free(opId);
-  return res;
+  return ret;

 error:
  NCCLCHECK(ncclIpcSocketClose(&ipcSock));
-  WARN("ncclProxyClientConvertFd call to top parent rank %d failed", proxyConn->tpRank);
+  WARN("ncclProxyClientGetFd call to rank %d handle 0x%lx failed : %d", proxyConn->tpRank, *(uint64_t*)handle, ret);
  return ret;
 }

-const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "ConvertFd" };
+const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "GetFd" };
 ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId) {
  struct ncclSocket* sock;
  ncclResult_t ret = ncclSuccess;
@@ -1132,14 +1162,13 @@ ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnec

  // Check response queue
  int found = 0;
-  NCCLCHECK(expectedProxyResponseDequeue(sharedProxyState, opId, respBuff, &found));
+  ncclResult_t res = expectedProxyResponseDequeue(sharedProxyState, opId, respBuff, &found);
  if (found == 0) {
    // Attempt to read in a new response header from the proxy thread
    struct ncclSocket* sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank;
-
-    void* recvOpId;
+    ncclProxyRpcResponseHeader resp = {0};
    int offset = 0;
-    if (ncclSuccess != ncclSocketProgress(NCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset)) {
+    if (ncclSuccess != ncclSocketProgress(NCCL_SOCKET_RECV, sock, &resp, sizeof(resp), &offset)) {
      WARN("Socket recv failed while polling for opId=%p", opId);
      return ncclInternalError;
    }
@@ -1147,42 +1176,38 @@ ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnec
    if (offset == 0) {
      return ncclInProgress;
    // If we've returned a partial response, block to receive the rest of it
-    } else if (offset < sizeof(recvOpId)) {
-      while (offset < sizeof(recvOpId))
-        NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &recvOpId, sizeof(recvOpId), &offset));
+    } else if (offset < sizeof(resp)) {
+      while (offset < sizeof(resp))
+        NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &resp, sizeof(resp), &offset));
    }

-    INFO(NCCL_PROXY, "ncclPollProxyResponse Received new opId=%p", recvOpId);
-
-    // Now do a blocking recv of the response size
-    int respSize = 0;
-    NCCLCHECK(ncclSocketRecv(sock, &respSize, sizeof(respSize)));
+    INFO(NCCL_PROXY, "ncclPollProxyResponse Received new opId=%p", resp.opId);

    // If there's a respSize to recv
-    if (respSize > 0) {
-      if (recvOpId != opId) {
+    if (resp.respSize > 0) {
+      if (resp.opId != opId) {
        // Unexpected response, need to buffer the socket data
-        respBuff = malloc(respSize);
+        respBuff = malloc(resp.respSize);
      }
      assert(respBuff != NULL);
-      NCCLCHECK(ncclSocketRecv(sock, respBuff, respSize));
+      NCCLCHECK(ncclSocketRecv(sock, respBuff, resp.respSize));
    }

-    if (recvOpId == opId) {
-      INFO(NCCL_PROXY, "recvOpId=%p matches expected opId=%p", recvOpId, opId);
-      NCCLCHECK(expectedProxyResponseRemove(sharedProxyState, recvOpId));
-      return ncclSuccess;
+    if (resp.opId == opId) {
+      INFO(NCCL_PROXY, "resp.opId=%p matches expected opId=%p", resp.opId, opId);
+      NCCLCHECK(expectedProxyResponseRemove(sharedProxyState, resp.opId));
+      return resp.res;
    } else {
-      INFO(NCCL_PROXY, "Queuing opId=%p respBuff=%p respSize=%d", recvOpId, respBuff, respSize);
+      INFO(NCCL_PROXY, "Queuing opId=%p respBuff=%p respSize=%d", resp.opId, respBuff, resp.respSize);
      // Store the result and mark response as completed
-      NCCLCHECK(expectedProxyResponseStore(sharedProxyState, recvOpId, respBuff, respSize));
+      NCCLCHECK(expectedProxyResponseStore(sharedProxyState, resp.opId, respBuff, resp.respSize, resp.res));
      return ncclInProgress;
    }
  } else {
    INFO(NCCL_PROXY, "ncclPollProxyResponse Dequeued cached opId=%p", opId);
  }

-  return ncclSuccess;
+  return res;
 }

 ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) {
@@ -1284,38 +1309,52 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr
 }

 // cuMem API support
-static ncclResult_t proxyConvertFd(struct ncclProxyLocalPeer* peer, void *opId, struct ncclProxyState* proxyState, int fd) {
+static ncclResult_t proxyGetFd(struct ncclProxyLocalPeer* peer, void *opId, struct ncclProxyState* proxyState, uint64_t handle) {
+#if CUDART_VERSION >= 11030
+  // cuMem API support
+  ncclResult_t ret = ncclSuccess;
  struct ncclIpcSocket ipcSock = { 0 };
  uint64_t hash = (uint64_t) opId;
+  INFO(NCCL_PROXY, "UDS proxyGetFd received handle 0x%lx peer %d opId %lx", handle, peer->tpLocalRank, hash);

-  INFO(NCCL_PROXY, "UDS proxyConvertFd received fd %d peer %d opId %lx", fd, peer->tpLocalRank, hash);
+  CUmemAllocationHandleType type = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
+  int fd = -1;
+
+  CUCHECK(cuMemExportToShareableHandle(&fd, handle, type, 0));
  // Send back the converted fd using UDS
-  NCCLCHECK(ncclIpcSocketInit(&ipcSock, proxyState->tpRank, hash^1, proxyState->abortFlag));
-  NCCLCHECK(ncclIpcSocketSendFd(&ipcSock, fd, peer->tpLocalRank, hash));
+  NCCLCHECKGOTO(ncclIpcSocketInit(&ipcSock, proxyState->tpRank, hash^1, proxyState->abortFlag), ret, error);
+  NCCLCHECKGOTO(ncclIpcSocketSendFd(&ipcSock, fd, peer->tpLocalRank, hash), ret, error);
+error:
  NCCLCHECK(ncclIpcSocketClose(&ipcSock));
-  return ncclSuccess;
+  // We can now safely close the exported fd
+  (void) close(fd);
+  return ret;
+#else
+  return ncclInternalError;
+#endif
 }

 static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclProxyState* proxyState, int* asyncOpCount, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool) {
  int done = 1;
+  ncclResult_t res = ncclInternalError;
  if (op->type == ncclProxyMsgSetup) {
    TRACE(NCCL_PROXY, "proxyProgressAsync::proxySetup() opId=%p", op->opId);
-    NCCLCHECK(op->connection->tcomm->proxySetup(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
+    res = op->connection->tcomm->proxySetup(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done);
  } else if (op->type == ncclProxyMsgConnect) {
    TRACE(NCCL_PROXY, "proxyProgressAsync::proxyConnect() opId=%p op.reqBuff=%p", op->opId, op->reqBuff);
-    NCCLCHECK(op->connection->tcomm->proxyConnect(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
+    res = op->connection->tcomm->proxyConnect(op->connection, proxyState, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done);
  } else if (op->type == ncclProxyMsgSharedInit) {
    int nChannels = (int) *op->reqBuff;
    TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgSharedInit opId=%p op.reqBuff=%p nChannels=%d", op->opId, op->reqBuff, nChannels);
-    if (op->connection->tcomm->proxySharedInit) NCCLCHECK(op->connection->tcomm->proxySharedInit(op->connection, proxyState, nChannels));
+    if (op->connection->tcomm->proxySharedInit) res = op->connection->tcomm->proxySharedInit(op->connection, proxyState, nChannels);
    __atomic_store_n(&op->connection->state, connSharedInitialized, __ATOMIC_RELEASE);
-  } else if (op->type == ncclProxyMsgConvertFd) {
-    int fd = *(int *)op->reqBuff;
-    TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgConvertFd opId=%p op.reqBuff=%p fd=%d", op->opId, op->reqBuff, fd);
-    NCCLCHECK(proxyConvertFd(peer, op->opId, proxyState, fd)); // cuMem API support
+  } else if (op->type == ncclProxyMsgGetFd) {
+    uint64_t handle = *(uint64_t*)op->reqBuff;
+    TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgGetFd opId=%p op.reqBuff=%p handle=0x%lx", op->opId, op->reqBuff, handle);
+    res = proxyGetFd(peer, op->opId, proxyState, handle); // cuMem API support
  } else if (op->type == ncclProxyMsgInit) {
    TRACE(NCCL_PROXY, "proxyProgressAsync::ncclProxyMsgInit opId=%p op.reqBuff=%p", op->opId, op->reqBuff);
-    NCCLCHECK(proxyConnInit(peer, connectionPool, proxyState, (ncclProxyInitReq*) op->reqBuff, (ncclProxyInitResp*) op->respBuff, &op->connection));
+    res = proxyConnInit(peer, connectionPool, proxyState, (ncclProxyInitReq*) op->reqBuff, (ncclProxyInitResp*) op->respBuff, &op->connection);
  } else return ncclInternalError;

  if (done) {
@@ -1329,11 +1368,10 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclP
     * to abort and close the connection, it can cause segfault if the requester is using
     * the respBuff. */

-    // Send the opId for referencing async operation
-    NCCLCHECK(ncclSocketSend(op->connection->sock, &op->opId, sizeof(op->opId)));
+    ncclProxyRpcResponseHeader resp = {op->opId, res, op->respSize};

-    // Send the response size
-    NCCLCHECK(ncclSocketSend(op->connection->sock, &op->respSize, sizeof(op->respSize)));
+    // Send the opId for referencing async operation
+    NCCLCHECK(ncclSocketSend(op->connection->sock, &resp, sizeof(resp)));

    if (op->respSize) {
      // Send the response
@@ -1386,7 +1424,7 @@ static bool proxyMatchOpType(int type) {
    case ncclProxyMsgSharedInit:
    case ncclProxyMsgSetup:
    case ncclProxyMsgConnect:
-    case ncclProxyMsgConvertFd:
+    case ncclProxyMsgGetFd:
      return true;
    default:
      return false;
@@ -1544,6 +1582,19 @@ void* ncclProxyService(void* _args) {
  ncclSocketClose(proxyState->listenSock);
  free(proxyState->listenSock);
  proxyOpsFree(proxyState);
+
+  if (*proxyState->abortFlag) {
+    /* abort happened, need to notify main thread I am done. */
+    __atomic_store_n(&proxyState->stop, SERVICE_COMPLETE, __ATOMIC_RELEASE);
+  }
+
+  if (ncclAtomicRefCountDecrement(proxyState->abortFlagRefCount) == 0) {
+    ncclCudaHostFree((void *)proxyState->abortFlag);
+    free((void*)proxyState->abortFlagRefCount);
+  }
+
+  /* proxy itself holds one internal ref count, needs to call ncclProxyDestroy */
+  ncclProxyDestroy(proxyState);
  return NULL;
 }

@@ -1552,8 +1603,16 @@ ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union
  NCCLCHECK(ncclCalloc(&comm->sharedRes->proxyState, 1));
  comm->proxyState = comm->sharedRes->proxyState;
  comm->proxyState->refCount = 1;
+  /* ref count for communicator and proxy service thread. */
+  comm->proxyState->internalRefCount = 2;
  comm->proxyState->listenSock = sock;
  comm->proxyState->peerAddresses = peerAddresses;
+  // Seed the random number generator for UDS filename generation
+  struct timeval time;
+  gettimeofday(&time,NULL);
+  unsigned int seed = time.tv_sec*time.tv_usec;
+  seed ^= getpid();
+  srandom(seed);
  return ncclSuccess;
 }

@@ -1568,6 +1627,8 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
    proxyState->tpLocalnRanks = comm->localRanks;
    proxyState->cudaDev = comm->cudaDev;
    proxyState->abortFlag = comm->abortFlag;
+    proxyState->abortFlagRefCount = comm->abortFlagRefCount;
+    ncclAtomicRefCountIncrement(comm->abortFlagRefCount);
    proxyState->p2pnChannels = comm->p2pnChannels;
    proxyState->p2pChunkSize = comm->p2pChunkSize;
    proxyState->nChannels = comm->nChannels;
@@ -1584,8 +1645,8 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
 }

 ncclResult_t ncclProxyStop(struct ncclComm* comm) {
-  if (comm->sharedRes && comm->sharedRes->proxyState) {
-    struct ncclProxyState* sharedProxyState = comm->sharedRes->proxyState;
+  if (comm->proxyState) {
+    struct ncclProxyState* sharedProxyState = comm->proxyState;

    if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) {
      if (sharedProxyState->peerAddresses) {
@@ -1625,15 +1686,41 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
  return ncclSuccess;
 }

-ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
-  struct ncclProxyState* sharedProxyState = comm->sharedRes->proxyState;
-
-  assert(sharedProxyState->refCount == 0);
-  free(sharedProxyState->peerAddresses);
-  free(sharedProxyState->peerSocks);
-  free(sharedProxyState->proxyOps);
-  free(sharedProxyState->sharedDevMems);
-  expectedProxyResponseFree(sharedProxyState);
-  free(sharedProxyState);
+ncclResult_t ncclProxyDestroy(struct ncclProxyState *proxyState) {
+  if (__atomic_sub_fetch(&proxyState->internalRefCount, 1, __ATOMIC_ACQ_REL) == 0) {
+    free(proxyState->peerAddresses);
+    free(proxyState->peerSocks);
+    free(proxyState->proxyOps);
+    free(proxyState->sharedDevMems);
+    expectedProxyResponseFree(proxyState);
+    free(proxyState);
+  }
+  return ncclSuccess;
+}
+
+/* in case of abort: join the proxy service thread if it completes within about 5 secs, otherwise detach it */
+ncclResult_t ncclProxyTryDetach(struct ncclProxyState *proxyState) {
+  if (proxyState && proxyState->thread) {
+    /* The proxy service thread can call cudaFreeHost to free pinned host mem, but
+     * that can cause a hang if the main thread is issuing other cuda calls. The proper
+     * solution is to allocate/free pinned host mem using the cuMem* driver API; waiting
+     * 5 secs here is just a workaround for now. */
+    bool join = false;
+    struct timespec start, now;
+    clock_gettime(CLOCK_MONOTONIC, &start);
+    do {
+      clock_gettime(CLOCK_MONOTONIC, &now);
+      if (__atomic_load_n(&proxyState->stop, __ATOMIC_ACQUIRE) == SERVICE_COMPLETE) {
+        /* proxy thread is done, join it. */
+        pthread_join(proxyState->thread, NULL);
+        join = true;
+        break;
+      }
+    } while(now.tv_sec - start.tv_sec < 5);
+    
+    if (join == false) {
+      pthread_detach(proxyState->thread);
+    }
+  }
  return ncclSuccess;
 }
--- a/Daha Fazla Göster
+++ b/Daha Fazla Göster