Merge remote-tracking branch 'nccl/master' into develop

2025-06-20 07:53:59 -05:00
@@ -2,7 +2,7 @@

 Full documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs.io)

-## Unreleased - RCCL 2.25.1 for ROCm 7.0.0
+## Unreleased - RCCL 2.26.6 for ROCm 7.0.0

 ### Resolved issues

@@ -29,6 +29,7 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
 * Compatibility with NCCL 2.23.4
 * Compatibility with NCCL 2.24.3
 * Compatibility with NCCL 2.25.1
+* Compatibility with NCCL 2.26.6

 ## RCCL 2.22.3 for ROCm 6.4.1

@@ -423,7 +423,6 @@ set(SRC_FILES
  src/init.cc
  src/init_nvtx.cc
  src/mnnvl.cc
-  src/net.cc
  src/msccl.cc
  src/proxy.cc
  src/rccl_wrap.cc
@@ -491,9 +490,6 @@ set(SRC_FILES
  src/include/ipcsocket.h
  src/include/mnnvl.h
  src/include/nccl_common.h
-  src/include/nccl_net.h
-  src/include/nccl_profiler.h
-  src/include/nccl_tuner.h
  src/include/net_device.h
  src/include/net.h
  src/include/nvmlwrap.h
@@ -566,6 +562,25 @@ set(SRC_FILES
  src/include/nvtx3/nvtxDetail/nvtxInitDefs.h
  src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h
  src/include/nvtx3/nvtxDetail/nvtxTypes.h
+  src/include/plugin/nccl_net.h
+  src/include/plugin/nccl_profiler.h
+  src/include/plugin/nccl_tuner.h
+  src/include/plugin/plugin.h
+  src/include/plugin/net/net_v6.h
+  src/include/plugin/net/net_v7.h
+  src/include/plugin/net/net_v8.h
+  src/include/plugin/net/net_v9.h
+  src/include/plugin/net/net_v10.h
+  src/include/plugin/profiler/net_ib_v1.h
+  src/include/plugin/profiler/net_ib.h
+  src/include/plugin/profiler/net_socket_v1.h
+  src/include/plugin/profiler/net_socket.h
+  src/include/plugin/profiler/profiler_v1.h
+  src/include/plugin/profiler/profiler_v2.h
+  src/include/plugin/profiler/profiler_v3.h
+  src/include/plugin/tuner/tuner_v2.h
+  src/include/plugin/tuner/tuner_v3.h
+  src/include/plugin/tuner/tuner_v4.h
  src/misc/alt_rsmi.cc
  src/misc/archinfo.cc
  src/misc/argcheck.cc
@@ -580,7 +595,6 @@ set(SRC_FILES
 # src/misc/nvmlwrap.cc
  src/misc/nvmlwrap_stub.cc
  src/misc/param.cc
-  src/misc/profiler.cc
  src/misc/rocm_smi_wrap.cc
  src/misc/rocmwrap.cc
  src/misc/roctx.cc
@@ -589,12 +603,26 @@ set(SRC_FILES
  src/misc/signals.cc
  src/misc/socket.cc
  src/misc/strongstream.cc
-  src/misc/tuner.cc
  src/misc/utils.cc
  src/misc/msccl/msccl_lifecycle.cc
  src/misc/msccl/msccl_parser.cc
  src/misc/msccl/msccl_setup.cc
  src/misc/msccl/msccl_status.cc
+  src/plugin/net.cc
+  src/plugin/plugin_open.cc
+  src/plugin/profiler.cc
+  src/plugin/tuner.cc
+  src/plugin/net/net_v6.cc
+  src/plugin/net/net_v7.cc
+  src/plugin/net/net_v8.cc
+  src/plugin/net/net_v9.cc
+  src/plugin/net/net_v10.cc
+  src/plugin/profiler/profiler_v1.cc
+  src/plugin/profiler/profiler_v2.cc
+  src/plugin/profiler/profiler_v3.cc
+  src/plugin/tuner/tuner_v2.cc
+  src/plugin/tuner/tuner_v3.cc
+  src/plugin/tuner/tuner_v4.cc
  src/ras/client.cc
  src/ras/client_support.cc
  src/ras/collectives.cc
@@ -612,6 +640,7 @@ set(SRC_FILES
  src/transport/net_socket.cc
  src/transport/nvls.cc
  src/transport/p2p.cc
+  src/transport/profiler.cc
  src/transport/shm.cc
 )

@@ -737,6 +766,7 @@ add_dependencies(rccl git_version_check)                                      #
 target_include_directories(rccl PRIVATE ${PROJECT_BINARY_DIR}/include)        # for generated rccl.h header
 target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src)                    # for hipfied headers
 target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include)
+target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/plugin)
 target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/device)
 target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/device/network/unpack)
 target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/gensrc)
@@ -1024,7 +1054,7 @@ endif()
 #==================================================================================================
 ## Specify install targets
 rocm_install_targets(TARGETS rccl)
-rocm_install(FILES       ${PROJECT_BINARY_DIR}/include/rccl/rccl.h src/include/nccl_net.h
+rocm_install(FILES       ${PROJECT_BINARY_DIR}/include/rccl/rccl.h src/include/plugin/nccl_net.h
             DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rccl)
 rocm_install(FILES       src/include/api_trace.h
             DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rccl/amd_detail)
@@ -1050,10 +1080,10 @@ if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY)

  ### install the wrapper header file to package
  rocm_install(
-    FILES ${PROJECT_BINARY_DIR}/rccl/include/rccl.h src/include/nccl_net.h
+    FILES ${PROJECT_BINARY_DIR}/rccl/include/rccl.h src/include/plugin/nccl_net.h
    DESTINATION "./rccl/${CMAKE_INSTALL_INCLUDEDIR}/" )
  rocm_install(
-    FILES ${PROJECT_BINARY_DIR}/include/rccl.h src/include/nccl_net.h
+    FILES ${PROJECT_BINARY_DIR}/include/rccl.h src/include/plugin/nccl_net.h
    DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/" )
 endif()

@@ -60,20 +60,20 @@ of newer ones.
 The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions
 from old API versions. It also provides error codes in `err.h`.

-# API (v9)
+# API (v10)

-Below is the main `ncclNet_v9` struct. Each function is explained in later sections.
+Below is the main `ncclNet_v10` struct. Each function is explained in later sections.

 ```
 typedef struct {
  // Name of the network (mainly for logs)
  const char* name;
  // Initialize the network.
-  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
  // Return the number of adapters.
  ncclResult_t (*devices)(int* ndev);
  // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props);
  // Create a receiving object and provide a handle to connect to it. The
  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
  // between ranks to create a connection.
@@ -83,13 +83,13 @@ typedef struct {
  // should return successfully with sendComm == NULL with the expectation that
  // it will be called again until sendComm != NULL.
  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
-  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
+  ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm);
  // Finalize connection establishment after remote peer has called connect.
  // This call must not block for the connection to be established, and instead
  // should return successfully with recvComm == NULL with the expectation that
  // it will be called again until recvComm != NULL.
  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
-  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm);
  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
@@ -98,10 +98,10 @@ typedef struct {
  ncclResult_t (*deregMr)(void* comm, void* mhandle);
  // Asynchronous send to a peer.
  // May return request == NULL if the call cannot be performed (or would block)
-  ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request);
+  ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request);
  // Asynchronous recv from a peer.
  // May return request == NULL if the call cannot be performed (or would block)
-  ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request);
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request);
  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
  // visible to the GPU
  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
@@ -200,6 +200,9 @@ the plugin code adding the following definitions:
 #define INFO(FLAGS, ...) logFunction(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
 ```

+The `ncclProfilerCallback_t` argument is an NCCL core callback that allows the plugin to define and
+record its own events with the NCCL profiler plugin.
+
 `devices`

 Once the plugin is initialized, NCCL will query the number of devices available. It should not
@@ -301,6 +304,11 @@ the `listen` call previously. If the sender did not connect yet, `accept` should
 should return `ncclSuccess`, setting `recvComm` to `NULL`. NCCL will call `accept` again until it
 succeeds.

+The `connect` API takes a `ncclNetCommConfig_t`, which contains a `trafficClass` field.
+This field can be used by the network plugin to specify the QoS level of the connection. By default,
+`trafficClass` is set to -1 but can be configured by the application during communicator initialization
+to select a plugin-supported QoS level.
+
 `closeListen`/`closeSend`/`closeRecv`

 Once a `listenComm`/`sendComm`/`recvComm` is no longer needed, NCCL will call
@@ -354,6 +362,9 @@ The `isend` operation returns a handle in the `request` argument for further cal
 the `isend` operation cannot be initiated, `request` can be set to `NULL` and NCCL will call
 `isend` again later.

+The `pHandle` argument allows NCCL to pass an opaque handle that can be used by the network plugin
+to support network-defined events.
+
 `irecv`

 To receive data, NCCL will call `irecv` with the `recvComm` returned by `accept`. The argument
@@ -375,6 +386,9 @@ of irecv and is resilient to redundant network writes. This allows the plugin to
 completions on such irecvs (for example, complete the request immediately). The plugin is still
 expected to set a valid request pointer on return which NCCL can poll to check for completion.

+The `pHandles` argument allows NCCL to pass an array of opaque handles that can be used by the
+network plugin to support network-defined events.
+
 Note: for a given connection, send/receive operations should always match in the order they were
 posted. Tags provided for receive operations are only used to assign a given send operation to one
 of the buffers of the first (multi-)receive in the queue, not to allow for out-of-order tag
@@ -2,14 +2,15 @@
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 */

-#ifndef NCCL_NET_H_
-#define NCCL_NET_H_
+#ifndef NET_H_
+#define NET_H_

 #include <stdint.h>
 #include <stdlib.h>

 #include "common.h"
 #include "err.h"
+#include "net_device.h"

 #define NCCL_NET_HANDLE_MAXSIZE 128
 #define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) //1TB
@@ -22,6 +23,9 @@
 // Maximum number of requests per comm object
 #define NCCL_NET_MAX_REQUESTS 32

+typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData);
+
+#include "net_v10.h"
 #include "net_v9.h"
 #include "net_v8.h"
 #include "net_v7.h"
@@ -31,4 +35,9 @@
 #include "net_v3.h"
 #include "net_v2.h"

+typedef ncclNet_v10_t ncclNet_t;
+typedef ncclNetProperties_v10_t ncclNetProperties_t;
+typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t;
+typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t;
+
 #endif // end include guard
@@ -26,6 +26,7 @@ typedef struct {

 typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
 typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
-typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t;
+typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t;
+typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t;

 #endif
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_V10_H_
+#define NET_V10_H_
+
+#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4 // capacity of the devs[] array below
+typedef struct {
+  int ndevs; // presumably the number of valid entries in devs[] -- TODO confirm
+  int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10];
+} ncclNetVDeviceProps_v10_t;
+
+
+#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1
+typedef struct {
+  // Plugin-specific traffic class (TC) value; the plugin may use it to select the QoS level of a connection.
+  int trafficClass; // documented default is -1, which matches NCCL_NET_TRAFFIC_CLASS_UNDEF
+} ncclNetCommConfig_v10_t;
+
+
+typedef struct {
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int regIsGlobal;                 // regMr is not tied to a particular comm
+  int forceFlush;                  // Force a flush on receives
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+  ncclNetVDeviceProps_v10_t vProps; // Virtual-device description (ndevs/devs[]); same type that makeVDevice takes
+  size_t maxP2pBytes;              // Max transfer size for point-to-point operations
+  size_t maxCollBytes;             // Max transfer size for collective operations
+} ncclNetProperties_v10_t;
+
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network. profFunction is an NCCL core callback the plugin can use to record its own profiler events.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer. config carries per-connection settings (e.g. trafficClass).
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer. phandle is an opaque handle the plugin can use to support network-defined events.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request);
+  // Asynchronous recv from a peer. phandles is an array of opaque handles the plugin can use to support network-defined events.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+
+  // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
+  // what index this new vNIC exists at
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props);
+} ncclNet_v10_t;
+
+#endif // end include guard
@@ -2,8 +2,8 @@
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 */

-#ifndef NCCL_NET_V2_H_
-#define NCCL_NET_V2_H_
+#ifndef NET_V2_H_
+#define NET_V2_H_

 typedef struct {
  // Name of the network (mainly for logs)
@@ -2,8 +2,8 @@
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 */

-#ifndef NCCL_NET_V3_H_
-#define NCCL_NET_V3_H_
+#ifndef NET_V3_H_
+#define NET_V3_H_

 #define NCCL_NET_MAX_REQUESTS_V3 16

@@ -2,8 +2,8 @@
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 */

-#ifndef NCCL_NET_V4_H_
-#define NCCL_NET_V4_H_
+#ifndef NET_V4_H_
+#define NET_V4_H_

 #define NCCL_NET_HANDLE_MAXSIZE_V4 64

@@ -2,8 +2,8 @@
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 */

-#ifndef NCCL_NET_V5_H_
-#define NCCL_NET_V5_H_
+#ifndef NET_V5_H_
+#define NET_V5_H_

 typedef ncclNetProperties_v6_t ncclNetProperties_v5_t;
 typedef struct {
@@ -2,10 +2,8 @@
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 */

-#ifndef NCCL_NET_V6_H_
-#define NCCL_NET_V6_H_
-
-#define NCCL_NET_MAX_REQUESTS_V6 8
+#ifndef NET_V6_H_
+#define NET_V6_H_

 typedef struct {
  char* name;     // Used mostly for logging.
@@ -2,10 +2,8 @@
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 */

-#ifndef NCCL_NET_V7_H_
-#define NCCL_NET_V7_H_
-
-#include "net_device.h"
+#ifndef NET_V7_H_
+#define NET_V7_H_

 typedef struct {
  char* name;                      // Used mostly for logging.
@@ -2,10 +2,8 @@
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 */

-#ifndef NCCL_NET_V8_H_
-#define NCCL_NET_V8_H_
-
-#include "net_device.h"
+#ifndef NET_V8_H_
+#define NET_V8_H_

 typedef struct {
  char* name;                      // Used mostly for logging.
@@ -2,18 +2,14 @@
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 */

-#ifndef NCCL_NET_V9_H_
-#define NCCL_NET_V9_H_
-
-#include "net_device.h"
+#ifndef NET_V9_H_
+#define NET_V9_H_

 #define NCCL_NET_MAX_DEVS_PER_NIC_V9 4
-#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V9
 typedef struct {
  int ndevs;
  int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9];
 } ncclNetVDeviceProps_v9_t;
-typedef ncclNetVDeviceProps_v9_t ncclNetVDeviceProps_t;

 typedef struct {
  char* name;                      // Used mostly for logging.
@@ -35,8 +31,6 @@ typedef struct {
  size_t maxCollBytes;             // Max transfer size for collective operations
 } ncclNetProperties_v9_t;

-typedef ncclNetProperties_v9_t ncclNetProperties_t;
-
 typedef struct {
  // Name of the network (mainly for logs)
  const char* name;
@@ -93,7 +87,7 @@ typedef struct {

  // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
  // what index this new vNIC exists at
-  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props);
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v9_t* props);
 } ncclNet_v9_t;

 #endif // end include guard
@@ -11,7 +11,7 @@

 int max_requests = NCCL_NET_MAX_REQUESTS;

-__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction) { return ncclSuccess; }
+__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; }
 __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; }
 __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; }
 __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; }
@@ -52,13 +52,13 @@ __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) {
 }

 __hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; }
-__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginConnect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; }
 __hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** recvDevComm) { return ncclInternalError; }
 __hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) { return ncclInternalError; }
 __hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; }
 __hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;}
-__hidden ncclResult_t pluginIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { return ncclInternalError; }
-__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { return ncclInternalError; }
+__hidden ncclResult_t pluginIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request) { return ncclInternalError; }
+__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) { return ncclInternalError; }
 __hidden ncclResult_t pluginIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { return ncclInternalError; }
 __hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return ncclInternalError; }
 __hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; }
@@ -70,7 +70,7 @@ __hidden ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) {

 #define PLUGIN_NAME "Plugin"

-ncclNet_v9_t ncclNetPlugin_v9 = {
+const ncclNet_v10_t ncclNetPlugin_v10 = {
  .name = PLUGIN_NAME,
  .init = pluginInit,
  .devices = pluginDevices,
@@ -93,6 +93,51 @@ ncclNet_v9_t ncclNetPlugin_v9 = {
  .makeVDevice   = pluginMakeVDevice,
 };

+__hidden ncclResult_t pluginInit_v9(ncclDebugLogger_t logFunction) { // v9-signature init: forwards to pluginInit with no profiler callback
+  return pluginInit(logFunction, NULL);
+}
+
+__hidden ncclResult_t pluginGetProperties_v9(int dev, ncclNetProperties_v9_t* props) { // v9-signature adapter over pluginGetProperties
+  return pluginGetProperties(dev, (ncclNetProperties_t*)props); // NOTE(review): the cast assumes the v9 struct and the current ncclNetProperties_t share a layout -- confirm
+}
+
+__hidden ncclResult_t pluginConnect_v9(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm){ // v9-signature connect: v9 has no comm config, so NULL is passed for it
+  return pluginConnect(dev, NULL, handle, sendComm, sendDevComm);
+}
+
+__hidden ncclResult_t pluginIsend_v9(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { // v9-signature isend: v9 has no phandle, so NULL is passed for it
+  return pluginIsend(sendComm, data, size, tag, mhandle, NULL, request);
+}
+
+__hidden ncclResult_t pluginIrecv_v9(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { // v9-signature irecv: v9 has no phandles, so NULL is passed for them
+  return pluginIrecv(recvComm, n, data, sizes, tags, mhandles, NULL, request);
+}
+
+__hidden ncclResult_t pluginMakeVDevice_v9(int* d, ncclNetVDeviceProps_v9_t* props) { return ncclInternalError; } // v9 stub: unconditionally reports ncclInternalError
+
+const ncclNet_v9_t ncclNetPlugin_v9 = { // v9 compatibility table; the *_v9 entries adapt v9 signatures to the current implementations
+  .name = PLUGIN_NAME,
+  .init = pluginInit_v9,
+  .devices = pluginDevices,
+  .getProperties = pluginGetProperties_v9,
+  .listen = pluginListen,
+  .connect = pluginConnect_v9,
+  .accept = pluginAccept,
+  .regMr = pluginRegMr,
+  .regMrDmaBuf = pluginRegMrDmaBuf,
+  .deregMr = pluginDeregMr,
+  .isend = pluginIsend_v9,
+  .irecv = pluginIrecv_v9,
+  .iflush = pluginIflush,
+  .test = pluginTest,
+  .closeSend = pluginCloseSend,
+  .closeRecv = pluginCloseRecv,
+  .closeListen = pluginCloseListen,
+  .getDeviceMr = pluginGetDeviceMr,
+  .irecvConsumed = pluginIrecvConsumed,
+  .makeVDevice   = pluginMakeVDevice_v9, // this v9 entry always returns ncclInternalError
+};
+
 __hidden ncclResult_t pluginGetProperties_v8(int dev, ncclNetProperties_v8_t* props_v8) {
  ncclNetProperties_t props;
  ncclResult_t ret = pluginGetProperties(dev, &props);
@@ -113,22 +158,22 @@ __hidden ncclResult_t pluginGetProperties_v8(int dev, ncclNetProperties_v8_t* pr
 }

 __hidden ncclResult_t pluginIsend_v8(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
-  return pluginIsend(sendComm, data, (int)size, tag, mhandle, request);
+  return pluginIsend(sendComm, data, (int)size, tag, mhandle, NULL, request); // v8 has no profiler handle; NULL is passed for phandle
 }

 __hidden ncclResult_t pluginIrecv_v8(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
  size_t sizesOut[NCCL_PLUGIN_MAX_RECVS];
  for (int i=0; i<n; i++) sizesOut[i] = sizes[i];
-  return pluginIrecv(recvComm, 1, data, sizesOut, tags, mhandles, request);
+  return pluginIrecv(recvComm, n, data, sizesOut, tags, mhandles, NULL, request); // v8 shim: forward all n grouped receives (count was hard-coded to 1); v8 has no profiler handles, so phandles is NULL
 }

 const ncclNet_v8_t ncclNetPlugin_v8 = {
  .name = PLUGIN_NAME,
-  .init = pluginInit,
+  .init = pluginInit_v9,
  .devices = pluginDevices,
  .getProperties = pluginGetProperties_v8,
  .listen = pluginListen,
-  .connect = pluginConnect,
+  .connect = pluginConnect_v9,
  .accept = pluginAccept,
  .regMr = pluginRegMr,
  .regMrDmaBuf = pluginRegMrDmaBuf,
@@ -168,11 +213,11 @@ __hidden ncclResult_t pluginRegMr_v7(void* collComm, void* data, int size, int t

 const ncclNet_v7_t ncclNetPlugin_v7 = {
  .name = PLUGIN_NAME,
-  .init = pluginInit,
+  .init = pluginInit_v9,
  .devices = pluginDevices,
  .getProperties = pluginGetProperties_v7,
  .listen = pluginListen,
-  .connect = pluginConnect,
+  .connect = pluginConnect_v9,
  .accept = pluginAccept,
  .regMr = pluginRegMr_v7,
  .regMrDmaBuf = pluginRegMrDmaBuf,
@@ -209,7 +254,7 @@ __hidden ncclResult_t pluginAccept_v6(void* listenComm, void** recvComm) { retur

 const ncclNet_v6_t ncclNetPlugin_v6 = {
  .name = PLUGIN_NAME,
-  .init = pluginInit,
+  .init = pluginInit_v9,
  .devices = pluginDevices,
  .getProperties = pluginGetProperties_v6,
  .listen = pluginListen,
@@ -230,7 +275,7 @@ const ncclNet_v6_t ncclNetPlugin_v6 = {
 /* v5 Compat */
 const ncclNet_v5_t ncclNetPlugin_v5 = {
  .name = PLUGIN_NAME,
-  .init = pluginInit,
+  .init = pluginInit_v9,
  .devices = pluginDevices,
  .getProperties = pluginGetProperties_v6,
  .listen = pluginListen,
@@ -275,7 +320,7 @@ static ncclResult_t pluginConnect_v4(int dev, void* handle, void** sendComm) {
  ncclResult_t ret;
  do {
    ncclNetDeviceHandle_v7_t* handle = NULL;
-    ret = pluginConnect(dev, handle, sendComm, &handle);
+    ret = pluginConnect(dev, NULL, handle, sendComm, &handle);
  } while (ret == ncclSuccess && *sendComm == NULL);
  return ret;
 }
@@ -289,7 +334,7 @@ static ncclResult_t pluginAccept_v4(void* listenComm, void** recvComm) {
 }
 const ncclNet_v4_t ncclNetPlugin_v4 = {
  .name = PLUGIN_NAME,
-  .init = pluginInit,
+  .init = pluginInit_v9,
  .devices = pluginDevices,
  .getProperties = pluginGetProperties_v4,
  .listen = pluginListen,
@@ -318,7 +363,7 @@ static ncclResult_t pluginFlush(void* recvComm, void* data, int size, void* mhan
 }
 static ncclResult_t pluginInit_v3(ncclDebugLogger_t logFunction) {
  max_requests = NCCL_NET_MAX_REQUESTS_V3;
-  return pluginInit(logFunction);
+  return pluginInit(logFunction, NULL);
 }
 #include <string.h>
 static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) {
@@ -49,9 +49,9 @@ of newer ones.
 The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions
 from old API versions. It also provides error codes in `err.h`.

-# API (v2)
+# API (v3)

-Below is the main `ncclProfiler_v2` struct. Each function is explained in later sections.
+Below is the main `ncclProfiler_v3` struct. Each function is explained in later sections.

 ```
 typedef struct {
@@ -70,7 +70,7 @@ typedef struct {
  //  - eDescr : pointer to ncclProfilerEventDescr_t object
  // Output
  //  - eHandle: return event handle for supplied event descriptor object
-  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr);

  // stopEvent - stop/finalize an event inside and event set
  // Input
@@ -82,13 +82,13 @@ typedef struct {
  //  - eHandle   : handle to event object created through startEvent
  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
  //  - eState    : event state transition
-  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs);

  // finalize - finalize the profiler plugin
  // Input
  //  - context: opaque profiler context object
  ncclResult_t (*finalize)(void* context);
-} ncclProfiler_v2_t;
+} ncclProfiler_v3_t;
 ```

 ## Error codes
@@ -156,7 +156,6 @@ typedef struct {
      size_t count;         // data count
      int root;             // root rank
      const char* datatype; // string containing the name of the datatype
-      size_t trafficBytes;  // number of transfer bytes
      uint8_t nMaxChannels; // max number of channels for this collective
      uint8_t nWarps;       // number of GPU warps for this collective
      const char* algo;     // string containing name of the algorithm for this collective
@@ -185,12 +184,22 @@ typedef struct {
    struct {                // proxyStep events metadata
      int step;             // individual step in `ncclProxyOp`
    } proxyStep;
+
+    struct {
+      uint8_t channelId;    // id of the channel used by the kernel
+    } kernelCh;
+
+    struct {
+      int64_t id;           // net plugin id (used by net and profiler plugins to agree on event definitions)
+      void* data;           // pointer to network plugin defined event
+    } netPlugin;
  };
-} ncclProfilerEventDescr_v2_t;
+} ncclProfilerEventDescr_v3_t;
 ```

 NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`,
-`ncclProfileProxyOp`, `ncclProfileProxyStep`, and `ncclProfileProxyCtrl`.
+`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`, `ncclProfileKernelCh` and
+`ncclProfileNetPlugin`.

 #### stopEvent

@@ -236,7 +245,7 @@ typedef enum {
  ncclProfilerProxyCtrlWakeup,          // state marks proxy progress thread waking up
  ncclProfilerProxyCtrlAppend,          // state marks append of new network work item begin
  ncclProfilerProxyCtrlAppendEnd,       // state marks append of new network work item end
-} ncclProfilerEventState_v2_t;
+} ncclProfilerEventState_v3_t;
 ```

 `ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing
@@ -251,6 +260,89 @@ the channel. Thus, they provide a more fine-grained view w.r.t. ProxyOp events.
 network requests for the GPU kernel. This includes everything else that the proxy thread might be
 doing, including appending new `ncclProxyOp` objects to the list of work elements to process.

+`ncclProfileKernelCh` events are generated by the profiler proxy progress function while the kernel
+processes work items for the enqueued NCCL operations.
+
+`ncclProfileNetPlugin` events are generated by the network plugin. Network plugins are free to define
+their own set of events and communicate them to the profiler plugin using `ncclProfileNetPlugin` and
+the `ncclProfilerCallback_t` NCCL core callback. The network and profiler plugins can agree on the
+network defined event definition using the plugin id in the event descriptor. The plugin identifier
+is a 64-bit integer that has two parts: the 16 LSB are assigned to the plugin event version, the next
+16 bits are assigned to the plugin type (NCCL\_PROFILER\_NET\_TYPE\_IB, ...). The rest of the bits are
+unused and available for future extensions.
+
+A network IB plugin can use this infrastructure to define a QP event as:
+
+```C
+#define NCCL_PROFILER_NET_IB_VER 1
+
+enum {
+  ncclProfileQp = (1 << 0),
+};
+
+// The data structure version is encoded in the plugin identifier bitmask and
+// passed to NCCL core through the profiler callback. NCCL copies the plugin
+// identifier in the event descriptor before calling the profiler startEvent
+// function. The profiler should inspect the plugin id to find out the source
+// plugin as well as the version of the event struct
+typedef struct {
+  uint8_t type;        // event type (plugin defined)
+  union {
+    struct {
+      int device;      // network device id
+      uint64_t wr_id;  // work request id
+      int opcode;      // ibv opcode
+      int qpNum;       // QP number
+      size_t length;   // work request data length
+    } qp;
+  };
+} ncclProfilerNetIbDescr_v1_t;
+```
+
+The network event infrastructure is network agnostic. A different network socket plugin can
+use it to define a socket event as:
+
+```C
+#define NCCL_PROFILER_NET_SOCKET_VER 1
+
+enum {
+  ncclProfileSocket = (1 << 0),
+};
+
+// The data structure version is encoded in the plugin identifier bitmask and
+// passed to NCCL core through the profiler callback. NCCL copies the plugin
+// identifier in the event descriptor before calling the profiler startEvent
+// function. The profiler should inspect the plugin id to find out the source
+// plugin as well as the version of the event struct
+typedef struct {
+  uint8_t type;        // event type (plugin defined)
+  union {
+    struct {
+      int fd;
+      int op;
+      size_t length;
+    } sock;
+  };
+} ncclProfilerNetSockDescr_v1_t;
+```
+
+The network plugin creates an event (descriptor) and passes it to the profiler callback,
+along with the network type and version (plugin id). NCCL then creates a `ncclProfileNetPlugin`
+event descriptor, attaches the network plugin defined event as external data, and calls
+the profiler `startEvent` function.
+
+```C
+ncclResult_t isend(..., void* phandle, ...) {
+  ...
+  int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER;
+  ncclProfilerNetIbDescr_v1_t eDescr = { };
+  eDescr.type = ncclProfileQp;
+  eDescr.qp = { ... };
+  ncclProfilerCallback(&eHandle, 0 /* start net event */, phandle, pluginId, &eDescr);
+  ...
+}
+```
+
 State transitions for the events described can also come with event attribute updates. For this
 reason the profiler defines the `ncclProfilerEventStateArgs_t` struct, reported below.

@@ -264,7 +356,7 @@ typedef union {
  struct {                // attributes to update for ncclProfileProxyCtrl
    int appendedProxyOps; // number of appended proxy ops thus far
  } proxyCtrl;
-} ncclProfilerEventStateArgs_v2_t;
+} ncclProfilerEventStateArgs_v3_t;
 ```

 The example profiler in `ext-profiler/example` contains details on how to capture and use the events above.
@@ -279,14 +371,22 @@ Group event
   +- Collective event
   |  |
   |  +- ProxyOp event
-   |     |
-   |     +- ProxyStep event
+   |  |  |
+   |  |  +- ProxyStep event
+   |  |     |
+   |  |     +- NetPlugin event
+   |  |
+   |  +- KernelCh event
   |
   +- Point-to-point event
      |
      +- ProxyOp event
-         |
-         +- ProxyStep event
+      |  |
+      |  +- ProxyStep event
+      |     |
+      |     +- NetPlugin event
+      |
+      +- KernelCh event

 ProxyCtrl event
 ```
@@ -316,3 +416,17 @@ thread originating the operation. To avoid the profiler instance in the remote p
 dereference a pointer from another address space the event descriptor includes the PID of the originator.
 The profiler plugin needs to check that the originator PID matches the local PID before dereferencing the
 parent event.
+
+# Known Limitations
+
+In intra-node communication, or whenever a rank does not have any network activity, proxy events are
+unavailable and the profiler will only report the enqueue events (e.g., ncclAllReduce). The events from
+enqueue can be time stamped by the profiler (at start and stop) to reconstruct the execution time of the
+collective. However, this time only represents the launch time of the collective and not the actual
+execution time. To reconstruct the execution time more accurately proxy and kernel events are provided.
+
+Kernel events instrumentation leverages counters exposed by the kernel to the host and the proxy progress
+thread. Thus, the proxy progress thread infrastructure is shared between the network and the profiler. If
+the proxy is busy serving network requests, the kernel profiling probes can be delayed, causing loss of
+accuracy. Similarly, if the CPU is under heavy load and the scheduling of the proxy progress thread is
+delayed, a similar loss of accuracy can be encountered. Keep this in mind when using kernel events.
@@ -10,7 +10,7 @@ PLUGIN_SO := libnccl-profiler.so
 default: $(PLUGIN_SO)

 $(PLUGIN_SO): plugin.c event.c print_event.c
-	$(CC) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
+	$(CXX) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^

 clean:
 	rm -f $(PLUGIN_SO)
@@ -33,10 +33,42 @@

 #define MAX_PROXY_OP_STATES              ((NUM_PROXY_OP_SEND_STATES   > NUM_PROXY_OP_RECV_STATES  ) ? NUM_PROXY_OP_SEND_STATES   : NUM_PROXY_OP_RECV_STATES)
 #define MAX_PROXY_STEP_STATES            ((NUM_PROXY_STEP_SEND_STATES > NUM_PROXY_STEP_RECV_STATES) ? NUM_PROXY_STEP_SEND_STATES : NUM_PROXY_STEP_RECV_STATES)
-
-#define MAX_COMM_CLIQUES                 (32 * 8)
+#define MAX_EVENTS_PER_REQ               (8)

 struct proxyOp;
+struct proxyStep;
+
+struct netPlugin {                  // profiler-side record of one network plugin event attached to a proxy step
+  uint8_t type;                     // type of event: set to ncclProfileNetPlugin by startEvent
+  int pluginType;                   // plugin id bits selected by NCCL_PROFILER_NET_TYPE_MASK
+  int pluginVer;                    // plugin id bits selected by NCCL_PROFILER_NET_VER_MASK
+  uint8_t pluginEvent;              // plugin defined event type (ncclProfileQp or ncclProfileSocket)
+  union {
+    struct {                        // copied from ncclProfilerNetIbDescr_v1_t for ncclProfileQp events
+      int device;                   // network device id
+      int qpNum;                    // QP number
+      int opcode;                   // ibv opcode
+      uint64_t wr_id;               // work request id
+      size_t length;                // work request data length
+    } qp;
+    struct {                        // copied from ncclProfilerNetSockDescr_v1_t for ncclProfileSocket events
+      int fd;
+      int op;
+      size_t length;
+    } sock;
+  };
+  double startTs;                   // gettime() - startTime taken when the event starts
+  double stopTs;                    // gettime() - startTime taken when the event stops
+  struct proxyStep* parent;         // NOTE(review): the NetPlugin startEvent branch does not assign this field — confirm
+};
+
+struct kernelCh {                   // profiler-side record of a kernel channel event, stored in its parent collective/p2p
+  uint8_t type;                     // type of event: set to ncclProfileKernelCh by startEvent
+  uint8_t channelId;                // copied from the event descriptor's kernelCh.channelId
+  struct taskEventBase* parent;     // parent event (collective or p2p) taken from the descriptor's parentObj
+  double startTs;                   // gettime() - startTime taken when the event starts
+  double stopTs;                    // gettime() - startTime taken when the event stops
+};

 struct proxyStep {
  uint8_t type;                     // type of event: network transfer
@@ -46,6 +78,8 @@ struct proxyStep {
  double startTs;
  double stopTs;
  struct proxyOp* parent;
+  struct netPlugin net[MAX_EVENTS_PER_REQ];
+  int nNetEvents;
 };

 struct proxyOp {
@@ -101,7 +135,6 @@ struct collective {
  void const* sendBuff;
  void* recvBuff;
  size_t count;
-  size_t trafficBytes;
  int root;
  const char* datatype;
  uint8_t nMaxChannels;
@@ -111,6 +144,7 @@ struct collective {
  struct proxyOp send[MAX_CHANNELS][MAX_OPS];// array of send proxy operation events
  struct proxyOp recv[MAX_CHANNELS][MAX_OPS];// array of recv proxy operation events
  int nProxyOps[MAX_CHANNELS];
+  struct kernelCh kernel[MAX_CHANNELS];
 };

 struct p2p {
@@ -121,6 +155,7 @@ struct p2p {
  const char* datatype;
  int peer;
  struct proxyOp op[MAX_CHANNELS];
+  struct kernelCh kernel[MAX_CHANNELS];
 };

 struct group {
@@ -0,0 +1,34 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NET_IB_V1_H_
+#define NET_IB_V1_H_
+
+#define NCCL_PROFILER_NET_IB_VER 1
+
+enum {                          // event types defined by the IB network plugin (v1)
+  ncclProfileQp = (1 << 0),     // QP event, described by the qp member of ncclProfilerNetIbDescr_v1_t
+};
+
+// The data structure version is encoded in the plugin identifier bitmask and
+// passed to NCCL core through the profiler callback. NCCL copies the plugin
+// identifier in the event descriptor before calling the profiler startEvent
+// function. The profiler should inspect the plugin id to find out the source
+// plugin as well as the version of the event struct
+typedef struct {       // v1 event descriptor passed by the IB network plugin as external event data
+  uint8_t type;        // event type (plugin defined)
+  union {              // event payload; the qp member is read when type is ncclProfileQp
+    struct {
+      int device;      // network device id
+      uint64_t wr_id;  // work request id
+      int opcode;      // ibv opcode
+      int qpNum;       // QP number
+      size_t length;   // work request data length
+    } qp;
+  };
+} ncclProfilerNetIbDescr_v1_t;
+
+#endif
@@ -0,0 +1,32 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NET_SOCKET_V1_H_
+#define NET_SOCKET_V1_H_
+
+#define NCCL_PROFILER_NET_SOCKET_VER 1
+
+enum {                              // event types defined by the socket network plugin (v1)
+  ncclProfileSocket = (1 << 0),     // socket event, described by the sock member of ncclProfilerNetSockDescr_v1_t
+};
+
+// The data structure version is encoded in the plugin identifier bitmask and
+// passed to NCCL core through the profiler callback. NCCL copies the plugin
+// identifier in the event descriptor before calling the profiler startEvent
+// function. The profiler should inspect the plugin id to find out the source
+// plugin as well as the version of the event struct
+typedef struct {       // v1 event descriptor passed by the socket network plugin as external event data
+  uint8_t type;        // event type (plugin defined)
+  union {              // event payload; the sock member is read when type is ncclProfileSocket
+    struct {
+      int fd;          // presumably the socket file descriptor — TODO confirm with the socket plugin
+      int op;          // plugin defined operation code (meaning not shown here)
+      size_t length;   // presumably the data length of the operation — TODO confirm
+    } sock;
+  };
+} ncclProfilerNetSockDescr_v1_t;
+
+#endif
@@ -4,8 +4,8 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#ifndef NCCL_PROFILER_H_
-#define NCCL_PROFILER_H_
+#ifndef PROFILER_H_
+#define PROFILER_H_

 #include <stdint.h>
 #include <stdlib.h>
@@ -13,7 +13,54 @@
 #include "common.h"
 #include "err.h"

+enum {                              // event type bits; one bit per event type
+  ncclProfileGroup     = (1 << 0),  // group event type
+  ncclProfileColl      = (1 << 1),  // host collective call event type
+  ncclProfileP2p       = (1 << 2),  // host point-to-point call event type
+  ncclProfileProxyOp   = (1 << 3),  // proxy operation event type
+  ncclProfileProxyStep = (1 << 4),  // proxy step event type
+  ncclProfileProxyCtrl = (1 << 5),  // proxy control event type
+  ncclProfileKernelCh  = (1 << 6),  // kernel channel event type
+  ncclProfileNetPlugin = (1 << 7),  // network plugin-defined events
+};
+
+typedef enum {                          // event states; aliased right below as the v1, v2 and v3 state types
+  ncclProfilerProxyOpSendPosted,
+  ncclProfilerProxyOpSendRemFifoWait,
+  ncclProfilerProxyOpSendTransmitted,
+  ncclProfilerProxyOpSendDone,
+  ncclProfilerProxyOpRecvPosted,
+  ncclProfilerProxyOpRecvReceived,
+  ncclProfilerProxyOpRecvTransmitted,
+  ncclProfilerProxyOpRecvDone,
+
+  /* Legacy proxy profiler states */
+  ncclProfilerProxyStepSendGPUWait,
+  ncclProfilerProxyStepSendWait,
+  ncclProfilerProxyStepRecvWait,
+  ncclProfilerProxyStepRecvFlushWait,
+  ncclProfilerProxyStepRecvGPUWait,
+
+  /* Legacy proxy control states */
+  ncclProfilerProxyCtrlIdle,
+  ncclProfilerProxyCtrlActive,
+  ncclProfilerProxyCtrlSleep,
+  ncclProfilerProxyCtrlWakeup,          // state marks proxy progress thread waking up
+  ncclProfilerProxyCtrlAppend,          // state marks append of new network work item begin
+  ncclProfilerProxyCtrlAppendEnd,       // state marks append of new network work item end
+} ncclProfilerEventState_t;
+
+typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
+typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
+typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
+
+#include "profiler_v3.h"
 #include "profiler_v2.h"
 #include "profiler_v1.h"
+#include "profiler_net.h"
+
+typedef ncclProfiler_v3_t ncclProfiler_t;
+typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t;
+typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t;

 #endif // end include guard
@@ -0,0 +1,22 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_NET_H_
+#define PROFILER_NET_H_
+
+#define NCCL_PROFILER_NET_VER_BITS  (16)                                 // width of the version field at the bottom of the plugin id
+#define NCCL_PROFILER_NET_VER_MASK  (~0U >> NCCL_PROFILER_NET_VER_BITS)  // selects the version field (low 16 bits when unsigned int is 32 bits wide)
+#define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS)  // selects the plugin type field located above the version field
+
+typedef enum {                                                         // plugin type values, already shifted above the version field
+  NCCL_PROFILER_NET_TYPE_IB   = (1U << NCCL_PROFILER_NET_VER_BITS),    // IB network plugin
+  NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS),    // socket network plugin
+} ncclProfilerNetType;
+
+#include "net_ib_v1.h"
+#include "net_socket_v1.h"
+
+#endif
@@ -4,8 +4,8 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#ifndef NCCL_PROFILER_V1_H_
-#define NCCL_PROFILER_V1_H_
+#ifndef PROFILER_V1_H_
+#define PROFILER_V1_H_

 #include <stdint.h>

@@ -59,8 +59,16 @@ typedef struct {
  };
 } ncclProfilerEventDescr_v1_t;

-typedef ncclProfilerEventState_v2_t ncclProfilerEventState_v1_t;
-typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_v1_t;
+typedef union {           // v1 event attribute updates that can accompany a state transition
+  struct {                // member named after proxyOp events
+    size_t transSize;
+    int steps;
+  } proxyOp;
+
+  struct {                // attributes to update for ncclProfileProxyCtrl
+    int appendedProxyOps; // number of appended proxy ops thus far
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v1_t;

 typedef struct {
  const char* name;
@@ -4,20 +4,11 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#ifndef NCCL_PROFILER_V2_H_
-#define NCCL_PROFILER_V2_H_
+#ifndef PROFILER_V2_H_
+#define PROFILER_V2_H_

 #include <stdint.h>

-enum {
-  ncclProfileGroup     = (1 << 0),  // group event type
-  ncclProfileColl      = (1 << 1),  // host collective call event type
-  ncclProfileP2p       = (1 << 2),  // host point-to-point call event type
-  ncclProfileProxyOp   = (1 << 3),  // proxy operation event type
-  ncclProfileProxyStep = (1 << 4),  // proxy step event type
-  ncclProfileProxyCtrl = (1 << 5),  // proxy control event type
-};
-
 typedef struct {
  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
@@ -65,32 +56,6 @@ typedef struct {
  };
 } ncclProfilerEventDescr_v2_t;

-typedef enum {
-  ncclProfilerProxyOpSendPosted,
-  ncclProfilerProxyOpSendRemFifoWait,
-  ncclProfilerProxyOpSendTransmitted,
-  ncclProfilerProxyOpSendDone,
-  ncclProfilerProxyOpRecvPosted,
-  ncclProfilerProxyOpRecvReceived,
-  ncclProfilerProxyOpRecvTransmitted,
-  ncclProfilerProxyOpRecvDone,
-
-  /* Legacy proxy profiler states */
-  ncclProfilerProxyStepSendGPUWait,
-  ncclProfilerProxyStepSendWait,
-  ncclProfilerProxyStepRecvWait,
-  ncclProfilerProxyStepRecvFlushWait,
-  ncclProfilerProxyStepRecvGPUWait,
-
-  /* Legacy proxy control states */
-  ncclProfilerProxyCtrlIdle,
-  ncclProfilerProxyCtrlActive,
-  ncclProfilerProxyCtrlSleep,
-  ncclProfilerProxyCtrlWakeup,
-  ncclProfilerProxyCtrlAppend,
-  ncclProfilerProxyCtrlAppendEnd,
-} ncclProfilerEventState_v2_t;
-
 typedef union {
  struct {
    size_t transSize;
@@ -138,9 +103,4 @@ typedef struct {
  ncclResult_t (*finalize)(void* context);
 } ncclProfiler_v2_t;

-typedef ncclProfilerEventDescr_v2_t ncclProfilerEventDescr_t;
-typedef ncclProfilerEventState_v2_t ncclProfilerEventState_t;
-typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_t;
-typedef ncclProfiler_v2_t ncclProfiler_t;
-
 #endif
@@ -0,0 +1,119 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V3_H_
+#define PROFILER_V3_H_
+
+#include <stdint.h>
+
+typedef struct {
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
+  int rank;                     // originating rank
+  union {
+    struct {                    // collective events metadata
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;             // data count
+      int root;                 // root rank
+      const char* datatype;     // string containing the name of the datatype
+      uint8_t nMaxChannels;     // max number of channels for this collective
+      uint8_t nWarps;           // number of GPU warps for this collective
+      const char* algo;         // string containing name of the algorithm for this collective
+      const char* proto;
+    } coll;
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+    } p2p;
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    struct {                    // proxyStep events metadata
+      int step;                 // individual step in `ncclProxyOp`
+    } proxyStep;
+
+    struct {                    // kernelCh events metadata
+      uint8_t channelId;        // id of the channel used by the kernel
+    } kernelCh;
+
+    struct {                    // netPlugin events metadata
+      int64_t id;               // net plugin id (used by net and profiler plugins to agree on event definitions)
+      void* data;               // pointer to network plugin defined event
+    } netPlugin;
+  };
+} ncclProfilerEventDescr_v3_t;
+
+typedef union {           // v3 event attribute updates that can accompany a state transition
+  struct {                // member named after proxyOp events
+    size_t transSize;
+    int steps;
+  } proxyOp;
+
+  struct {                // attributes to update for ncclProfileProxyCtrl
+    int appendedProxyOps; // number of appended proxy ops thus far
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v3_t;
+
+typedef struct {
+  const char* name;             // plugin name string
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the event set
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v3_t;
+
+typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t;
+typedef ncclProfilerEventState_v3_t ncclProfilerEventState_t;
+typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t;
+typedef ncclProfiler_v3_t ncclProfiler_t;
+
+#endif
@@ -58,6 +58,7 @@ __hidden double gettime(void) {

 static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
 static pid_t pid;
+static int* eActivationMaskPtr;

 __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) {
  pthread_mutex_lock(&lock);
@@ -65,7 +66,7 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask)
    // first thread initializes event mask, environment and detach pool
    const char* str;
    str = getenv("NCCL_PROFILE_EVENT_MASK");
-    __atomic_store_n(eActivationMask, str ? atoi(str) : defaultEActivationMask, __ATOMIC_RELAXED);
+    __atomic_store_n(eActivationMask, str ? atoi(str) : 0, __ATOMIC_RELAXED);

    str = getenv("NCCL_PROFILE_GROUP_POOL_SIZE");
    groupPoolSize = str ? atoi(str) : defaultGroupPoolSize;
@@ -100,6 +101,9 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask)
  }
  pthread_mutex_unlock(&lock);

+  // store pointer to activation mask globally
+  eActivationMaskPtr = eActivationMask;
+
  // pre-allocate memory for event object pools in dedicated profiler context
  struct context* ctx = (struct context *)calloc(1, sizeof(*ctx));
  ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool));
@@ -199,8 +203,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
        if (base->type == ncclProfileColl) {
          struct collective* c = (struct collective *)base;
          // reset event proxyOps & proxySteps
-          memset(c->send, 0, sizeof(struct proxyOp)*MAX_CHANNELS*MAX_OPS);
-          memset(c->recv, 0, sizeof(struct proxyOp)*MAX_CHANNELS*MAX_OPS);
          memset(c->nProxyOps, 0, sizeof(int)*MAX_CHANNELS);
          // release collective events in the group and return them to the collective pool
          __atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED);
@@ -252,7 +254,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
    event->count = eDescr->coll.count;
    event->root = eDescr->coll.root;
    event->datatype = eDescr->coll.datatype;
-    event->trafficBytes = eDescr->coll.trafficBytes;
    event->nMaxChannels = eDescr->coll.nMaxChannels;
    event->nWarps = eDescr->coll.nWarps;
    event->algo = eDescr->coll.algo;
@@ -373,7 +374,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
      __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
      debugEvent(event, "ProxyOpStart");
    }
- } else if (eDescr->type == ncclProfileProxyStep) {
+  } else if (eDescr->type == ncclProfileProxyStep) {
    // the parent might be null if we run out of events
    struct proxyOp* parent = (struct proxyOp *)eDescr->parentObj;
    if (parent == NULL) return ncclSuccess;
@@ -385,8 +386,77 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
    event->isSend = parent->isSend;
    event->parent = parent;
    event->startTs = gettime() - startTime;
+    event->nNetEvents = 0;
    *eHandle = event;
    debugEvent(event, "ProxyStepStart");
+  } else if (eDescr->type == ncclProfileKernelCh) {
+    struct taskEventBase* eventBase = (struct taskEventBase *)eDescr->parentObj;
+    if (eventBase == NULL) return ncclSuccess;
+    if (eventBase->type == ncclProfileColl) {
+      struct collective* parent = (struct collective *)eDescr->parentObj;
+      struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId];
+      event->type = ncclProfileKernelCh;
+      event->channelId = eDescr->kernelCh.channelId;
+      event->parent = eventBase;
+      event->startTs = gettime() - startTime;
+      *eHandle = event;
+      __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
+      debugEvent(event, "KernelChStart");
+    } else { // ncclProfileP2p
+      struct p2p* parent = (struct p2p *)eDescr->parentObj;
+      struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId];
+      event->type = ncclProfileKernelCh;
+      event->channelId = eDescr->kernelCh.channelId;
+      event->parent = eventBase;
+      event->startTs = gettime() - startTime;
+      *eHandle = event;
+      __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
+      debugEvent(event, "KernelChStart");
+    }
+  } else if (eDescr->type == ncclProfileNetPlugin) {
+    struct proxyStep* parent = (struct proxyStep *)eDescr->parentObj;
+    if (parent == NULL) return ncclSuccess;
+
+    int64_t pluginId = eDescr->netPlugin.id;
+    int64_t type = pluginId & NCCL_PROFILER_NET_TYPE_MASK;
+    int64_t ver = pluginId & NCCL_PROFILER_NET_VER_MASK;
+    if (type == NCCL_PROFILER_NET_TYPE_IB) {
+      if (ver == 1) {
+        ncclProfilerNetIbDescr_v1_t* descr = (ncclProfilerNetIbDescr_v1_t *)eDescr->netPlugin.data;
+        struct netPlugin* event = parent->net + __atomic_fetch_add(&parent->nNetEvents, 1, __ATOMIC_RELAXED);
+        event->type = ncclProfileNetPlugin;
+        event->pluginType = type;
+        event->pluginVer = ver;
+        if (descr->type == ncclProfileQp) {
+          event->pluginEvent = ncclProfileQp;
+          event->qp.device = descr->qp.device;
+          event->qp.wr_id = descr->qp.wr_id;
+          event->qp.opcode = descr->qp.opcode;
+          event->qp.qpNum = descr->qp.qpNum;
+          event->qp.length = descr->qp.length;
+        }
+        event->startTs = gettime() - startTime;
+        *eHandle = event;
+        debugEvent(event, "NetPluginStart");
+      }
+    } else if (type == NCCL_PROFILER_NET_TYPE_SOCK) {
+      if (ver == 1) {
+        ncclProfilerNetSockDescr_v1_t* descr = (ncclProfilerNetSockDescr_v1_t *)eDescr->netPlugin.data;
+        struct netPlugin* event = parent->net + __atomic_fetch_add(&parent->nNetEvents, 1, __ATOMIC_RELAXED);
+        event->type = ncclProfileNetPlugin;
+        event->pluginType = type;
+        event->pluginVer = ver;
+        if (descr->type == ncclProfileSocket) {
+          event->pluginEvent = ncclProfileSocket;
+          event->sock.fd = descr->sock.fd;
+          event->sock.op = descr->sock.op;
+          event->sock.length = descr->sock.length;
+        }
+        event->startTs = gettime() - startTime;
+        *eHandle = event;
+        debugEvent(event, "NetPluginStart");
+      }
+    }
  }
  return ncclSuccess;
 }
@@ -445,6 +515,15 @@ void updateEvent(void* handle) {
    struct proxyCtrl* event = (struct proxyCtrl *)handle;
    event->stopTs = gettime() - startTime;
    debugEvent(event, "ProxyCtrlStop");
+  } else if (type == ncclProfileKernelCh) {
+    struct kernelCh* event = (struct kernelCh *)handle;
+    event->stopTs = gettime() - startTime;
+    updateEvent(event->parent);
+    debugEvent(event, "KernelChStop");
+  } else if (type == ncclProfileNetPlugin) {
+    struct netPlugin* event = (struct netPlugin *)handle;
+    event->stopTs = gettime() - startTime;
+    debugEvent(event, "NetPluginStop");
  }
 }

@@ -506,7 +585,7 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile
  return ncclSuccess;
 }

-ncclProfiler_t ncclProfiler_v2 = {
+ncclProfiler_t ncclProfiler_v3 = {
  "Example-profiler",
  exampleProfilerInit,
  exampleProfilerStartEvent,
@@ -514,3 +593,17 @@ ncclProfiler_t ncclProfiler_v2 = {
  exampleProfilerRecordEventState,
  exampleProfilerFinalize,
 };
+
+// Publish eActivationMask through the activation mask pointer saved by exampleProfilerInit.
+int exampleProfilerStart(int eActivationMask) {
+  if (!__atomic_load_n(&initialized, __ATOMIC_RELAXED)) return ncclSuccess;  // not initialized: nothing to update
+  __atomic_store_n(eActivationMaskPtr, eActivationMask, __ATOMIC_RELAXED);
+  return ncclSuccess;
+}
+
+// Clear the activation mask (store 0) through the pointer saved by exampleProfilerInit.
+int exampleProfilerStop(void) {
+  if (!__atomic_load_n(&initialized, __ATOMIC_RELAXED)) return ncclSuccess;  // not initialized: nothing to update
+  __atomic_store_n(eActivationMaskPtr, 0, __ATOMIC_RELAXED);
+  return ncclSuccess;
+}
@@ -0,0 +1,13 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PLUGIN_H_
+#define PLUGIN_H_
+
+int exampleProfilerStart(int eActivationMask);  // stores eActivationMask into the activation mask if the profiler is initialized
+int exampleProfilerStop(void);                  // stores 0 into the activation mask if the profiler is initialized
+
+#endif
@@ -72,7 +72,7 @@ __hidden void printProxyOpEventTrailer(FILE* fh, struct proxyOp* event) {
 }

 static __thread int proxyStepId;
-__hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) {
+__hidden void printProxyStepEventHeader(FILE* fh, struct proxyStep* event) {
  if (event->isSend) {
    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
            "SendBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step);
@@ -84,8 +84,6 @@ __hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) {
            "SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)]);
    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
            "SendWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)], event->step);
-    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
-            "SendWait", proxyStepId++, getpid(), 1, event->stopTs);
  } else {
    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
            "RecvBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step);
@@ -93,6 +91,14 @@ __hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) {
            "RecvBufferWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)]);
    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
            "RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)], event->step);
+  }
+}
+
+__hidden void printProxyStepEventTrailer(FILE* fh, struct proxyStep* event) {
+  if (event->isSend) {
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+            "SendWait", proxyStepId++, getpid(), 1, event->stopTs);
+  } else {
    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
            "RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)]);
    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
@@ -106,6 +112,19 @@ __hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) {
  }
 }

+static __thread int kernelId;
+__hidden void printKernelChEventHeader(FILE* fh, struct kernelCh* event) {
+  if (event->type == ncclProfileKernelCh) {  // write the "ph": "b" record; other event types print nothing (kernelId is read, not advanced)
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GPU\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d}},\n", "KernelCh", kernelId, getpid(), 1, event->startTs, event->channelId);
+  }
+}
+
+__hidden void printKernelChEventTrailer(FILE* fh, struct kernelCh* event) {
+  if (event->type == ncclProfileKernelCh) {  // write the "ph": "e" record; other event types print nothing (kernelId is read, not advanced)
+    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GPU\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", "KernelCh", kernelId, getpid(), 1, event->stopTs);
+  }
+}
+
 static __thread int proxyCtrlId;
 __hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) {
  const char* str;
@@ -127,6 +146,29 @@ __hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) {
          str, proxyCtrlId++, getpid(), 1, event->stopTs);
 }

+static __thread int ibQpId, sockId;
+__hidden void printNetPluginEvent(FILE* fh, struct netPlugin* event) {
+  if (event->pluginType == NCCL_PROFILER_NET_TYPE_IB) {
+    if (event->pluginVer == 1) {
+      if (event->pluginEvent == ncclProfileQp) {
+        fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_IB\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"device\": %d, \"qp_num\": %d, \"opcode\": %d, \"wr_id\": %lu, \"size\": %lu}},\n",
+                "Qp", ibQpId, getpid(), 1, event->startTs, event->qp.device, event->qp.qpNum, event->qp.opcode, event->qp.wr_id, event->qp.length);
+        fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_IB\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+                "Qp", ibQpId++, getpid(), 1, event->stopTs);
+      }
+    }
+  } else if (event->pluginType == NCCL_PROFILER_NET_TYPE_SOCK) {
+    if (event->pluginVer == 1) {
+      if (event->pluginEvent == ncclProfileSocket) {
+        fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_SOCK\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"sock\": %d, \"op\": %d, \"size\": %lu}},\n",
+                "Sock", sockId, getpid(), 1, event->startTs, event->sock.fd, event->sock.op, event->sock.length);
+        fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_SOCK\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+                "Sock", sockId++, getpid(), 1, event->stopTs);
+      }
+    }
+  }
+}
+
 //#define DEBUG_EVENTS
 void debugEvent(void* eHandle, const char* tag) {
 #ifdef DEBUG_EVENTS
@@ -146,8 +188,10 @@ void debugEvent(void* eHandle, const char* tag) {
    fprintf(fh, "Collective event %p tag = %s {\n", event, tag);
    fprintf(fh, "  refCount          = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED));
    fprintf(fh, "  parent            = %p\n", event->base.parent);
-    for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->send[i].type == ncclProfileProxyOp) fprintf(fh, "  send[%d]           = %p\n", i, &event->send[i]);
-    for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->recv[i].type == ncclProfileProxyOp) fprintf(fh, "  recv[%d]           = %p\n", i, &event->recv[i]);
+    for (int j = 0; j < MAX_OPS; j++) {
+      for (int i = 0; i < MAX_CHANNELS; i++) if (event->send[i][j].type == ncclProfileProxyOp) fprintf(fh, "  send[%d]           = %p\n", i, &event->send[i]);
+      for (int i = 0; i < MAX_CHANNELS; i++) if (event->recv[i][j].type == ncclProfileProxyOp) fprintf(fh, "  recv[%d]           = %p\n", i, &event->recv[i]);
+    }
    fprintf(fh, "  startTs           = %f\n", event->base.startTs);
    fprintf(fh, "  stopTs            = %f\n", event->base.stopTs);
    fprintf(fh, "}\n");
@@ -178,6 +222,20 @@ void debugEvent(void* eHandle, const char* tag) {
    fprintf(fh, "  startTs           = %f\n", event->startTs);
    fprintf(fh, "  stopTs            = %f\n", event->stopTs);
    fprintf(fh, "}\n");
+  } else if (type == ncclProfileKernelCh) {
+    struct kernelCh* event = (struct kernelCh *)eHandle;
+    fprintf(fh, "KernelCh event %p tag = %s {\n", event, tag);
+    fprintf(fh, "  parent            = %p\n", event->parent);
+    fprintf(fh, "  channel           = %d\n", event->channelId);
+  } else if (type == ncclProfileNetPlugin) {
+    struct netPlugin* event = (struct netPlugin *)eHandle;
+    fprintf(fh, "NetPlugin event %p tag = %s {\n", event, tag);
+    fprintf(fh, "  pluginType        = %d\n", event->pluginType);
+    fprintf(fh, "  pluginVer         = %d\n", event->pluginVer);
+    fprintf(fh, "  pluginEvent       = %d\n", event->pluginEvent);
+    fprintf(fh, "  startTs           = %f\n", event->startTs);
+    fprintf(fh, "  stopTs            = %f\n", event->stopTs);
+    fprintf(fh, "}\n");
  }
  fclose(fh);
 #endif
@@ -200,17 +258,21 @@ void printEvent(FILE* fh, void* handle) {
    struct collective* c = (struct collective *)handle;
    printCollEventHeader(fh, c);
    for (int i = 0; i < MAX_CHANNELS; i++) {
+      printKernelChEventHeader(fh, &c->kernel[i]);
      for (int j = 0; j < c->nProxyOps[i]; j++) {
        printEvent(fh, &c->send[i][j]);
        printEvent(fh, &c->recv[i][j]);
      }
+      printKernelChEventTrailer(fh, &c->kernel[i]);
    }
    printCollEventTrailer(fh, c);
  } else if (type == ncclProfileP2p) {
    struct p2p* p = (struct p2p *)handle;
    printP2pEventHeader(fh, p);
    for (int i = 0; i < MAX_CHANNELS; i++) {
+      printKernelChEventHeader(fh, &p->kernel[i]);
      printEvent(fh, &p->op[i]);
+      printKernelChEventTrailer(fh, &p->kernel[i]);
    }
    printP2pEventTrailer(fh, p);
  } else if (type == ncclProfileProxyOp) {
@@ -222,7 +284,11 @@ void printEvent(FILE* fh, void* handle) {
    printProxyOpEventTrailer(fh, p);
  } else if (type == ncclProfileProxyStep) {
    struct proxyStep* p = (struct proxyStep *)handle;
-    printProxyStepEvent(fh, p);
+    printProxyStepEventHeader(fh, p);
+    for (int q = 0; q < p->nNetEvents; q++) {
+      printNetPluginEvent(fh, &p->net[q]);
+    }
+    printProxyStepEventTrailer(fh, p);
  } else if (type == ncclProfileProxyCtrl) {
    struct proxyCtrl* p = (struct proxyCtrl *)handle;
    printProxyCtrlEvent(fh, p);
@@ -16,6 +16,7 @@ WERROR ?= 0
 PROFAPI ?= 1
 NVTX ?= 1
 RDMA_CORE ?= 0
+NET_PROFILER ?= 0

 NVCC = $(CUDA_HOME)/bin/nvcc

@@ -137,3 +138,7 @@ endif
 ifneq ($(RDMA_CORE), 0)
 CXXFLAGS += -DNCCL_BUILD_RDMA_CORE=1
 endif
+
+ifneq ($(NET_PROFILER), 0)
+CXXFLAGS += -DNCCL_ENABLE_NET_PROFILING=1
+endif
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
-NCCL_MINOR   := 25
-NCCL_PATCH   := 1
+NCCL_MINOR   := 26
+NCCL_PATCH   := 6
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
@@ -10,11 +10,15 @@ include ../makefiles/version.mk
 INCEXPORTS  := nccl.h
 LIBSRCFILES := \
 	bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
-	init.cc init_nvtx.cc net.cc proxy.cc transport.cc mnnvl.cc \
+	init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc \
 	$(wildcard graph/*.cc) \
 	$(wildcard misc/*.cc) \
 	$(wildcard transport/*.cc) \
 	$(wildcard register/*.cc) \
+	$(wildcard plugin/*.cc) \
+	$(wildcard plugin/net/*.cc) \
+	$(wildcard plugin/tuner/*.cc) \
+	$(wildcard plugin/profiler/*.cc) \
 	$(filter-out ras/client.cc,$(wildcard ras/*.cc))
 BINSRCFILES := ras/client.cc

@@ -49,6 +53,7 @@ LIBOBJ     := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o)
 BINOBJ     := $(BINSRCFILES:%.cc=$(OBJDIR)/%.o)
 DEPFILES   := $(LIBOBJ:%.o=%.d) $(BINOBJ:%.o=%.d)
 LDFLAGS    += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl
+INCPLUGIN  := include/plugin

 DEVMANIFEST := $(BUILDDIR)/obj/device/manifest

@@ -126,8 +131,8 @@ $(PKGDIR)/%.pc : %.pc
 $(OBJDIR)/%.o : %.cc $(INCTARGETS)
 	@printf "Compiling  %-35s > %s\n" $< $@
 	mkdir -p `dirname $@`
-	$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@
-	@$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp)
+	$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -c $< -o $@
+	@$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -M $< > $(@:%.o=%.d.tmp)
 	@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
 	@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
                sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
@@ -154,7 +154,7 @@ static ncclResult_t netIsend(ncclNet_t* net, void* sendComm, void* data, int siz
                             int* done) {
  if (*done) return ncclSuccess;
  if (!*sendReq) {
-    NCCLCHECK(net->isend(sendComm, data, (size_t)size, tag, dataHandle, sendReq));
+    NCCLCHECK(net->isend(sendComm, data, (size_t)size, tag, dataHandle, NULL, sendReq));
  }
  if (*sendReq) {
    NCCLCHECK(net->test(*sendReq, done, NULL));
@@ -168,8 +168,8 @@ static ncclResult_t netIrecv(ncclNet_t* net, void* recvComm, void* data, int siz
                             int* done) {
  if (*done) return ncclSuccess;
  if (!*recvReq) {
-    size_t size64 = size; 
-    NCCLCHECK(net->irecv(recvComm, 1, &data, &size64, &tag, &dataHandle, recvReq));
+    size_t size64 = size;
+    NCCLCHECK(net->irecv(recvComm, 1, &data, &size64, &tag, &dataHandle, NULL, recvReq));
  }
  if (*recvReq) {
    NCCLCHECK(net->test(*recvReq, done, NULL));
@@ -485,7 +485,7 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
  if (devOOB < 0) {
    pthread_mutex_lock(&bootstrapNetLock);
    if (devOOB < 0) {
-      char* userIfEnv = getenv("NCCL_OOB_NET_IFNAME");
+      const char* userIfEnv = ncclGetEnv("NCCL_OOB_NET_IFNAME");
      if (userIfEnv && strlen(userIfEnv) > 0) {
        INFO(NCCL_BOOTSTRAP | NCCL_ENV, "NCCL_OOB_NET_IFNAME set to %s", userIfEnv);
        bool searchNot = userIfEnv && userIfEnv[0] == '^';
@@ -541,7 +541,7 @@ static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* lis
  do {
    NCCLCHECK(checkAbort(abortFlag, &abortCounter));
    if (!*sendComm)
-      NCCLCHECK(net->connect(listen->net.dev, peerHandle, sendComm, sendDevHandle));
+      NCCLCHECK(net->connect(listen->net.dev, NULL, peerHandle, sendComm, sendDevHandle));
    if (!*recvComm)
      NCCLCHECK(net->accept(listen->net.comm, recvComm, recvDevHandle));
  } while (!*sendComm || !*recvComm);
@@ -741,6 +741,8 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
    rasRanks[rank].pid = getpid();
    rasRanks[rank].cudaDev = comm->cudaDev;
    rasRanks[rank].nvmlDev = comm->nvmlDev;
+    rasRanks[rank].hostHash = getHostHash();
+    rasRanks[rank].pidHash = getPidHash();
    if (ncclRasCommInit(comm, rasRanks+rank) != ncclSuccess) {
      INFO(NCCL_INIT|NCCL_RAS, "Continuing in spite of a RAS initialization error");
      // We should still participate in the ringAllInfo below as the peers will be waiting for us.
@@ -972,7 +974,7 @@ ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int s
  NCCLCHECK(socketAccept(commState, peer, tag, &sock));
  TRACE(NCCL_BOOTSTRAP, "Receiving tag=%d peer=%d size=%d", tag, peer, size);
  NCCLCHECKGOTO(socketRecv(&sock, ((char*)data), size), ret, fail);
-  NCCLCHECK(ncclSocketClose(&sock));
+  NCCLCHECKGOTO(ncclSocketClose(&sock, /*wait*/true), ret, fail);
  return ret;
 fail:
  (void)ncclSocketClose(&sock);
@@ -1067,7 +1069,7 @@ static ncclResult_t bootstrapP2PBarrier(void* commState, int* ranks, int rank, i
   * Based on the dissemination algorithm by Debra Hensgen, Raphael Finkel, and Udi Manbet,
   * "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988"
   */
-  int data[1];
+  int data[1] = {0};
  for (int mask = 1; mask < nranks; mask <<= 1) {
    int src = (rank - mask + nranks) % nranks;
    int dst = (rank + mask) % nranks;
@@ -20,8 +20,8 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
  channel->workFifoProduced = 0;

  struct ncclSharedResources* sharedRes = comm->sharedRes;
-
-  NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream));
+  cudaStream_t deviceStream;
+  NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream));

  if (channel->peers == NULL) {
    // The extra on nRanks+1 is for collnet root (i.e. network)
@@ -39,33 +39,33 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {

  if (channel->devPeers == NULL) {
    if (sharedRes->devPeers[channelId] == NULL) {
-      NCCLCHECK(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, sharedRes->deviceStream.cudaStream));
+      NCCLCHECK(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, deviceStream));
    }
    /* channel->devPeers is not shared, so just free it when calling commFree() */
-    NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, sharedRes->deviceStream.cudaStream));
+    NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, deviceStream));
    ncclCommPushCudaFree(comm, channel->devPeers);
    NCCLCHECK(ncclCalloc(&channel->devPeersHostPtr, nPeers));
    for (int r = 0; r < nRanks; r++) {
      uintptr_t addr = (uintptr_t)(comm->sharedRes->devPeers[channelId] + comm->topParentRanks[r]);
-      NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
+      NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, deviceStream));
      channel->devPeersHostPtr[r] = (struct ncclDevChannelPeer*)addr;
    }
  }

  channel->ring.userRanks = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
-  NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, sharedRes->deviceStream.cudaStream));
+  NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, deviceStream));
  ncclCommPushCudaFree(comm, channel->devRingUserRanks);

  /* guarantee addr has been copied into channel->devPeers */
+  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false));
  NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream));
-  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream));
-
  return ncclSuccess;
 }

 ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share) {
  struct ncclChannel* channel = &comm->channels[channelId];
  struct ncclSharedResources* sharedRes = comm->sharedRes;
+  cudaStream_t deviceStream;

  if (channel->nvlsPeers != NULL)
    return ncclSuccess;
@@ -73,7 +73,7 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo
  if (channel->id == -1)
    NCCLCHECK(initChannel(comm, channelId));

-  NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream));
+  NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream));

  int nvlsRanks = comm->localRanks;

@@ -84,24 +84,24 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo
      int tr = comm->topParentLocalRanks[r];
      uintptr_t addr = (uintptr_t)(parent->channels[channelId].nvlsDevPeers + tr);
      channel->peers[comm->nRanks + 1 + r] = parent->channels[channelId].nvlsPeers + tr;
-      NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
+      NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, deviceStream));
      channel->devPeersHostPtr[comm->nRanks + 1 + r] = (struct ncclDevChannelPeer*)addr;
      ncclAtomicRefCountIncrement(&parent->channels[channelId].nvlsPeers[tr].refCount);
    }
  } else {
    NCCLCHECK(ncclCalloc(&channel->nvlsPeers, nvlsRanks));
-    NCCLCHECK(ncclCudaCallocAsync(&channel->nvlsDevPeers, nvlsRanks, sharedRes->deviceStream.cudaStream));
+    NCCLCHECK(ncclCudaCallocAsync(&channel->nvlsDevPeers, nvlsRanks, deviceStream));
    for (int r = 0; r < nvlsRanks; ++r) {
      uintptr_t addr = (uintptr_t)(channel->nvlsDevPeers + r);
      channel->peers[comm->nRanks + 1 + r] = channel->nvlsPeers + r;
-      NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
+      NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, deviceStream));
      channel->devPeersHostPtr[comm->nRanks + 1 + r] = (struct ncclDevChannelPeer*)addr;
      ncclAtomicRefCountIncrement(&channel->nvlsPeers[r].refCount);
    }
  }

+  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false));
  NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream));
-  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream));

  return ncclSuccess;
 }
@@ -110,6 +110,7 @@ ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncc
  struct ncclChannel* channel = &comm->channels[channelId];
  struct ncclSharedResources* sharedRes = comm->sharedRes;
  uintptr_t addr;
+  cudaStream_t deviceStream;

  if (channel->collnetPeers != NULL)
    return ncclSuccess;
@@ -117,28 +118,28 @@ ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncc
  if (channel->id == -1)
    NCCLCHECK(initChannel(comm, channelId));

-  NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream));
+  NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream));

  if (share) {
    channel->collnetPeers = parent->channels[channelId].collnetPeers;
    channel->collnetDevPeers = parent->channels[channelId].collnetDevPeers;
    addr = (uintptr_t)parent->channels[channelId].collnetDevPeers;
    channel->peers[comm->nRanks] = parent->channels[channelId].collnetPeers;
-    NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
+    NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, deviceStream));
    channel->devPeersHostPtr[comm->nRanks] = (struct ncclDevChannelPeer*)addr;
    ncclAtomicRefCountIncrement(&parent->channels[channelId].collnetPeers->refCount);
  } else {
    NCCLCHECK(ncclCalloc(&channel->collnetPeers, 1));
-    NCCLCHECK(ncclCudaCallocAsync(&channel->collnetDevPeers, 1, sharedRes->deviceStream.cudaStream));
+    NCCLCHECK(ncclCudaCallocAsync(&channel->collnetDevPeers, 1, deviceStream));
    addr = (uintptr_t)channel->collnetDevPeers;
    channel->peers[comm->nRanks] = channel->collnetPeers;
-    NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
+    NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, deviceStream));
    channel->devPeersHostPtr[comm->nRanks] = (struct ncclDevChannelPeer*)addr;
    ncclAtomicRefCountIncrement(&channel->collnetPeers->refCount);
  }

+  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false));
  NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream));
-  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream));

  return ncclSuccess;
 }
@@ -6,6 +6,7 @@

 #include "core.h"
 #include "nccl_net.h"
+#include <ctime>
 #include <stdlib.h>
 #include <stdarg.h>
 #include <stdio.h>
@@ -16,6 +17,11 @@
 #include "param.h"

 int ncclDebugLevel = -1;
+static uint32_t ncclDebugTimestampLevels = 0;     // bitmap of levels that have timestamps turned on
+static char ncclDebugTimestampFormat[256];        // with space for subseconds
+static int ncclDebugTimestampSubsecondsStart;     // index where the subseconds starts
+static uint64_t ncclDebugTimestampMaxSubseconds;  // Max number of subseconds plus 1, used in duration ratio
+static int ncclDebugTimestampSubsecondDigits;     // Number of digits to display
 static int pid = -1;
 static char hostname[1024];
 thread_local int ncclDebugNoWarn = 0;
@@ -114,6 +120,88 @@ static void ncclDebugInit() {
      ncclWarnSetDebugInfo = value;
  }

+  // Determine which debug levels will have timestamps.
+  const char* timestamps = ncclGetEnv("NCCL_DEBUG_TIMESTAMP_LEVELS");
+  if (timestamps == nullptr) {
+    ncclDebugTimestampLevels = (1<<NCCL_LOG_WARN);
+  } else {
+    int invert = 0;
+    if (timestamps[0] == '^') { invert = 1; ++timestamps; }
+    ncclDebugTimestampLevels = invert ? ~0U : 0U;
+    char *timestampsDup = strdup(timestamps);
+    char *level = strtok(timestampsDup, ",");
+    while (level != NULL) {
+      uint32_t mask = 0;
+      if (strcasecmp(level, "ALL") == 0) {
+        mask = ~0U;
+      } else if (strcasecmp(level, "VERSION") == 0) {
+        mask = (1<<NCCL_LOG_VERSION);
+      } else if (strcasecmp(level, "WARN") == 0) {
+        mask = (1<<NCCL_LOG_WARN);
+      } else if (strcasecmp(level, "INFO") == 0) {
+        mask = (1<<NCCL_LOG_INFO);
+      } else if (strcasecmp(level, "ABORT") == 0) {
+        mask = (1<<NCCL_LOG_ABORT);
+      } else if (strcasecmp(level, "TRACE") == 0) {
+        mask = (1<<NCCL_LOG_TRACE);
+      } else {
+        // Unrecognized level name: silently ignore it.
+      }
+      if (mask) {
+        if (invert) ncclDebugTimestampLevels &= ~mask;
+        else ncclDebugTimestampLevels |= mask;
+      }
+      level = strtok(NULL, ",");
+    }
+    free(timestampsDup);
+  }
+
+  // Store a copy of the timestamp format with space for the subseconds, if used.
+  const char* tsFormat = ncclGetEnv("NCCL_DEBUG_TIMESTAMP_FORMAT");
+  if (tsFormat == nullptr) tsFormat = "[%F %T] ";
+  ncclDebugTimestampSubsecondsStart = -1;
+  // Find where the subseconds are in the format.
+  for (int i=0; tsFormat[i] != '\0'; ++i) {
+    if (tsFormat[i]=='%' && tsFormat[i+1]=='%') { // Next two chars are "%"
+      // Skip the next character, too, and restart checking after that.
+      ++i;
+      continue;
+    }
+    if (tsFormat[i]=='%' &&                               // Found a percentage
+        ('1' <= tsFormat[i+1] && tsFormat[i+1] <= '9') && // Next char is a digit between 1 and 9 inclusive
+        tsFormat[i+2]=='f'                                // Two characters later is an "f"
+        ) {
+      constexpr int replaceLen = sizeof("%Xf") - 1;
+      ncclDebugTimestampSubsecondDigits = tsFormat[i+1] - '0';
+      if (ncclDebugTimestampSubsecondDigits + strlen(tsFormat) - replaceLen > sizeof(ncclDebugTimestampFormat) - 1) {
+        // Won't fit; fall back on the default.
+        break;
+      }
+      ncclDebugTimestampSubsecondsStart = i;
+      ncclDebugTimestampMaxSubseconds = 1;
+
+      memcpy(ncclDebugTimestampFormat, tsFormat, i);
+      for (int j=0; j<ncclDebugTimestampSubsecondDigits; ++j) {
+        ncclDebugTimestampFormat[i+j] = ' ';
+        ncclDebugTimestampMaxSubseconds *= 10;
+      }
+      strcpy(ncclDebugTimestampFormat+i+ncclDebugTimestampSubsecondDigits, tsFormat+i+replaceLen);
+      break;
+    }
+  }
+  if (ncclDebugTimestampSubsecondsStart == -1) {
+    if (strlen(tsFormat) < sizeof(ncclDebugTimestampFormat)) {
+      strcpy(ncclDebugTimestampFormat, tsFormat);
+    } else {
+      strcpy(ncclDebugTimestampFormat, "[%F %T] ");
+    }
+  }
+
+  // Replace underscores with spaces, because it is hard to put spaces in command line parameters.
+  for (int i=0; ncclDebugTimestampFormat[i] != '\0'; ++i) {
+    if (ncclDebugTimestampFormat[i]=='_') ncclDebugTimestampFormat[i] = ' ';
+  }
+
  // Cache pid and hostname
  getHostName(hostname, 1024, '.');
  pid = getpid();
@@ -194,39 +282,86 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
    tid = syscall(SYS_gettid);
  }

+  char buffer[1024];
+  size_t len = 0;
+
+  // WARNs come with an extra newline at the beginning.
+  if (level == NCCL_LOG_WARN) {
+    buffer[len++] = '\n';
+  };
+
+  // Add the timestamp to the buffer if timestamps are turned on for this level.
+  if (ncclDebugTimestampLevels & (1<<level)) {
+    if (ncclDebugTimestampFormat[0] != '\0') {
+      struct timespec ts;
+      clock_gettime(CLOCK_REALTIME, &ts);   // clock_gettime failure should never happen
+      std::tm nowTm;
+      localtime_r(&ts.tv_sec, &nowTm);
+
+      // Add the subseconds portion if it is part of the format.
+      char localTimestampFormat[sizeof(ncclDebugTimestampFormat)];
+      const char* pformat = ncclDebugTimestampFormat;
+      if (ncclDebugTimestampSubsecondsStart != -1) {
+        pformat = localTimestampFormat;   // Need to use the local version which has subseconds
+        memcpy(localTimestampFormat, ncclDebugTimestampFormat, ncclDebugTimestampSubsecondsStart);
+        snprintf(localTimestampFormat + ncclDebugTimestampSubsecondsStart,
+                 ncclDebugTimestampSubsecondDigits+1,
+                 "%0*ld", ncclDebugTimestampSubsecondDigits,
+                 ts.tv_nsec / (1000000000UL/ncclDebugTimestampMaxSubseconds));
+        strcpy(    localTimestampFormat+ncclDebugTimestampSubsecondsStart+ncclDebugTimestampSubsecondDigits,
+               ncclDebugTimestampFormat+ncclDebugTimestampSubsecondsStart+ncclDebugTimestampSubsecondDigits);
+      }
+
+      // Format the time. If it runs out of space, fall back on a simpler format.
+      int adv = std::strftime(buffer+len, sizeof(buffer)-len, pformat, &nowTm);
+      if (adv==0 && ncclDebugTimestampFormat[0] != '\0') {
+        // Ran out of space. Fall back on the default. This should never fail.
+        adv = std::strftime(buffer+len, sizeof(buffer)-len, "[%F %T] ", &nowTm);
+      }
+      len += adv;
+    }
+  }
+  len = std::min(len, sizeof(buffer)-1);  // prevent overflows
+
+  // Add hostname, pid and tid portion of the log line.
+  if (level != NCCL_LOG_VERSION) {
+    len += snprintf(buffer+len, sizeof(buffer)-len, "%s:%d:%d ", hostname, pid, tid);
+    len = std::min(len, sizeof(buffer)-1);  // prevent overflows
+  }
+
  int cudaDev = 0;
  if (!(level == NCCL_LOG_TRACE && flags == NCCL_CALL)) {
    (void)cudaGetDevice(&cudaDev);
  }

-  char buffer[4096];
-  size_t len = 0;
+  // Add level specific formatting.
  if (level == NCCL_LOG_WARN) {
-    len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d NCCL WARN ",
-                   hostname, pid, tid, cudaDev, filefunc, line);
+    len += snprintf(buffer+len, sizeof(buffer)-len, "[%d] %s:%d NCCL WARN ", cudaDev, filefunc, line);
    if (ncclWarnSetDebugInfo) ncclDebugLevel = NCCL_LOG_INFO;
  } else if (level == NCCL_LOG_INFO) {
-    len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev);
+    len += snprintf(buffer+len, sizeof(buffer)-len, "[%d] NCCL INFO ", cudaDev);
  } else if (level == NCCL_LOG_TRACE && flags == NCCL_CALL) {
-    len = snprintf(buffer, sizeof(buffer), "%s:%d:%d NCCL CALL ", hostname, pid, tid);
+    len += snprintf(buffer+len, sizeof(buffer)-len, "NCCL CALL ");
  } else if (level == NCCL_LOG_TRACE) {
    auto delta = std::chrono::steady_clock::now() - ncclEpoch;
    double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
-    len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] %f %s:%d NCCL TRACE ",
-                   hostname, pid, tid, cudaDev, timestamp, filefunc, line);
+    len += snprintf(buffer+len, sizeof(buffer)-len, "[%d] %f %s:%d NCCL TRACE ", cudaDev, timestamp, filefunc, line);
  }
+  len = std::min(len, sizeof(buffer)-1);  // prevent overflows

+  // Add the message as given by the call site.
  va_list vargs;
  va_start(vargs, fmt);
  len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
  va_end(vargs);
  // vsnprintf may return len >= sizeof(buffer) in the case of a truncated output.
-  // Rewind len so that we can replace the final \0 by \n
-  if (len >= sizeof(buffer)) len = sizeof(buffer)-1;
-  if (len) {
-    buffer[len++] = '\n';
-    fwrite(buffer, 1, len, ncclDebugFile);
-  }
+  // Rewind len so that we can replace the final \0 by "\n"
+  len = std::min(len, sizeof(buffer)-1);  // prevent overflows
+
+  // Add a newline and write it to the debug file. No terminating null is
+  // necessary since we write bytes instead of the string.
+  buffer[len++] = '\n';
+  fwrite(buffer, 1, len, ncclDebugFile);
 }

 NCCL_API(void, ncclResetDebugInit);
@@ -141,7 +141,7 @@ namespace {
        }
 #endif
        // Final wait/copy.
-        prims.directRecv(offset, offset, nelem);
+        prims.directRecv(offset, nelem);
  
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT)
        if (tid == 0) {
@@ -220,25 +220,63 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128
 template<typename T, typename RedOp>
 struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE> {
  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
+#if __CUDA_ARCH__ >= 600
    using Proto = ProtoSimple<1, 1>;
    const int nranks = ncclShmem.comm.nRanks;
    const int rank = ncclShmem.comm.rank;
    size_t count, channelOffset, channelCount, chunkCount;
    ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &channelOffset, &channelCount, &chunkCount);

-    T *inputBuf = (T*)work->sendbuff;
-    T *outputBuf = (T*)work->recvbuff;
-    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
-      (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, nullptr, 0, primsModePatAg);
+    static constexpr int nworkers = NCCL_PAT_NWORKERS;
+    struct ncclPatShmem* shmem = (struct ncclPatShmem*)ncclScratchForWarp(0);
+    uint64_t pollCount = 0;
+    __syncthreads(); // Don't start using shared mem until everyone arrives
+    for (int i=tid; i<NCCL_SHMEM_PAT_STEPS; i+=nthreads) shmem->patSteps[i].flags = 0;
+    if (tid == 0) shmem->localAccSize = 0;
+    if (tid == nworkers) shmem->parallelFactor = 0;
+    __syncthreads();

-    PatAGAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
-    int last = 0;
-    while (!last) {
-      int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem;
-      size_t inpIx, outIx;
-      patAlgo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend, last);
-      prims.patCopy(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend);
+    if (tid == nworkers) { // Algo computation thread
+      PatAGAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, NCCL_PAT_NWORKERS/WARP_SIZE, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
+      int parallelFactor = shmem->parallelFactor = patAlgo.getParallelFactor();
+      int step = 0;
+      while (1) {
+        struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS);
+        cuda::atomic_ref<int, cuda::thread_scope_block> poll(ps->flags);
+        while (poll.load(cuda::memory_order_acquire) != 0) pollCount++; // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS'
+        patAlgo.getNextOp(ps);
+        int last = ps->last;
+        step++;
+        if (last == 2) break;
+      }
+    } else if (tid < nworkers) { // Worker threads
+      T *inputBuf = (T*)work->sendbuff;
+      T *outputBuf = (T*)work->recvbuff;
+      int parallelFactor = 0;
+      volatile int* pfPtr = &shmem->parallelFactor;
+      while (parallelFactor == 0) parallelFactor = *pfPtr;
+
+      int groupSize = nworkers/(WARP_SIZE*parallelFactor) * WARP_SIZE;
+      int group = tid / groupSize;
+      int nGroups = nworkers / groupSize;
+      int tidInGroup = tid - group*groupSize;
+      // We don't use recvPeers/sendPeers so let's pass shmem structs instead
+      Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
+        (tidInGroup, groupSize, (int*)shmem->recvDims, (int*)shmem->sendDims, inputBuf, outputBuf, work->redOpArg, group, 0, 0, nullptr, nullptr, 0, primsModePatAg);
+
+      int step = group;
+      while(1) {
+        struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS);
+        cuda::atomic_ref<int, cuda::thread_scope_block> poll(ps->flags);
+        while (poll.load(cuda::memory_order_acquire) == 0) pollCount++; // Wait for compute thread
+        int last = ps->last;
+        prims.patCopy(ps, shmem);
+        if (tidInGroup == 0) poll.store(0, cuda::memory_order_release); // Return element to compute thread
+        if (last) break;
+        step += nGroups;
+      }
    }
+#endif
  }
 };

@@ -190,7 +190,7 @@ namespace {
      offset = gridOffset + elemOffset + chunkOffset;
      nelem = (int)min(chunkCount, remCount - chunkOffset);

-      prims.directRecv(offset, offset, nelem);
+      prims.directRecv(offset, nelem);

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT)
      if (tid == 0) {
@@ -329,7 +329,7 @@ namespace {
        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
          offset = gridOffset + elemOffset;
          nelem = min(chunkCount, channelCount - elemOffset);
-          prims.directRecv(offset, offset, nelem);
+          prims.directRecv(offset, nelem);
        }
      }
      else {
@@ -528,7 +528,7 @@ namespace {
        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
          offset = gridOffset + elemOffset;
          nelem = min(chunkCount, channelCount - elemOffset);
-          prims.directRecv(offset, offset, nelem);
+          prims.directRecv(offset, nelem);
        }
      }
      else {
@@ -1055,7 +1055,7 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PR
            for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
              ssize_t offset = gridOffset + bid * int(chunkSize);
              int nelem = min(chunkSize, size - offset);
-              prims.directRecv(offset, offset, nelem, /*postOp*/true);
+              prims.directRecv(offset, nelem, /*postOp*/true);
            }
          }
        } else {
@@ -1082,7 +1082,7 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PR
          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
            ssize_t offset = gridOffset + bid*int(chunkSize);
            int nelem = min(chunkSize, size-offset);
-            prims.directRecv(offset, offset, nelem);
+            prims.directRecv(offset, nelem);
          }
        } else {
          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
@@ -83,7 +83,7 @@ namespace {
            prims.directCopySend(offset, offset, nelem);
          }
        } else if (nextRank == root) {
-          prims.directRecv(offset, offset, nelem);
+          prims.directRecv(offset, nelem);
        } else {
          prims.directRecvCopyDirectSend(offset, offset, nelem);
        }
@@ -144,6 +144,8 @@ struct ncclShmemData {
  int nWorks;
  int workSize;
  uint32_t workConsumed;
+  uint64_t workCounter;
+  bool profilerEnabled;
  struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
  uint64_t redOpArgs[NCCL_MAX_NVLS_ARITY+1];

@@ -236,24 +238,6 @@ __device__ inline bool barrier_red_or(bool vote, int name, int nThreads) {
      : "=r"(ans) : "r"((int)vote), "r"(name), "r"(nThreads) : "memory");
  return bool(ans);
 }
-__device__ inline bool barrier_red_or_aligned(bool vote, int name) {
-  int ans;
-  asm volatile("{ .reg .pred p;"
-      "  setp.ne.s32 p, %1, 0;"
-      "  barrier.red.or.pred.aligned p, %2, p; "
-      "  selp.s32 %0, 1, 0, p; }"
-      : "=r"(ans) : "r"((int)vote), "r"(name) : "memory");
-  return bool(ans);
-}
-__device__ inline bool barrier_red_or_aligned(bool vote, int name, int nThreads) {
-  int ans;
-  asm("{ .reg .pred p;"
-      "  setp.ne.s32 p, %1, 0;"
-      "  barrier.red.or.pred.aligned p, %2, %3, p; "
-      "  selp.s32 %0, 1, 0, p; }"
-      : "=r"(ans) : "r"((int)vote), "r"(name), "r"(nThreads) : "memory");
-  return bool(ans);
-}

 #ifdef ENABLE_PROFILING
 #define __insert_timestamp(line_num) do { \
@@ -455,6 +439,48 @@ struct RunWorkBatch {
  }
 };

+#define START 0 // profiler() action: add nWorks to the channel workCounter; publish workStarted when the batch is profiled
+#define STOP  1 // profiler() action: publish workCompleted when the batch is profiled
+#define FINI  2 // profiler() action: write workCounter back to the channel struct; publish workCompleted when the batch is profiled
+
+__device__ __forceinline__ bool profilerEnabled(void) { // Returns true if at least one work item of the batch described by ncclShmem has profilerEnabled set.
+  // Check if any of the workItems in the batch is profiled. If so, there is an equivalent
+  // profiler ProxyOp waiting for the counter update in the host thread. If this check was
+  // done only for the first workItem the profiler counter for other workItems in the batch
+  // could never be updated, leaving the host thread spinning forever for the counter update
+  // and causing a hang.
+  bool enabled = false;
+  for (int i = 0; i < ncclShmem.nWorks && !enabled; i++) { // scan the nWorks items, stopping at the first profiled one
+    if (ncclShmem.workType == ncclDevWorkTypeP2p) // P2P batch: workStorage is read as an array of ncclDevWorkP2p
+      enabled = ((struct ncclDevWorkP2p*)ncclShmem.workStorage)[i].profilerEnabled;
+    else // any other work type: workStorage is read as an array of ncclDevWorkColl
+      enabled = ((struct ncclDevWorkColl*)ncclShmem.workStorage)[i].profilerEnabled;
+  }
+  return enabled; // false when nWorks is 0 or no item is profiled
+}
+
+__device__ __forceinline__ void profiler(int action) { // Updates the per-channel work counters for the given action (START, STOP or FINI); only the thread with threadIdx.x == 0 does any work.
+  if (action == START) {
+    if (threadIdx.x == 0) {
+      // increment workCounter regardless of the profiler being active or not
+      ncclShmem.channel.workCounter += ncclShmem.nWorks;
+      if(!profilerEnabled()) return; // nothing to publish when no work item in the batch is profiled
+      ncclShmem.comm.workStarted[ncclShmem.channelId] = ncclShmem.channel.workCounter; // publish the updated counter in this channel's workStarted slot
+    }
+  } else if (action == STOP) {
+    if (threadIdx.x == 0 && profilerEnabled()) {
+      ncclShmem.comm.workCompleted[ncclShmem.channelId] = ncclShmem.channel.workCounter; // publish the counter in this channel's workCompleted slot
+    }
+  } else { // FINI (any action value other than START or STOP also takes this branch)
+    if (threadIdx.x == 0) {
+      // store the workCounter back to vidmem regardless of the profiler being active or not
+      ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter;
+      if (!profilerEnabled()) return; // counter is saved above either way; the completion update is for profiled batches only
+      ncclShmem.comm.workCompleted[ncclShmem.channelId] = ncclShmem.channel.workCounter; // same completion update as the STOP action
+    }
+  }
+}
+
 template<int SpecializedFnId, typename SpecializedRunWorkBatch, bool COLLTRACE, int COLL_UNROLL>
 __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* args) {
  const int tid = threadIdx.x;
@@ -517,8 +543,13 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
    break;
  }
  __syncthreads(); // publish ncclShmem.{args, channelId}
+  /* reset the abort flag and load this channel's workCounter into ncclShmem */
+  if (tid == 0) {
+    ncclShmem.aborted = 0;
+    ncclShmem.channel.workCounter = ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter;
+  }

-  // Use first 2 warps to load comm and channel, and reamaining load work batch.
+  // Use first 2 warps to load comm and channel, and remaining load work batch.
  switch (tid/WARP_SIZE) {
  case 0:
    { void* dst = &ncclShmem.comm;
@@ -566,9 +597,9 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
    ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed;
  }

-  while (true) {
+  while (ncclShmem.aborted == 0) {
    if (tid == 0) __insert_timestamp(__LINE__);
-
+    profiler(START);
    if (0 <= SpecializedFnId && ncclShmem.funcId == (unsigned)SpecializedFnId) {
      SpecializedRunWorkBatch().run();
    } else {
@@ -586,21 +617,14 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
      default:
        break;
    }
+    profiler(STOP);
    loadWorkBatchToShmem(tid%WARP_SIZE, tn, args, batchIx);
    __syncthreads();

-    // Check whether the last operation was aborted and make sure all threads exit
-    bool aborted = false;
-    if (tid == 0) aborted = *ncclShmem.comm.abortFlag;
-    aborted = __any(aborted); // publish ncclShmem.work
    if (tid == 0 && ncclShmem.args.workStorageType == ncclDevWorkStorageTypeFifo) {
-      // ncclShmem.workConsumed written by loadWorkBatchToShmem before barrier_red_or()
+      // ncclShmem.workConsumed written by loadWorkBatchToShmem before __syncthreads()
      ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed;
    }
-    if (aborted) {
-      if(COLLTRACE && tid%WARP_SIZE == 0) traceAbort();
-      break;
-    }
    if (COLLTRACE && tid%WARP_SIZE == 0) traceKernelLaunch(ncclCollTraceCollLaunchType, batchIx);
  }
  if (COLLTRACE && tid%WARP_SIZE == 0) traceKernelEnd(ncclCollTraceKernelEndType);
@@ -13,7 +13,7 @@
 #include "common_kernel.h"
 #include "common.h"

-#define NCCL_SPINS_BEFORE_CHECK_ABORT 1000000
+#define NCCL_SPINS_BEFORE_CHECK_ABORT 10000

 #define barrier_by_group_common(__THREAD_FENCE) do { \
  if (nthreads == NCCL_MAX_NTHREADS) { \
@@ -154,7 +154,7 @@ struct PrimitivesWithoutDirect {
  __device__ void directSendFromOutput(intptr_t outIx, int eltN) {
    static_cast<RealPrimitives*>(this)->sendFromOutput(outIx, eltN);
  }
-  __device__ void directRecv(intptr_t inpIx, intptr_t outIx, int eltN) {
+  __device__ void directRecv(intptr_t outIx, int eltN) {
    static_cast<RealPrimitives*>(this)->recv(outIx, eltN, /*postOp=*/false);
  }
  __device__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
@@ -178,6 +178,18 @@ struct PrimitivesWithoutDirect {
  }
 };

+__device__ inline int checkAbort(int &abortCache, const int abortValue, int &spins) { // Spin-loop helper: reads the abort flag only once every NCCL_SPINS_BEFORE_CHECK_ABORT calls; returns nonzero once an abort has been seen.
+  if (abortCache & abortValue) return 1; // abort already latched in the caller's cache: skip the counter and the memory read
+  if (++spins < NCCL_SPINS_BEFORE_CHECK_ABORT) return 0; // not yet time to poll; just count this spin
+  spins = 0; // restart the countdown to the next poll
+  int abort = __atomic_load_n((ncclShmem.comm.abortFlag), __ATOMIC_SEQ_CST); // read the flag that ncclShmem.comm.abortFlag points to
+  if (abort) {
+    __atomic_store_n(&ncclShmem.aborted, abort, __ATOMIC_SEQ_CST); // record it in ncclShmem.aborted, which the `while (ncclShmem.aborted == 0)` loop in ncclKernelMain tests
+    abortCache |= abortValue; // latch the abortValue bit(s) so later calls return 1 immediately
+  }
+  return abort; // raw flag value: nonzero when an abort was read, 0 otherwise
+}
+
 #include "prims_simple.h"
 #include "prims_ll.h"
 #include "prims_ll128.h"
@@ -85,15 +85,18 @@ private:
 #endif
  }

-  uint32_t abort = 0;
+  int abort = 0;

-  inline __device__ int checkAbort(int &spins, int send) {
-    spins++;
-    if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
-      abort = __atomic_load_n((ncclShmem.comm.abortFlag), __ATOMIC_SEQ_CST);
+  __device__ inline int checkAbort(int &abortCache, const int abortValue, int &spins) {
+    if (abortCache == 0 && ++spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
+      int abort = __atomic_load_n((ncclShmem.comm.abortFlag), __ATOMIC_SEQ_CST);
      spins = 0;
+      if (abort) {
+        __atomic_store_n(&ncclShmem.aborted, abort, __ATOMIC_SEQ_CST);
+        abortCache |= abortValue;
+      }
    }
-    return abort;
+    return abortCache;
  }

  inline __device__ void waitSend(int nbytes) {
@@ -108,7 +111,7 @@ private:
      while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) {
        __builtin_amdgcn_s_sleep(1);
        sendConnHeadCache = atomicAdd((unsigned long long *)sendConnHeadPtr, 0);
-        if (checkAbort(spins, 1)) break;
+        if (checkAbort(abort, 1, spins)) break;
      }
      if (sendConnFifo) {
        int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? stepLines*sizeof(union ncclLLFifoLine) : nbytes;
@@ -168,7 +171,7 @@ private:
 #if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
      npkitWaitRecvSpins++;
 #endif
-      if (checkAbort(spins, 0)) break;
+      if (checkAbort(abort, 1, spins)) break;
    } while ((i4.flag1 != flag) || (i4.flag2 != flag));
    uint64_t val64 = (uint64_t)(i4.data1) + (((uint64_t)i4.data2) << 32);
 #else
@@ -177,7 +180,7 @@ private:
 #if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
      npkitWaitRecvSpins++;
 #endif
-      if (checkAbort(spins, 0)) break;
+      if (checkAbort(abort, 1, spins)) break;
    } while ((flag1 != flag) || (flag2 != flag));
    uint64_t val64 = data1 + (((uint64_t)data2) << 32);
 #endif
@@ -241,7 +244,7 @@ private:
 #if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
      npkitWaitRecvSpins++;
 #endif
-      if (checkAbort(spins, 0)) break;
+      if (checkAbort(abort, 1, spins)) break;
    } while(line[i].flag1 != flag || line[i].flag2 != flag);
    uint64_t val64 = line[i].data1 + (((uint64_t)line[i].data2) << 32);

@@ -86,16 +86,18 @@ private:
 #endif
  }

-  uint32_t abort = 0;
-  uint32_t* sync;
+  int abort = 0;

-  inline __device__ int checkAbort(int &spins, int i, int send) {
-    spins++;
-    if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
-      abort = __atomic_load_n(ncclShmem.comm.abortFlag, __ATOMIC_SEQ_CST);
+  __device__ inline int checkAbort(int &abortCache, const int abortValue, int &spins) {
+    if (abortCache == 0 && ++spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
+      int abort = __atomic_load_n((ncclShmem.comm.abortFlag), __ATOMIC_SEQ_CST);
      spins = 0;
+      if (abort) {
+        __atomic_store_n(&ncclShmem.aborted, abort, __ATOMIC_SEQ_CST);
+        abortCache |= abortValue;
+      }
    }
-    return abort;
+    return abortCache;
  }

  inline __device__ void waitSend(int nbytes) {
@@ -104,7 +106,7 @@ private:
      while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) {
        __builtin_amdgcn_s_sleep(1);
        sendConnHeadCache = __atomic_load_n(sendConnHeadPtr, __ATOMIC_RELAXED);
-        if (checkAbort(spins, wid, 1)) break;
+        if (checkAbort(abort, 1, spins)) break;
      }
      if (sendConnFifo) {
        sendConnFifo[sendStep[wid]%NCCL_STEPS].size = nbytes;
@@ -241,7 +243,7 @@ private:
          load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]);
          needReload |= flagThread && (vr[u+1] != flag);
        }
-        needReload &= (0 == checkAbort(spins, 0, 0));
+        needReload &= (0 == checkAbort(abort, 1, spins));
      } while (__any(needReload));
      #pragma unroll
      for (int u=0; u<ELEMS_PER_THREAD; u+=2)
@@ -287,7 +289,7 @@ private:
            load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]);
            needReload |= flagThread && (vr[u+1] != flag);
          }
-          needReload &= (0 == checkAbort(spins, i, 0));
+          needReload &= (0 == checkAbort(abort, 1, spins));
        } while (__any(needReload));

        #pragma unroll
@@ -59,7 +59,7 @@ class Primitives<
  uint64_t connStepCache; // Cache last seen value of (*connStepPtr)
  int      connStepSize; // Connection step size
  void*    netDeviceHandle;
-  uint64_t accSize; // Accumulated size. Used by PAT operations
+  uint64_t accSize;
  uint32_t* next_hdp_reg;
  uint64_t* barriers;
  uint64_t barrier_next = 0;
@@ -86,19 +86,21 @@ private:
      #endif
  }
  inline __device__ void subBarrier() {
+    if (nworkers == WARP_SIZE) __syncwarp();
+    else
+      barrier();
+  }
+
+  inline __device__ void patBarrier() {
    barrier();
  }

-  inline __device__ bool checkAbort(int &spins) {
-    spins++;
-    if (!(flags & Aborted) && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
-      if (__atomic_load_n(ncclShmem.comm.abortFlag, __ATOMIC_SEQ_CST)) {
-        flags |= Aborted;
-        ncclShmem.aborted = 1;
-      }
-      spins = 0;
-    }
-    return flags & Aborted;
+  inline __device__ void barrierAny() {
+    barrier();
+  }
+
+  inline __device__ void subBarrierAny() {
+    barrier();
  }

  inline __device__ uint64_t loadStepValue(uint64_t* ptr) {
@@ -129,7 +131,7 @@ private:
      while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) {
        __builtin_amdgcn_s_sleep(1);
        connStepCache = loadStepValue(connStepPtr);
-        if (checkAbort(spins)) break;
+        if (checkAbort(flags, Aborted, spins)) break;
        //if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem.comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
        if (spins == 0 && repeat > 0) {
          repeat --;
@@ -482,13 +484,8 @@ public:
    peerPtr->recv[connIndex].step += steps;
    st_relaxed_sys_global(peerPtr->recv[connIndex].head, peerPtr->recv[connIndex].step);
    while (ld_volatile_global(peerPtr->recv[connIndex].tail) < peerPtr->recv[connIndex].step) {
-      if (spins++ == NCCL_SPINS_BEFORE_CHECK_ABORT) {
-        if (*ncclShmem.comm.abortFlag) {
-          ncclShmem.aborted = 1;
-          break;
-        }
-        spins = 0;
-      }
+      int abort = 0;
+      if (checkAbort(abort, 1, spins)) break;
    }
  }

@@ -503,7 +500,7 @@ public:
          int spins = 0;
          while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) {
            connStepCache = loadStepValue(connStepPtr);
-            if (checkAbort(spins)) break;
+            if (checkAbort(flags, Aborted, spins)) break;
          }
          void **ptrs = isSendNotRecv ? ncclShmem.groups[group].dsts
                                      : ncclShmem.groups[group].srcs;
@@ -754,6 +751,9 @@ public:
    flags = 0;
    index = -1;
    if (mode == primsModeDefault) { // Connect to ranks in sendPeers/recvPeers
+      // // For send operations, we need an extra warp to overlap the threadfence and the copy
+      // this->nworkers = nthreads - (MaxSend > 0 && nthreads >= NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE ? WARP_SIZE : 0);
+
      int nrecv=0, nsend=0;
      // Yes, for some template arguments this code will be unreachable.  That's fine.
      // coverity[dead_error_line]
@@ -783,68 +783,84 @@ public:

      if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index];
      if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index];
+
+      // Coverity thinks that index could be -1 here but that's not actually the case.
+      // coverity[negative_returns:FALSE]
+      int sendIpcReg;
+      int recvIpcReg;
+      int sendNetReg;
+      int recvNetReg;
+      if (P2p) {
+        sendIpcReg = p2pWork ? p2pWork->sendIpcReg : 0;
+        recvIpcReg = p2pWork ? p2pWork->recvIpcReg : 0;
+        sendNetReg = p2pWork ? p2pWork->sendNetReg : 0;
+        recvNetReg = p2pWork ? p2pWork->recvNetReg : 0;
+      } else {
+        recvIpcReg = sendIpcReg = collWork ? collWork->regUsed : 0;
+        recvNetReg = sendNetReg = collWork ? collWork->netRegUsed : 0;
+      }
+
+      // coverity[overrun-call] => Coverity think prims.index can be greater than 1
+      if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg);
+      // coverity[overrun-call] => Coverity think prims.index can be greater than 1
+      if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg);
+
+      // if (barrierAny(flags & NetDeviceUnpack)) {
+      //   flags |= AnyNetDeviceUnpack;
+      //   // RoleWaitRecv starts at tid=0, so this creates the bitmask of which recv peers
+      //   // have NetDeviceUnpack.
+      //   uint32_t mask = __ballot_sync(~0u, ((flags & RoleWaitRecv) && (flags & NetDeviceUnpack)) ? 1 : 0);
+      //   if (tid == 0) {
+      //     ncclShmem.groups[this->group].devicePlugin.unpack.unpackNetDeviceIndexMask = mask;
+      //   }
+      // }
+
+      // coverity[negative_returns:FALSE] => coverity thinks that index could be -1 but that's not actually the case
+      // coverity[var_deref_model] => coverity thinks work can dereferenced if NULL but this is not the case
+      setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)collWork, sendIpcReg || recvIpcReg, peer);
+      // coverity[uninit_member] => coverity thinks fan.n is not initialized
    } else if (mode == primsModePatRs || mode == primsModePatAg) { // Connect to all ranks +/- 2^n
      flags |= PatMode;
-      accSize = 0;
+      const int roles[5] = { RoleWaitRecv, RolePostRecv, RoleWaitSend, RolePostSend, RoleInput | RoleOutput };
+      if (tid < 5) flags |= roles[tid];
+
      int nranks = ncclShmem.comm.nRanks;
-      int rank = ncclShmem.comm.rank;
-      // A thread is responsible for rank +/- 2 ^ (tid%32). That should be fine as long as rank is a 32-bits integer.
-      index = tid % 32;
-      uint32_t delta = 1 << index;
-      const int roles[4] = { RoleWaitRecv, RoleWaitSend, RolePostSend, RolePostRecv};
-      int block = tid / 32;
-      if (block < 4 && delta < nranks) {
-        int role = roles[block];
-        if (mode == primsModePatRs) {
-          if (role & (RoleWaitRecv|RolePostRecv)) peer = (rank - delta + nranks) % nranks;
-          if (role & (RoleWaitSend|RolePostSend)) peer = (rank + delta) % nranks;
-        } else if (mode == primsModePatAg) {
-          if (role & (RoleWaitSend|RolePostSend)) peer = (rank - delta + nranks) % nranks;
-          if (role & (RoleWaitRecv|RolePostRecv)) peer = (rank + delta) % nranks;
-        }
-        flags |= role;
-      } else if (tid == 128) {
-        flags |= RoleInput | RoleOutput; // Only one will be used depending on the operation
+      if (tid < 32 && ((1UL<<tid) < nranks)) {
+        int rank = ncclShmem.comm.rank;
+        uint32_t delta = 1 << tid;
+        // Load recv peer
+        int recvPeer = mode == primsModePatRs ? (rank - delta + nranks) % nranks : (rank + delta) % nranks;
+        struct ncclPatPeer* peer = ((struct ncclPatPeer*)recvPeers)+tid;
+        struct ncclConnInfo* conn = peer->conn = ncclShmem.channel.peers[recvPeer]->recv+connIndexRecv;
+        peer->step = conn->step;
+        peer->buff = conn->buffs[NCCL_PROTO_SIMPLE];
+        peer->stepCache = loadStepValue(peer->tailPtr = conn->tail);
+        peer->headPtr = conn->head;
+        peer->accSize = 0;
+        peer->connStepSize = conn->stepSize/sizeof(T);
+        // Load send peer
+        int sendPeer = mode == primsModePatAg ? (rank - delta + nranks) % nranks : (rank + delta) % nranks;
+        peer = ((struct ncclPatPeer*)sendPeers)+tid;
+        conn = peer->conn = ncclShmem.channel.peers[sendPeer]->send+connIndexSend;
+        peer->step = conn->step;
+        peer->connFifo = conn->connFifo;
+        peer->buff = conn->buffs[NCCL_PROTO_SIMPLE];
+        peer->stepCache = loadStepValue(peer->headPtr = conn->head);
+        peer->tailPtr = conn->tail;
+        peer->accSize = 0;
+        peer->connStepSize = conn->stepSize/sizeof(T);
      }
+      if (tid==0) {
+        ncclShmem.groups[group].userInput = (void*)inputBuf;
+        ncclShmem.groups[group].userOutput = (void*)outputBuf;
+        ncclShmem.redOpArgs[0] = redOpArg;  // scaler for local input
+      }
+      patBarrier();
    }
-
-    // Coverity thinks that index could be -1 here but that's not actually the case.
-    // coverity[negative_returns:FALSE]
-    int sendIpcReg;
-    int recvIpcReg;
-    int sendNetReg;
-    int recvNetReg;
-    if (P2p) {
-      sendIpcReg = p2pWork ? p2pWork->sendIpcReg : 0;
-      recvIpcReg = p2pWork ? p2pWork->recvIpcReg : 0;
-      sendNetReg = p2pWork ? p2pWork->sendNetReg : 0;
-      recvNetReg = p2pWork ? p2pWork->recvNetReg : 0;
-    } else {
-      recvIpcReg = sendIpcReg = collWork ? collWork->regUsed : 0;
-      recvNetReg = sendNetReg = collWork ? collWork->netRegUsed : 0;
-    }
-    // coverity[overrun-call] => Coverity think prims.index can be greater than 1
-    if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg);
-    // coverity[overrun-call] => Coverity think prims.index can be greater than 1
-    if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg);
-
-    // if (barrierAny(flags & NetDeviceUnpack)) {
-    //   flags |= AnyNetDeviceUnpack;
-    //   // RoleWaitRecv starts at tid=0, so this creates the bitmask of which recv peers
-    //   // have NetDeviceUnpack.
-    //   uint32_t mask = __ballot_sync(~0u, ((flags & RoleWaitRecv) && (flags & NetDeviceUnpack)) ? 1 : 0);
-    //   if (tid == 0) {
-    //     ncclShmem.groups[this->group].devicePlugin.unpack.unpackNetDeviceIndexMask = mask;
-    //   }
-    // }
-
-    // coverity[negative_returns:FALSE] => coverity thinks that index could be -1 but that's not actually the case
-    // coverity[var_deref_model] => coverity thinks work can dereferenced if NULL but this is not the case
-    setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)collWork, sendIpcReg || recvIpcReg, peer);
-    // coverity[uninit_member] => coverity thinks fan.n is not initialized
  }

  __forceinline__ __device__ ~Primitives() {
+    if (flags&PatMode) return;
    // Save steps for the next operation
    if (flags & (RolePostSend|RolePostRecv)) conn->step = step;
    if ((flags & NetRegMode) && (flags & RoleWaitSend)) {
@@ -854,7 +870,7 @@ public:
      uint64_t prevStep = step - StepPerSlice;
      volatile ssize_t* ptr = &(connFifo[prevStep%NCCL_STEPS].size);
      int spins = 0;
-      while (*ptr != -1) if (checkAbort(spins)) break;
+      while (*ptr != -1) if (checkAbort(flags, Aborted, spins)) break;
    }

    if (flags & NetDeviceUnpack) {
@@ -872,7 +888,7 @@ public:
      int spins = 0;
      volatile uint64_t* tail = conn->tail;
      volatile uint64_t* head = conn->head;
-      while (*tail > *head) if (checkAbort(spins)) break;
+      while (*tail > *head) if (checkAbort(flags, Aborted, spins)) break;
    }
  }

@@ -895,7 +911,7 @@ public:
        if (slot) {
          T* exchgPtr;
          directBuff = (T*)outputBuf;
-          while ((void *)atomicAdd((unsigned long long *) slot,0) != nullptr && !checkAbort(spins));
+          while ((void *)atomicAdd((unsigned long long *) slot,0) != nullptr && !checkAbort(flags, Aborted, spins));
          if (P2p) {
            exchgPtr = (T*)outputBuf;
          } else {
@@ -912,7 +928,7 @@ public:
        void* ptr;
        while (slot) {
          ptr = (void *)atomicAdd((unsigned long long *) slot,0);
-          if (ptr != nullptr || checkAbort(spins)) break;
+          if (ptr != nullptr || checkAbort(flags, Aborted, spins)) break;
        }

        if (slot) {
@@ -931,7 +947,7 @@ public:
        // Wait for consumer to consume previous value before trampling it.
        if (slot && argSlot0 && argSlot1) {
          T* exchgPtr;
-          while (((void *)atomicAdd((unsigned long long *) slot,0) != nullptr || *argSlot0 != 0 || *argSlot1 != 0) && !checkAbort(spins));
+          while (((void *)atomicAdd((unsigned long long *) slot,0) != nullptr || *argSlot0 != 0 || *argSlot1 != 0) && !checkAbort(flags, Aborted, spins));
          // If there is no recv, then we are directly pulling from input buffer (e.g. directScatter)
          // Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend)
          directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf;
@@ -961,7 +977,7 @@ public:
        void* ptr;
        while (slot) {
          ptr = (void *)atomicAdd((unsigned long long *) slot,0);
-          if (ptr != nullptr || checkAbort(spins)) break;
+          if (ptr != nullptr || checkAbort(flags, Aborted, spins)) break;
        }

        if (slot && argSlot0 && argSlot1) {
@@ -972,7 +988,7 @@ public:
            while (true) {
              arg0 = *argSlot0;
              arg1 = *argSlot1;
-              if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break;
+              if ((arg0 != 0 && arg1 != 0) || checkAbort(flags, Aborted, spins)) break;
            }
            ncclShmem.redOpArgs[1 + index] = ((arg1 & 0xffffffff) << 32) | (arg0 & 0xffffffff);
          }
@@ -1020,8 +1036,8 @@ public:
  __device__ __forceinline__ void recv(intptr_t outIx, int eltN, bool postOp=false) {
    genericOp<0, 0, 1, 0, -1, Output>(-1, outIx, eltN, postOp);
  }
-  __device__ __forceinline__ void directRecv(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
-    genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, postOp);
+  __device__ __forceinline__ void directRecv(intptr_t outIx, int eltN, bool postOp=false) {
+    genericOp<1, 0, 1, 0, -1, Output>(outIx, outIx, eltN, postOp);
  }
  __device__ __forceinline__ void directRecvCopy(intptr_t inpIx, intptr_t outIx, int eltN) {
    genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, /*postOp=*/false);
@@ -1099,54 +1115,65 @@ public:
    ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
  }

-  __device__ __forceinline__ void patReduce(int recvPow2, int sendPow2, intptr_t inpIx, intptr_t outIx, int recvOffset, int sendOffset, int sendStepOffset, int nelem, int postRecv, int postSend) {
-    nelem = nelem < 0 ? 0 : nelem;
+  __device__ __forceinline__ void patReduce(struct ncclPatStep* ps, struct ncclPatShmem* shmem) {
+    if (ps->flags & PatSkipped) { patBarrier(); patBarrier(); return; } // Skipped
+    int nelem = ps->nelem < 0 ? 0 : ps->nelem;
    T* userInput = (T*)ncclShmem.groups[group].userInput;
    T* userOutput = (T*)ncclShmem.groups[group].userOutput;

-    if (recvPow2 >= 0 && recvPow2 == index && (flags & RoleWaitRecv)) {
-      ncclShmem.groups[group].srcs[0] = (T*)(connEltsFifo + (step%NCCL_STEPS)*connStepSize) + recvOffset;
-      int spins = 0;
-      while (connStepCache < step + StepPerSlice) {
-        connStepCache = loadStepValue(connStepPtr);
-        if (checkAbort(spins)) break;
-      }
-      if (postRecv) step += StepPerSlice;
+    bool recv = ps->recvDim >= 0 && (flags & (RolePostRecv|RoleWaitRecv));
+    bool send = ps->sendDim >= 0 && (flags & (RolePostSend|RoleWaitSend));
+    bool postRecv = ps->postRecv && recv;
+    bool postSend = ps->postSend && send;
+    struct ncclPatPeer* peer = NULL;
+    if (recv) {
+      peer = shmem->recvDims+ps->recvDim;
+      step = peer->step;
    }
-    if (sendPow2 >= 0 && sendPow2 == index && (flags & RoleWaitSend)) {
-      int spins = 0;
-      while (connStepCache + NCCL_STEPS < step + sendStepOffset + StepPerSlice) {
-        connStepCache = loadStepValue(connStepPtr);
-        if (checkAbort(spins)) break;
-      }
-      ncclShmem.groups[group].dsts[0] = (T*)(connEltsFifo + ((step+sendStepOffset)%NCCL_STEPS)*connStepSize) + sendOffset;
-      if (accSize < sendOffset + nelem + (step+sendStepOffset)*connStepSize) {
-        // New data, add our own data to it.
-        ncclShmem.groups[group].srcs[1] = userInput + inpIx;
-        accSize = sendOffset + nelem + (step+sendStepOffset)*connStepSize;
-        if (flags & ConnFifoEnabled)
-          connFifo[(step+sendStepOffset)%NCCL_STEPS].size = (sendOffset + nelem)*sizeof(T);
-      } else {
-        // There is already data in there, accumulate instead of writing to it.
-        ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0];
-      }
-      if (postSend) step += StepPerSlice;
+    if (send) {
+      peer = shmem->sendDims+ps->sendDim;
+      step = peer->step;
    }
-    if (sendPow2 < 0 && (flags & RoleOutput)) { // Destination is our own local buffer
-      ncclShmem.groups[group].dsts[0] = userOutput + outIx;
-      if (accSize < outIx + nelem) {
+
+    if (recv && (flags & RoleWaitRecv)) {
+      ncclShmem.groups[group].srcs[0] = ((T*)peer->buff) + (step%NCCL_STEPS)*peer->connStepSize + ps->recvOffset;
+      int spins = 0;
+      while (peer->stepCache < step + StepPerSlice) {
+        peer->stepCache = loadStepValue(peer->tailPtr);
+        if (checkAbort(flags, Aborted, spins)) break;
+      }
+    }
+    if (send && (flags & RoleWaitSend)) {
+      int spins = 0;
+      while (peer->stepCache + NCCL_STEPS < step + ps->stepOffset + StepPerSlice) {
+        peer->stepCache = loadStepValue(peer->headPtr);
+        if (checkAbort(flags, Aborted, spins)) break;
+      }
+      ncclShmem.groups[group].dsts[0] = ((T*)peer->buff) + ((step+ps->stepOffset)%NCCL_STEPS)*peer->connStepSize + ps->sendOffset;
+      if (peer->accSize < ps->sendOffset + nelem + (step+ps->stepOffset)*peer->connStepSize) {
        // New data, add our own data to it.
-        ncclShmem.groups[group].srcs[1] = userInput + inpIx;
-        accSize = outIx + nelem;
+        ncclShmem.groups[group].srcs[1] = userInput + ps->inpIx;
      } else {
        // There is already data in there, accumulate instead of writing to it.
        ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0];
      }
    }
-    barrier();
+    long long int localAccSize = shmem->localAccSize;
+    if (ps->sendDim < 0 && (flags & RoleOutput)) { // Destination is our own local buffer
+      ncclShmem.groups[group].dsts[0] = userOutput + ps->outIx;
+      if (localAccSize < ps->outIx + nelem) {
+        // New data, add our own data to it.
+        ncclShmem.groups[group].srcs[1] = userInput + ps->inpIx;
+        localAccSize = ps->outIx + nelem;
+      } else {
+        // There is already data in there, accumulate instead of writing to it.
+        ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0];
+      }
+    }
+    patBarrier();
    int nSrcs = 2;
    void** srcs = ncclShmem.groups[group].srcs;
-    if (recvPow2 < 0) { srcs++; nSrcs--; } // No peer to receive from, remove one source
+    if (ps->recvDim < 0) { srcs++; nSrcs--; } // No peer to receive from, remove one source

    int workSize = ncclShmem.aborted ? 0 : nelem;

@@ -1154,59 +1181,92 @@ public:
      (tid, nthreads, ncclShmem.redOpArgs[0],  nullptr, /*postOp=*/false,
      nSrcs, srcs, 1, ncclShmem.groups[group].dsts, workSize);

-    barrier();
-    if (postRecv && recvPow2 >= 0 && recvPow2 == index && (flags & RolePostRecv)) postPeer<1, 0>(0 < nelem);
-    if (postSend && sendPow2 >= 0 && sendPow2 == index && (flags & RolePostSend)) postPeer<0, 1>(0 < nelem);
+    // Store conn step here inside the two barriers to make sure next reload will see the update.
+    if (postSend && (flags & RolePostSend)) {
+      if (peer->connFifo) {
+        peer->connFifo[step%NCCL_STEPS].size = (ps->sendOffset + nelem)*sizeof(T);
+      }
+      peer->step = step += StepPerSlice;
+      st_relaxed_sys_global(&peer->conn->step, step);
+    }
+    if (postRecv && (flags & RolePostRecv)) {
+      peer->step = step += StepPerSlice;
+      st_relaxed_sys_global(&peer->conn->step, step); // Also save in global mem for next op
+    }
+
+    // Update accSize
+    if (ps->sendDim < 0 && (flags & RoleOutput)) atomicMax(&shmem->localAccSize, localAccSize);
+    if (ps->sendDim >= 0 && (flags & RoleWaitSend)) atomicMax(&peer->accSize, ps->sendOffset + nelem + (step+ps->stepOffset)*peer->connStepSize);
+
+    patBarrier();
+
+    if (postSend && (flags & RolePostSend)) {
+      if (nelem > 0 || peer->connFifo) fence_acq_rel_sys();
+      st_relaxed_sys_global(peer->tailPtr, step);
+    }
+    if (postRecv && (flags & RolePostRecv)) {
+      st_relaxed_sys_global(peer->headPtr, step);
+    }
  }

-  __device__ __forceinline__ void patCopy(int recvPow2, int sendPow2, intptr_t inpIx, intptr_t outIx, int recvOffset, int sendOffset, int recvStepOffset, int nelem, int postRecv, int postSend) {
-    nelem = nelem < 0 ? 0 : nelem;
+  __device__ __forceinline__ void patCopy(struct ncclPatStep* ps, struct ncclPatShmem* shmem) {
+    if (ps->flags & PatSkipped) { patBarrier(); patBarrier(); return; } // Skipped
+    int nelem = ps->nelem < 0 ? 0 : ps->nelem;
    T* userInput = (T*)ncclShmem.groups[group].userInput;
    T* userOutput = (T*)ncclShmem.groups[group].userOutput;

-    if (recvPow2 >= 0 && recvPow2 == index && (flags & RoleWaitRecv)) {
-      ncclShmem.groups[group].srcs[0] = (T*)(connEltsFifo + ((step+recvStepOffset)%NCCL_STEPS)*connStepSize) + recvOffset;
-      int spins = 0;
-      while (connStepCache < step + recvStepOffset + StepPerSlice) {
-        connStepCache = loadStepValue(connStepPtr);
-        if (checkAbort(spins)) break;
-      }
-      if (accSize < recvOffset + nelem + (step+recvStepOffset)*connStepSize) {
-        // New data, copy to our output buffer.
-        ncclShmem.groups[group].dsts[1] = userOutput + outIx;
-        accSize = recvOffset + nelem + (step+recvStepOffset)*connStepSize;
-      } else {
-        ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; // Already done
-      }
-      if (postRecv) step += StepPerSlice;
+    bool recv = ps->recvDim >= 0 && (flags & (RolePostRecv|RoleWaitRecv));
+    bool send = ps->sendDim >= 0 && (flags & (RolePostSend|RoleWaitSend));
+    bool postRecv = ps->postRecv && recv;
+    bool postSend = ps->postSend && send;
+    struct ncclPatPeer* peer = NULL;
+    if (recv) {
+      peer = shmem->recvDims+ps->recvDim;
+      step = peer->step;
    }
-    if (sendPow2 >= 0 && sendPow2 == index && (flags & RoleWaitSend)) {
-      int spins = 0;
-      while (connStepCache + NCCL_STEPS < step + StepPerSlice) {
-        connStepCache = loadStepValue(connStepPtr);
-        if (checkAbort(spins)) break;
-      }
-      ncclShmem.groups[group].dsts[0] = (T*)(connEltsFifo + (step%NCCL_STEPS)*connStepSize) + sendOffset;
-      if (postSend) {
-        if (flags & ConnFifoEnabled)
-          connFifo[step%NCCL_STEPS].size = (sendOffset + nelem)*sizeof(T);
-        step += StepPerSlice;
-      }
+    if (send) {
+      peer = shmem->sendDims+ps->sendDim;
+      step = peer->step;
    }
-    if (recvPow2 < 0 && (flags & RoleInput)) { // Source is our own local buffer
-      ncclShmem.groups[group].srcs[0] = userInput + inpIx;
-      if (accSize < inpIx + nelem) {
+
+    if (recv && (flags & RoleWaitRecv)) {
+      ncclShmem.groups[group].srcs[0] = ((T*)peer->buff) + ((step+ps->stepOffset)%NCCL_STEPS)*peer->connStepSize + ps->recvOffset;
+      int spins = 0;
+      while (peer->stepCache < step + ps->stepOffset + StepPerSlice) {
+        peer->stepCache = loadStepValue(peer->tailPtr);
+        if (checkAbort(flags, Aborted, spins)) break;
+      }
+      if (peer->accSize < ps->recvOffset + nelem + (step+ps->stepOffset)*peer->connStepSize) {
        // New data, copy to our output buffer.
-        ncclShmem.groups[group].dsts[1] = userOutput + outIx;
-        accSize = inpIx + nelem;
+        ncclShmem.groups[group].dsts[1] = userOutput + ps->outIx;
      } else {
        ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; // Already done
      }
    }
-    barrier();
+    if (send && (flags & RoleWaitSend)) {
+      int spins = 0;
+      while (peer->stepCache + NCCL_STEPS < step + StepPerSlice) {
+        peer->stepCache = loadStepValue(peer->headPtr);
+        if (checkAbort(flags, Aborted, spins)) break;
+      }
+      ncclShmem.groups[group].dsts[0] = ((T*)peer->buff) + (step%NCCL_STEPS)*peer->connStepSize + ps->sendOffset;
+    }
+    long long int localAccSize = shmem->localAccSize;
+    if (ps->recvDim < 0 && (flags & RoleInput)) { // Source is our own local buffer
+      ncclShmem.groups[group].srcs[0] = userInput + ps->inpIx;
+      if (localAccSize < ps->inpIx + nelem) {
+        // New data, copy to our output buffer.
+        ncclShmem.groups[group].dsts[1] = userOutput + ps->outIx;
+        localAccSize = ps->inpIx + nelem;
+      } else {
+        // Already done
+        ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0];
+      }
+    }
+    patBarrier();
    int nDsts = 2;
    void** dsts = ncclShmem.groups[group].dsts;
-    if (sendPow2 < 0) { dsts++; nDsts--; } // No peer to send to, remove one dest
+    if (ps->sendDim < 0) { dsts++; nDsts--; } // No peer to send to, remove one dest
    if (ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[1]) nDsts--; // In-place or already done.

    int workSize = ncclShmem.aborted ? 0 : nelem;
@@ -1215,9 +1275,32 @@ public:
      (tid, nthreads, ncclShmem.redOpArgs[0],  nullptr, /*postOp=*/false,
      1, ncclShmem.groups[group].srcs, nDsts, dsts, workSize);

-    barrier();
-    if (postRecv && recvPow2 >= 0 && recvPow2 == index && (flags & RolePostRecv)) postPeer<1, 0>(0 < nelem);
-    if (postSend && sendPow2 >= 0 && sendPow2 == index && (flags & RolePostSend)) postPeer<0, 1>(0 < nelem);
+    // Store conn step here inside the two barriers to make sure next reload will see the update.
+    if (postSend && (flags & RolePostSend)) {
+      if (peer->connFifo) {
+        peer->connFifo[step%NCCL_STEPS].size = (ps->sendOffset + nelem)*sizeof(T);
+      }
+      peer->step = step += StepPerSlice;
+      st_relaxed_sys_global(&peer->conn->step, step);
+    }
+    if (postRecv && (flags & RolePostRecv)) {
+      peer->step = step += StepPerSlice;
+      st_relaxed_sys_global(&peer->conn->step, step); // Also save in global mem for next op
+    }
+
+    // Update accSize
+    if (ps->recvDim < 0 && (flags & RoleInput)) atomicMax(&shmem->localAccSize, localAccSize);
+    if (ps->recvDim >= 0 && (flags & RoleWaitRecv)) atomicMax(&peer->accSize, ps->recvOffset + nelem + (step+ps->stepOffset)*peer->connStepSize);
+
+    patBarrier();
+
+    if (postSend && (flags & RolePostSend)) {
+      if (nelem > 0 || peer->connFifo) fence_acq_rel_sys();
+      st_relaxed_sys_global(peer->tailPtr, step);
+    }
+    if (postRecv && (flags & RolePostRecv)) {
+      st_relaxed_sys_global(peer->headPtr, step);
+    }
  }

  // MSCCL primitives
@@ -170,29 +170,66 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_L
 template<typename T, typename RedOp>
 struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE> {
  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
+#if __CUDA_ARCH__ >= 600 // NOTE(review): if __CUDA_ARCH__ is undefined in this build, the whole body is preprocessed away and run() does nothing - confirm that is intended.
    using Proto = ProtoSimple<1, 1>;
    const int nranks = ncclShmem.comm.nRanks;
    const int rank = ncclShmem.comm.rank;
    size_t count, channelOffset, channelCount, chunkCount;
    ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &channelOffset, &channelCount, &chunkCount);

-    T *inputBuf = (T*)work->sendbuff;
-    T *outputBuf = (T*)work->recvbuff;
-    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
-      (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, nullptr, 0, primsModePatRs);
+    static constexpr int nworkers = NCCL_PAT_NWORKERS; // Threads [0, nworkers) are workers; thread `nworkers` computes the schedule; higher tids skip both branches below.
+    struct ncclPatShmem* shmem = (struct ncclPatShmem*)ncclScratchForWarp(0); // State shared between the algo thread and the workers.
+    uint64_t pollCount = 0; // Incremented in the spin loops below; not read anywhere in this function.
+    __syncthreads(); // Don't start using shared mem until everyone arrives
+    for (int i=tid; i<NCCL_SHMEM_PAT_STEPS; i+=nthreads) shmem->patSteps[i].flags = 0; // flags == 0 marks a step slot as free for the algo thread to fill.
+    if (tid == 0) shmem->localAccSize = 0;
+    if (tid == nworkers) shmem->parallelFactor = 0; // 0 means "not published yet"; workers spin on this below.
+    __syncthreads();

-    PatRSAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
-    int last = 0;
-    while (!last) {
-      int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, postRecv, postSend, nelem;
-      size_t inpIx, outIx;
-      patAlgo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend, last);
-      prims.patReduce(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend);
+    if (tid == nworkers) { // Algo computation thread
+      PatRSAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, NCCL_PAT_NWORKERS/WARP_SIZE, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
+      int parallelFactor = shmem->parallelFactor = patAlgo.getParallelFactor(); // Publishes the factor to the workers; the local copy is not used again in this branch.
+      int step = 0;
+      while (1) {
+        struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS); // Step slots are reused as a ring of NCCL_SHMEM_PAT_STEPS entries.
+        cuda::atomic_ref<int, cuda::thread_scope_block> poll(ps->flags);
+        while (poll.load(cuda::memory_order_acquire) != 0) pollCount++; // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS'
+        patAlgo.getNextOp(ps); // Fills the slot; presumably also sets ps->flags non-zero to hand it to a worker - confirm in PatRSAlgorithm.
+        int last = ps->last;
+        step++;
+        if (last == 2) break; // NOTE(review): the meaning of 2 is defined by getNextOp, not visible here; workers, by contrast, stop on any non-zero `last`.
+      }
+    } else if (tid < nworkers) { // Worker threads
+      T *inputBuf = (T*)work->sendbuff;
+      T *outputBuf = (T*)work->recvbuff;
+      int parallelFactor = 0;
+      volatile int* pfPtr = &shmem->parallelFactor;
+      while (parallelFactor == 0) parallelFactor = *pfPtr; // Spin until the algo thread publishes a non-zero factor.
+
+      int groupSize = nworkers/(WARP_SIZE*parallelFactor) * WARP_SIZE; // Threads per group, rounded down to whole warps. NOTE(review): 0 if parallelFactor > nworkers/WARP_SIZE - presumably prevented by the cap passed to PatRSAlgorithm; confirm.
+      int group = tid / groupSize;
+      int nGroups = nworkers / groupSize;
+      int tidInGroup = tid - group*groupSize;
+      // We don't use recvPeers/sendPeers so let's pass shmem structs instead
+      Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
+        (tidInGroup, groupSize, (int*)shmem->recvDims, (int*)shmem->sendDims, inputBuf, outputBuf, work->redOpArg, group, 0, 0, nullptr, nullptr, 0, primsModePatRs);
+
+      int step = group; // Group g consumes steps g, g+nGroups, g+2*nGroups, ...
+      while(1) {
+        struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS);
+        cuda::atomic_ref<int, cuda::thread_scope_block> poll(ps->flags);
+        while (poll.load(cuda::memory_order_acquire) == 0) pollCount++; // Wait for compute thread
+        int last = ps->last; // Read before the slot is handed back below.
+        prims.patReduce(ps, shmem);
+        if (tidInGroup == 0) poll.store(0, cuda::memory_order_release); // Return element to compute thread
+        if (last) break;
+        step += nGroups;
+      }
    }
+#endif
  }
 };

-
 template<typename T, typename RedOp>
 struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
  __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
@@ -122,7 +122,7 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
    size_t cursor = 0;
    do {
      int n = min(size_t(chunkSize), bytes-cursor);
-      prims.directRecv(cursor, cursor, n);
+      prims.directRecv(cursor, n);
      cursor += n;
    } while (cursor < bytes);

@@ -84,7 +84,6 @@ NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0);
 ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* maxStackSize) {
  constexpr int KernelCount = sizeof(rcclKernelTable)/sizeof(rcclKernelTable[0]);
  ncclResult_t result = ncclSuccess;
-  int print = 0;

  if (maxStackSize) *maxStackSize = 0;
  int carveout = ncclParamL1SharedMemoryCarveout();
@@ -115,11 +114,9 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* ma
    if (ncclMaxSharedMem != 0) {
      int sharedMemSize = ncclMaxSharedMem;
      if (sharedMemSize > (maxSharedMem-attr.sharedSizeBytes)) {
-        if (print++ == 0)
-          INFO(NCCL_INIT, "ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu",
-               sharedMemSize, maxSharedMem-attr.sharedSizeBytes);
-        // Reduce requested MaxDynamicSharedMemorySize attribute
-        sharedMemSize = maxSharedMem - attr.sharedSizeBytes;
+        WARN("cudaArch %d ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu",
+             cudaArch, sharedMemSize, maxSharedMem-attr.sharedSizeBytes);
+        return ncclSystemError;
      }
      CUDACHECKGOTO(cudaFuncSetAttribute(fn,
        cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemSize),
@@ -366,6 +363,7 @@ ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) {
    devWork.rcclUseOneSlice = comm->rcclUseOneSlice;
    devWork.isOneRPN = comm->isOneRPN;
    devWork.netRegUsed = devWork.regUsed = 0;
+    devWork.profilerEnabled = ncclProfilerPluginLoaded() && (task->eActivationMask & ncclProfileKernelCh);
    if (task->regBufType & NCCL_NET_REG_BUFFER)
      devWork.netRegUsed = 1;
    if (task->regBufType & (NCCL_IPC_REG_BUFFER | NCCL_NVLS_REG_BUFFER))
@@ -467,6 +465,7 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
        struct ncclTaskColl* next = aggBeg->next;
        aggBeg->algorithm = agg.algorithm;
        aggBeg->protocol = agg.protocol;
+        if (aggBeg->protocol == NCCL_PROTO_LL) aggBeg->trafficBytes *= 4;
        aggBeg->nMaxChannels = agg.nMaxChannels;
        aggBeg->nWarps = agg.nWarps;
        aggBeg->devFuncId = agg.devFuncId;
@@ -526,6 +525,7 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
      devWork.redOpArgIsPtr = task->opDev.scalarArgIsPtr;
      devWork.oneNode = (comm->nNodes == 1);
      devWork.netRegUsed = devWork.regUsed = 0;
+      devWork.profilerEnabled = ncclProfilerPluginLoaded() && (task->eActivationMask & ncclProfileKernelCh);
      if (task->regBufType & NCCL_NET_REG_BUFFER)
        devWork.netRegUsed = 1;
      if (task->regBufType & (NCCL_IPC_REG_BUFFER | NCCL_NVLS_REG_BUFFER))
@@ -559,6 +559,14 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
  return ncclSuccess;
 }

+static ncclResult_t addProfilerProxyOpIfNeeded(struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclProxyOp* op) { // Submits `op` through addProxyOpIfNeeded() with its pattern temporarily switched to ncclPatternProfiler.
+  int tmp = op->pattern; // Remember the caller's pattern so it can be restored below.
+  op->pattern = ncclPatternProfiler;
+  ncclResult_t ret = addProxyOpIfNeeded(comm, plan, op);
+  op->pattern = tmp; // Restored whether or not the call above succeeded.
+  return ret; // Result of addProxyOpIfNeeded(), passed through unchanged.
+}
+
 RCCL_PARAM(IntraNetThreshold, "INTRANET_THRESHOLD", 8388608);

 static ncclResult_t scheduleCollTasksToPlan(
@@ -571,7 +579,7 @@ static ncclResult_t scheduleCollTasksToPlan(
  int nChannels[2*2] = {0, 0, 0, 0}; // [collnet][nvls]
  int const nMaxChannels[2*2] = {comm->nChannels, comm->nvlsChannels, // [collnet][nvls]
                                 comm->nChannels, comm->nvlsChannels};
-  constexpr size_t MinTrafficPerChannel = 512; // Traffic as minimal
+  constexpr size_t MinTrafficPerChannel = 16 << 10; // 16K traffic as minimal
  do {
    size_t workBytes = 0;
    struct ncclTaskColl* task = ncclIntruQueueHead(&planner->collTaskQueue);
@@ -634,11 +642,16 @@ static ncclResult_t scheduleCollTasksToPlan(
        proxyOp.opCount = proxyOpId;
        proxyOp.task.coll = task;
        proxyOp.rank = comm->rank;
+        proxyOp.eActivationMask = task->eActivationMask;
+        proxyOp.incWorkCounter = true;
        addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes);
+        // Add the regular proxy op, then a second one with the profiler pattern for kernel events
        NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOp));
+        NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, &proxyOp));
      }
    } else { // not task->isCollnet
      int trafficPerByte = ncclFuncTrafficPerByte(task->func, comm->nRanks);
+      if (task->protocol == NCCL_PROTO_LL) trafficPerByte *= 4;
      size_t cellSize = divUp(divUp(MinTrafficPerChannel, (size_t)trafficPerByte), 16) * 16;
      int elementsPerCell = cellSize/elementSize;
      size_t cells = divUp(task->count*elementSize, cellSize);
@@ -762,6 +775,8 @@ static ncclResult_t scheduleCollTasksToPlan(
          }
          proxyOp->ringAlgo->incRefCount();
        }
+        proxyOp->eActivationMask = task->eActivationMask;
+        proxyOp->incWorkCounter = true;
        proxyOp->connIndex = 0;
        if (task->protocol == NCCL_PROTO_SIMPLE && task->algorithm == NCCL_ALGO_RING) {
          if (comm->useIntraNet && nBytes > rcclParamIntraNetThreshold()) {
@@ -773,6 +788,7 @@ static ncclResult_t scheduleCollTasksToPlan(
        // determine if that's actually true but it's also not clear if that would be an issue.
        // coverity[uninit_use_in_call:FALSE]
        NCCLCHECK(addProxyOpIfNeeded(comm, plan, proxyOp));
+        NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, proxyOp));
      }
    }

@@ -915,7 +931,8 @@ static ncclResult_t addP2pToPlan(
    if (protocol[dir] == NCCL_PROTO_LL) chunkSize[dir] *= 2;

    if (network[dir]) {
-      if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && (ncclPxnDisable(comm) || !comm->isAllNvlink)) {
+      bool pxnUsed = !ncclPxnDisable(comm) && comm->isAllNvlink && comm->maxLocalRanks > 1;
+      if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && (!pxnUsed)) {
        int regFlag = 0;
        NCCLCHECK(ncclCalloc(&handles[dir], nChannelsMax));
        for (int part = 0; part < nChannelsMax; part++) {
@@ -991,6 +1008,7 @@ static ncclResult_t addP2pToPlan(
  work->recvRank = recvRank;
  work->recvAddr = recvAddr;
  work->recvBytes = recvBytes==-1 ? 0 : recvBytes;
+  work->profilerEnabled = ncclProfilerPluginLoaded() && ((p2pTasks[0] ? p2pTasks[0] : p2pTasks[1])->eActivationMask & ncclProfileKernelCh);
  work->recvConnIndex = connIndex[0];
  work->recvOpCount = recvOpCount;

@@ -1010,6 +1028,7 @@ static ncclResult_t addP2pToPlan(
    op->coll = p2pTasks[dir] ? p2pTasks[dir]->func : 0;
    op->task.p2p = p2pTasks[dir];
    op->rank = comm->rank;
+    op->eActivationMask = p2pTasks[dir] ? p2pTasks[dir]->eActivationMask : 0;
    op->connIndex = connIndex[dir];
    // The following are modified per channel part in addWorkToChannels():
    // op->buffer, op->nbytes, op->nsteps = ...;
@@ -1017,6 +1036,7 @@ static ncclResult_t addP2pToPlan(

  nChannelsMax = std::max(nChannels[0], nChannels[1]);
  for (int part=0; part < nChannelsMax; part++) {
+    int incWorkCounter = -1;
    int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, part, comm->p2pnChannelsPerPeer, comm->nNodes);
    plan->channelMask.masks[channelId/64] |= uint64_t(1)<<(channelId%64);
    // Add batch first.
@@ -1058,12 +1078,19 @@ static ncclResult_t addP2pToPlan(
        }
      }

+      // Increment work counter for <send, recv> pair rather than individual p2p
+      if (proxyOps[dir].nsteps && incWorkCounter < 0) {
+        proxyOps[dir].incWorkCounter = true;
+        incWorkCounter = dir;
+      }
+
      if (proxyOps[dir].nsteps != 0) {
        // Calculate the opCount after adding batch since then the batch count will
        // equal one plus the batch index this p2p settled in.
        proxyOps[dir].channelId = channelId;
        proxyOps[dir].opCount = uint64_t(comm->planner.wipPlan.channels[channelId].nWorkBatchesP2p)<<1 | 1;
        NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOps[dir]));
+        NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, &proxyOps[dir]));
      }
    }
  }
@@ -1289,22 +1316,23 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla
      struct uploadWork_cleanup_t* cleanup = nullptr;
      cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
      void* fifoBufDev = nullptr;
+      cudaStream_t deviceStream;
+
      CUDACHECKGOTO(cudaThreadExchangeStreamCaptureMode(&mode), result, fail);

-      // Acquire deviceStream to gain access to deviceStream.cudaStream. Since the
-      // user's graph will be launched later, and it also acquires the deviceStream,
-      // it will observe this upload.
-      NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), result, fail);
+      // Acquire deviceStream. Since the user's graph will be launched later and it also
+      // acquires the deviceStream, it will observe this upload.
+      NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), result, fail);

-      CUDACHECKGOTO(cudaMallocAsync(&fifoBufDev, workBytes, comm->memPool, comm->sharedRes->deviceStream.cudaStream), result, fail);
+      CUDACHECKGOTO(cudaMallocAsync(&fifoBufDev, workBytes, comm->memPool, deviceStream), result, fail);
      plan->workBufPersistent = fifoBufDev;
      plan->kernelArgs->workBuf = fifoBufDev;

      // coverity[uninit_use_in_call:FALSE] => fifoBufHost is never NULL
-      CUDACHECKGOTO(cudaMemcpyAsync(fifoBufDev, fifoBufHost, workBytes, cudaMemcpyDefault, comm->sharedRes->deviceStream.cudaStream), result, fail);
+      CUDACHECKGOTO(cudaMemcpyAsync(fifoBufDev, fifoBufHost, workBytes, cudaMemcpyDefault, deviceStream), result, fail);
      cudaEvent_t memcpyDone;
      CUDACHECKGOTO(cudaEventCreateWithFlags(&memcpyDone, cudaEventDisableTiming), result, fail);
-      CUDACHECKGOTO(cudaEventRecord(memcpyDone, comm->sharedRes->deviceStream.cudaStream), result, fail);
+      CUDACHECKGOTO(cudaEventRecord(memcpyDone, deviceStream), result, fail);

      NCCLCHECKGOTO(ncclCalloc(&cleanup, 1), result, fail);
      cleanup->base.fn = uploadWork_cleanup_fn;
@@ -1312,7 +1340,7 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla
      cleanup->hostBuf = fifoBufHost;
      ncclIntruQueueEnqueue(&comm->eventCallbackQueue, (struct ncclCommEventCallback *)cleanup);

-      NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream), result, fail);
+      NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), result, fail);
      NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), result, fail);

    finish_scope:
@@ -1386,15 +1414,38 @@ static void HIPRT_CB hostStreamPlanCallback(void *plan_) {
  if (result != ncclSuccess) {
    WARN("hostStreamPlanCallback() failed : %s", ncclGetErrorString(result));
  }
-  if (!plan->persistent) ncclAtomicRefCountDecrement(&plan->comm->noncapturedRefs);
+  if (!plan->persistent) ncclAtomicRefCountDecrement(&plan->comm->sharedRes->noncapturedRefs);
  return;
 }

 static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* me) {
  struct ncclKernelPlan* plan = (struct ncclKernelPlan*)me; // cast from first member `reclaim`
  if (plan->persistent) {
-    comm->persistentRefs -= 1;
-    NCCLCHECK(ncclCudaFree(plan->workBufPersistent));
+    comm->sharedRes->persistentRefs -= 1;
+    comm->localPersistentRefs -= 1;
+    if (plan->workStorageType == ncclDevWorkStorageTypePersistent) {
+      cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
+      CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
+      CUDACHECK(cudaFree(plan->workBufPersistent));
+      CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
+    }
+  }
+  // Free coll tasks
+  struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue);
+  while (ct != nullptr) {
+    struct ncclTaskColl* ct1 = ct->next;
+    free(ct->sendNetHandles);
+    free(ct->recvNetHandles);
+    free(ct->srecvNetHandles);
+    ncclMemoryPoolFree(&comm->memPool_ncclTaskColl, ct);
+    ct = ct1;
+  }
+  // Free p2p tasks
+  struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue);
+  while (pt != nullptr) {
+    struct ncclTaskP2p* pt1 = pt->next;
+    ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, pt);
+    pt = pt1;
  }
  // Free proxy ops
  struct ncclProxyOp* q = ncclIntruQueueHead(&plan->proxyOpQueue);
@@ -1427,6 +1478,32 @@ static void persistentDestructor(void* plans_) {
  }
 }

+NCCL_PARAM(LaunchOrderImplicit, "LAUNCH_ORDER_IMPLICIT", 0);
+
+namespace {
+  enum ncclImplicitOrder { // Mode that getImplicitOrder() writes to its `mode` output.
+    ncclImplicitOrderNone, // Chosen when the LaunchOrderImplicit param is zero, or on the HIP/AMD compile path.
+    ncclImplicitOrderSerial, // Chosen when capturing, or when min(CUDART_VERSION, driver) is below 12030.
+    ncclImplicitOrderLaunch // Chosen when not capturing and min(CUDART_VERSION, driver) is at least 12030.
+  };
+}
+
+static ncclResult_t getImplicitOrder(enum ncclImplicitOrder *mode, bool capturing, int driver=-1) { // Writes the implicit launch-order mode to *mode; a negative `driver` means the driver version is queried here.
+  if (ncclParamLaunchOrderImplicit()) { // Any non-zero param value enables the selection below.
+#if !defined(__HIP_PLATFORM_AMD__) || !defined(__HIPCC__)
+    // Due to an unresolved bug in CUDA, ncclImplicitOrderLaunch is not supported in graphs.
+    if (capturing) { *mode = ncclImplicitOrderSerial; return ncclSuccess; }
+    if (driver < 0) { NCCLCHECK(ncclCudaDriverVersion(&driver)); }
+    *mode = 12030 <= std::min<int>(CUDART_VERSION, driver) ? ncclImplicitOrderLaunch : ncclImplicitOrderSerial; // Launch mode needs both the runtime and the driver version to be at least 12030.
+#else
+    *mode = ncclImplicitOrderNone; // Taken when both __HIP_PLATFORM_AMD__ and __HIPCC__ are defined: the param is ignored and `capturing`/`driver` are unused.
+#endif
+    return ncclSuccess;
+  }
+  *mode = ncclImplicitOrderNone; // Param is zero: no implicit ordering.
+  return ncclSuccess;
+}
+
 ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
  ncclResult_t result = ncclSuccess;
  struct ncclKernelPlanner* planner = &comm->planner;
@@ -1474,63 +1551,65 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {

    if (nPlans == 0) return ncclSuccess;

-    // Semantically we want these dependencies for the kernels launched:
-    //   1. Launch host task on hostStream.
-    //   2. Launch kernel, depends on all of {deviceStream, hostStream, userStream[i]...}
-    //   3. {deviceStream, userStream[i]...} depend on kernel.
-    // We achieve this by:
-    //   1. userStream[0] waits on deviceStream
-    //   2. deviceStream waits on each of userStream[1...]
-    //   3. host task launch on hostStream
-    //   4. userStream[0] waits on hostStream
-    //   5. kernel launch on userStream[0]
-    //   6. deviceStream waits on userStream[0]
-    //   7. userStream[1...] each waits on deviceStream
-    // The two-level fan-in fan-out is because ncclStrongStreamWaitStream() requires
-    // at least one of the two streams to be strong-stream.
    cudaStream_t launchStream = planner->streams->stream;
-    NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->deviceStream), result, failure);
+    cudaStream_t deviceStream, launchOrder;
+    NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), result, failure);

-    if (planner->numStreams != 1 || persistent) {
-      // Create dependency for device stream on user streams. First from extra user
-      // streams to deviceStream. Then deviceStream to first user stream.
+    if (persistent || planner->numStreams != 1) {
+      // userStream[0] waits on each userStream[i]...
      for (struct ncclCudaStreamList* l=planner->streams->next; l != nullptr; l = l->next) {
-        NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, &comm->sharedRes->deviceStream, l->stream), result, failure);
+        CUDACHECKGOTO(cudaEventRecord(comm->sharedRes->scratchEvent, l->stream), result, failure);
+        CUDACHECKGOTO(cudaStreamWaitEvent(launchStream, comm->sharedRes->scratchEvent, 0), result, failure);
      }
-      NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, launchStream, &comm->sharedRes->deviceStream), result, failure);
+      // userStream[0] waits on deviceStream
+      NCCLCHECKGOTO(ncclStreamWaitStream(launchStream, deviceStream, comm->sharedRes->scratchEvent), result, failure);
    } else if (planner->streams->stream != comm->lastStream && comm->lastStream != nullptr && !persistent) {
      // Stream changed from last call, create dependency against last NCCL kernel launch
-      CUDACHECK(hipStreamWaitEvent(planner->streams->stream, comm->doneEvent, 0));
+      CUDACHECKGOTO(hipStreamWaitEvent(planner->streams->stream, comm->doneEvent, 0), result, failure);
    }

-    if (persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking || __atomic_load_n(&comm->noncapturedRefs, __ATOMIC_ACQUIRE)) {
+    bool capturing = ncclCudaGraphValid(planner->capturingGraph);
+    enum ncclImplicitOrder implicitOrder;
+    NCCLCHECKGOTO(getImplicitOrder(&implicitOrder, capturing), result, failure);
+
+    if (implicitOrder != ncclImplicitOrderNone) {
+      // userStream[0] waits on per-device (context) launchOrder. Concurrent strong stream access is
+      // required if this is a graph capture, non-captured cannot be concurrent because that would violate
+      // deterministic program order of launches.
+      bool concurrent = capturing;
+      NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->context->launchOrder, concurrent, &launchOrder), result, failure);
+      NCCLCHECKGOTO(ncclStreamWaitStream(launchStream, launchOrder, comm->sharedRes->scratchEvent), result, failure);
+    }
+
+    if (persistent || comm->sharedRes->persistentRefs != 0 || ncclCudaLaunchBlocking || __atomic_load_n(&comm->sharedRes->noncapturedRefs, __ATOMIC_ACQUIRE)) {
      // We have to launch host tasks to push proxy args. We are careful to only
      // do this if necessary since host tasks impose a high performance cost in CUDA.
      bool acquired = false;
+      cudaStream_t hostStream;
      for (struct ncclKernelPlan* plan=planHead; plan != nullptr; plan = plan->next) {
        if (plan->hasProxyOps) {
          if (!acquired) {
            acquired = true;
-            NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->hostStream), result, failure);
+            NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), result, failure);
          }
-          if (!persistent) ncclAtomicRefCountIncrement(&comm->noncapturedRefs);
+          if (!persistent) ncclAtomicRefCountIncrement(&comm->sharedRes->noncapturedRefs);
          plan->isHostCbEnq = true;
-          NCCLCHECKGOTO(ncclStrongStreamLaunchHost(planner->capturingGraph, &comm->sharedRes->hostStream, hostStreamPlanCallback, plan), result, failure);
+          CUDACHECKGOTO(cudaLaunchHostFunc(hostStream, hostStreamPlanCallback, plan), result, failure);
        }
      }
      if (acquired) {
        // Make to-be-launched kernels dependent on just-launched host stream tasks.
-        NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, launchStream, &comm->sharedRes->hostStream), result, failure);
-        NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->hostStream), result, failure);
+        NCCLCHECKGOTO(ncclStreamWaitStream(launchStream, hostStream, comm->sharedRes->scratchEvent), result, failure);
+        NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->hostStream, /*concurrent=*/false), result, failure);
      }
    }

    if (persistent) {
-      comm->persistentRefs += nPlans;
+      comm->sharedRes->persistentRefs += nPlans;
+      comm->localPersistentRefs += nPlans;
      NCCLCHECKGOTO(ncclCudaGraphAddDestructor(planner->capturingGraph, persistentDestructor, (void*)planHead), result, failure);
    }
  }
-
 failure:
  return result;
 }
@@ -1549,6 +1628,7 @@ NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote);
 #endif

 ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) {
+  ncclResult_t ret = ncclSuccess;
  struct ncclKernelPlanner* planner = &comm->planner;
  int nChannels = 0;
  for (int i = 0; i < MAXCHANNELS/64; i++)
@@ -1561,23 +1641,28 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
  void* extra[] = {plan->kernelArgs, &plan->kernelArgsSize};

  if (planner->numStreams == 1 && !plan->persistent) {
-    CUDACHECK(hipExtLaunchKernel(plan->kernelFn, grid, block, extra, 0, launchStream, NULL, comm->doneEvent, 0));
    comm->lastStream = planner->streams->stream;
+    CUDACHECKGOTO(hipExtLaunchKernel(plan->kernelFn, grid, block, extra, 0, launchStream, NULL, comm->doneEvent, 0), ret, do_return);
    return ncclSuccess;
  }

  // CUfunction fn;
  // CUDACHECK(cudaGetFuncBySymbol(&fn, sym));

-  #if CUDART_VERSION >= 11080
+#if !defined(__HIP_PLATFORM_AMD__) || !defined(__HIPCC__)
  int driverVersion;
-  NCCLCHECK(ncclCudaDriverVersion(&driverVersion));
-  if (driverVersion >= 11080) {
+  NCCLCHECKGOTO(ncclCudaDriverVersion(&driverVersion), ret, do_return);
+
+  CUfunction fn;
+  CUDACHECKGOTO(cudaGetFuncBySymbol(&fn, sym), ret, do_return);
+
+  if (CUDART_VERSION >= 11080 && driverVersion >= 11080) {
+  #if CUDART_VERSION >= 11080
    int compCap = comm->compCap;
    unsigned int clusterSize = (compCap >= 90) ? comm->config.cgaClusterSize : 0;

    CUlaunchConfig launchConfig = {0};
-    CUlaunchAttribute launchAttrs[3];
+    CUlaunchAttribute launchAttrs[4] = {};
    int attrs = 0;
    /* Cooperative Group Array (CGA)
     * On sm90 and later we have an extra level of hierarchy where we
@@ -1604,6 +1689,17 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
      launchAttrs[attrs++].value.memSyncDomain = (CUlaunchMemSyncDomain) ncclParamMemSyncDomain();
    }
    #endif
+    #if CUDART_VERSION >= 12030
+    bool capturing = ncclCudaGraphValid(planner->capturingGraph);
+    enum ncclImplicitOrder implicitOrder;
+    NCCLCHECKGOTO(getImplicitOrder(&implicitOrder, capturing, driverVersion), ret, do_return);
+    if (implicitOrder == ncclImplicitOrderLaunch) {
+      launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT;
+      launchAttrs[attrs].value.launchCompletionEvent.event = comm->sharedRes->launchEvent;
+      launchAttrs[attrs].value.launchCompletionEvent.flags = 0;
+      attrs++;
+    }
+    #endif
    launchConfig.gridDimX = grid.x;
    launchConfig.gridDimY = grid.y;
    launchConfig.gridDimZ = grid.z;
@@ -1615,15 +1711,19 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
    launchConfig.numAttrs = attrs;
    launchConfig.hStream = launchStream;

-    //CUDACHECK(cudaLaunchKernelExC(&launchConfig, fnAddr, args));
-    CUCHECK(cuLaunchKernelEx(&launchConfig, fn, nullptr, extra));
-    return ncclSuccess;
-  }
+    CUCHECKGOTO(cuLaunchKernelEx(&launchConfig, fn, nullptr, extra), ret, do_return);
  #endif
+  } else {
+    // Standard kernel launch
+    CUCHECKGOTO(cuLaunchKernel(fn, grid.x, grid.y, grid.z, block.x, block.y, block.z, smem, launchStream, nullptr, extra), ret, do_return);
+  }
+#endif
  // Standard kernel launch
  //cuLaunchKernel(sym, grid.x, grid.y, grid.z, block.x, block.y, block.z, smem, launchStream, nullptr, extra);
-  CUDACHECK(cudaLaunchKernel(sym, grid, block, extra, smem, launchStream));
-  return ncclSuccess;
+  CUDACHECKGOTO(cudaLaunchKernel(sym, grid, block, extra, smem, launchStream), ret, do_return);
+
+do_return:
+  return ret;
 }

 ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) {
@@ -1643,36 +1743,51 @@ ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKern
 }

 ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
-  ncclResult_t result = ncclSuccess;
  struct ncclKernelPlanner* planner = &comm->planner;
-  bool persistent = ncclCudaGraphValid(planner->capturingGraph);
-
  if (!ncclIntruQueueEmpty(&planner->planQueue)) {
    // Reset queue to empty without destroying plans since those will be sent
    // back to us for reclaiming via callbackQueue.
    ncclIntruQueueConstruct(&planner->planQueue);
+
+    bool capturing = ncclCudaGraphValid(planner->capturingGraph);
    cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch
-    // Create dependency for deviceStream on launchStream. We know that deviceStream
-    // hasn't been modified since launchStream waited on it (in ncclLaunchPrepare),
-    // so we can say that launchStream subsumes it.
-    if (persistent || planner->numStreams != 1) NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, &comm->sharedRes->deviceStream, launchStream, /*b_subsumes_a=*/true), result, resume1);
-  resume1:
-    // Create dependency for other user streams (skip launch stream) on deviceStream.
-    // Again, the user streams haven't been touched since deviceStream waited on them
-    // so we can say they are subsumed by deviceStream.
-    struct ncclCudaStreamList* sl = planner->streams->next;
-    planner->streams = nullptr; // Reset comm->planner.streams to empty.
-    while (sl != nullptr && (planner->numStreams != 1 || persistent)) {
-      NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, sl->stream, &comm->sharedRes->deviceStream, /*b_subsumes_a=*/true), result, resume2);
-    resume2:
-      sl = sl->next;
+    cudaStream_t deviceStream, launchOrder;
+
+    if (capturing || planner->numStreams != 1) {
+      // CUDACHECK(cudaEventRecord(comm->sharedRes->scratchEvent, launchStream));
+      // deviceStream waits on userStream[0]. NOTE(review): the record above is disabled, yet scratchEvent is waited on below; confirm it is recorded elsewhere.
+      NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream));
+
+      // We know that deviceStream is strictly behind the launchStream because launchStream
+      // synced with it before kernel launch. This allows us to see deviceStream waiting
+      // on launchStream as a fast-forward. When building CUDA graphs fast forwards should
+      // be handled specially so as not to create graphs with a blowup in the number of edges.
+      // So we could do this:
+      //   CUDACHECK(cudaStreamWaitEvent(deviceStream, comm->sharedRes->scratchEvent, 0));
+      // But instead we do:
+      NCCLCHECK(ncclStreamAdvanceToEvent(planner->capturingGraph, deviceStream, comm->sharedRes->scratchEvent));
+
+      // Each userStream[i] waits on userStream[0]
+      for (struct ncclCudaStreamList* l=planner->streams->next; l != nullptr; l = l->next) {
+        CUDACHECK(cudaStreamWaitEvent(l->stream, comm->sharedRes->scratchEvent, 0));
+      }
    }
-    planner->numStreams = 0;
-    // Release device stream as acquired in ncclLaunchPrepare()
-    NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->deviceStream), result, resume3);
-  resume3:;
+    enum ncclImplicitOrder implicitOrder;
+    NCCLCHECK(getImplicitOrder(&implicitOrder, capturing));
+    if (implicitOrder != ncclImplicitOrderNone) {
+      // As in ncclLaunchPrepare, strong stream can be non-concurrent when non-captured.
+      bool concurrent = capturing;
+      // Incorporate launch event into per-device (context) launch order.
+      NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->context->launchOrder, concurrent, &launchOrder));
+      // If we don't have launch events (requires CUDA 12.3) then just use completion event (serialize execution).
+      CUDACHECK(cudaStreamWaitEvent(launchOrder, implicitOrder == ncclImplicitOrderLaunch ? comm->sharedRes->launchEvent : comm->sharedRes->scratchEvent));
+      // Release launchOrder as acquired in ncclLaunchPrepare()
+      NCCLCHECK(ncclStrongStreamRelease(planner->capturingGraph, &comm->context->launchOrder, concurrent));
+    }
+    // Release deviceStream as acquired in ncclLaunchPrepare()
+    NCCLCHECK(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false));
  }
-  return result;
+  return ncclSuccess;
 }

 /*****************************************************************************/
@@ -1780,11 +1895,11 @@ static ncclResult_t topoGetAlgoInfo(
  if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) {
    char ncclAlgoEnvStr[1024] = "";
    char ncclProtoEnvStr[1024] = "";
-    char* algoEnv = getenv("NCCL_ALGO");
+    const char* algoEnv = ncclGetEnv("NCCL_ALGO");
    if (algoEnv) {
      snprintf(ncclAlgoEnvStr, 1023, " NCCL_ALGO was set to %s.", algoEnv);
    }
-    char* protoEnv = getenv("NCCL_PROTO");
+    const char* protoEnv = ncclGetEnv("NCCL_PROTO");
    if (protoEnv) {
      snprintf(ncclProtoEnvStr, 1023, " NCCL_PROTO was set to %s.", protoEnv);
    }
@@ -2265,12 +2380,13 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {

    // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
    ncclGroupCommJoin(info->comm);
-    struct ncclTaskP2p* p2p = ncclMemoryStackAlloc<struct ncclTaskP2p>(&comm->memScoped);
+    struct ncclTaskP2p* p2p = ncclMemoryPoolAlloc<struct ncclTaskP2p>(&comm->memPool_ncclTaskP2p, &comm->memPermanent);
    p2p->buff = (void*)info->recvbuff;
    p2p->count = info->count;
    p2p->datatype = info->datatype;
    p2p->root = info->root;
    p2p->bytes = nBytes;
+    p2p->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED);
    p2p->opCount = comm->opCount;
    ncclIntruQueueEnqueue(
      isSendNotRecv ? &planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue,
@@ -2280,6 +2396,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
    // Mark channels that need pre-connect
    if (comm->rank != peer) {
      if (!(isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen)) {
+        // planner->peers[peer].send/recvSeen is private to each comm, so we need to set it anyway.
        (isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen) = true;
        int round = 0;
        while (peer != (isSendNotRecv ? comm->p2pSchedule[round].sendRank
@@ -2290,23 +2407,30 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
        for (int c=0; c < comm->p2pnChannelsPerPeer; c++) {
          int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, c, comm->p2pnChannelsPerPeer, comm->nNodes);
          if (isSendNotRecv) {
-            if (comm->channels[channelId].peers[peer]->send[1].connected == 0) { // P2P uses only 1 connector
+            if (comm->channels[channelId].peers[peer]->send[1].hasSeen == 0) { // P2P uses only 1 connector
+              // The send/recv connector is shared among split shared comms. Set hasSeen to 1 so that
+              // connection setup is not duplicated when the user groups send/recv ops issued on
+              // split shared comms together.
+              comm->channels[channelId].peers[peer]->send[1].hasSeen = 1;
              //comm->connectSend[peer] |= (1UL<<channelId);
 	            comm->connectSend[peer].masks[channelId/64] |= (1UL<<(channelId%64));
              ncclGroupCommPreconnect(comm);
            }
-            if (comm->p2pNet && comm->channels[channelId].peers[peer]->send[NCCL_CONN_IDX_P2P_NET].connected == 0) {
+            if (comm->p2pNet && comm->channels[channelId].peers[peer]->send[NCCL_CONN_IDX_P2P_NET].hasSeen == 0) {
+              comm->channels[channelId].peers[peer]->send[NCCL_CONN_IDX_P2P_NET].hasSeen = 1; // mark the connector that was tested, not connector 1
              //comm->connectSend[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1UL<<channelId);
 	            comm->connectSend[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET].masks[channelId/64] |= (1UL<<(channelId%64));
              ncclGroupCommPreconnect(comm);
            }
          } else {
-            if (comm->channels[channelId].peers[peer]->recv[1].connected == 0) { // P2P uses only 1 connector
+            if (comm->channels[channelId].peers[peer]->recv[1].hasSeen == 0) { // P2P uses only 1 connector
+              comm->channels[channelId].peers[peer]->recv[1].hasSeen = 1;
              //comm->connectRecv[peer] |= (1UL<<channelId);
 	            comm->connectRecv[peer].masks[channelId/64] |= (1UL<<(channelId%64));
              ncclGroupCommPreconnect(comm);
            }
-            if (comm->p2pNet && comm->channels[channelId].peers[peer]->recv[NCCL_CONN_IDX_P2P_NET].connected == 0) {
+            if (comm->p2pNet && comm->channels[channelId].peers[peer]->recv[NCCL_CONN_IDX_P2P_NET].hasSeen == 0) {
+              comm->channels[channelId].peers[peer]->recv[NCCL_CONN_IDX_P2P_NET].hasSeen = 1; // mark the connector that was tested, not connector 1
              //comm->connectRecv[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1UL<<channelId);
 	            comm->connectRecv[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET].masks[channelId/64] |= (1UL<<(channelId%64));
              ncclGroupCommPreconnect(comm);
@@ -2337,7 +2461,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
    } else {
      // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
      ncclGroupCommJoin(info->comm);
-      struct ncclTaskColl* t = ncclMemoryStackAlloc<struct ncclTaskColl>(&comm->memScoped);
+      struct ncclTaskColl* t = ncclMemoryPoolAlloc<struct ncclTaskColl>(&comm->memPool_ncclTaskColl, &comm->memPermanent);
      t->func = info->coll;
      t->sendbuff = info->sendbuff;
      t->recvbuff = info->recvbuff;
@@ -2355,6 +2479,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
      t->opDev = opDev; // C++ struct assignment
      t->chunkSteps = info->chunkSteps;
      t->sliceSteps = info->sliceSteps;
+      t->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED);
      t->opCount = comm->opCount;

      planner->nTasksColl += 1;
@@ -731,7 +731,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
  NCCLCHECKGOTO(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS), ret, fail);

  // Alternate rings to avoid crossing rails
-  if (graphs[NCCL_ALGO_RING]->crossNic && (nChannels % 2) == 0) {
+  if (graphs[NCCL_ALGO_RING]->crossNic == 2 && (nChannels % 2) == 0) {
    for (int r=0; r<comm->nRanks; r++) {
      if (comm->rankToNode[r] % 2 == 1) {
        // Exchange rings
@@ -380,8 +380,8 @@ ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerIn
  if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) return ncclSuccess;
  if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) &&
      (fabricInfo1->cliqueId == fabricInfo2->cliqueId)) {
-    INFO(NCCL_NET, "MNNVL matching peer 0x%lx UUID %lx.%lx cliqueId 0x%x",
-        info2->busId, ((long *)fabricInfo2->clusterUuid)[0], ((long *)fabricInfo2->clusterUuid)[1], fabricInfo2->cliqueId);
+    TRACE(NCCL_NET, "MNNVL matching peer 0x%lx UUID %lx.%lx cliqueId 0x%x",
+         info2->busId, ((long *)fabricInfo2->clusterUuid)[0], ((long *)fabricInfo2->clusterUuid)[1], fabricInfo2->cliqueId);
    *ret = 1;
  }
  return ncclSuccess;
@@ -389,9 +389,12 @@ ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerIn

 NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
 int ncclTopoUserGdrLevel = -1;
+const char* ncclTopoGdrModeStr[ncclTopoGdrModeNum] = { "Disabled", "Default", "PCI" };

-ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t netId, int read, int* useGdr) {
-  *useGdr = 0;
+NCCL_PARAM(NetGdrC2c, "NET_GDR_C2C", 0);
+
+ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t netId, int read, enum ncclTopoGdrMode* gdrMode) {
+  *gdrMode = ncclTopoGdrModeDisable;

  // Get GPU and NET
  int n, g;
@@ -434,7 +437,7 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t n
  else {
    int arch, vendor, model;
    NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
-    if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_TYPE_ROME) {
+    if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_MODEL_AMD_ROME) {
      int i, d1 = -1, d2 = -1;
      for (i = 0; i < system->nodes[CPU].count; i++)
        if (system->nodes[GPU].nodes[g].paths[CPU][i].count == 2) break;
@@ -452,25 +455,37 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t n
  int distance = gpu->paths[NET][n].type;
  if (distance == PATH_PXN) {
    // In case of PXN, use the intermediate GPU distance instead
-    int proxyRank, g;
+    int proxyRank;
    NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank, netId, &proxyRank));
    NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g));
-    struct ncclTopoNode* proxyGpu = system->nodes[GPU].nodes+g;
-    distance = proxyGpu->paths[NET][n].type;
+    gpu = system->nodes[GPU].nodes+g;
+    distance = gpu->paths[NET][n].type;
  }
+
+  int c;
+  NCCLCHECK(ncclGetLocalCpu(system, g, &c));
+  if (ncclParamNetGdrC2c() && distance == PATH_PHB && gpu->paths[CPU][c].type == PATH_C2C) {
+    // On C2C platforms we can still use GDRDMA on NICs connected to the CPUs
+    INFO(NCCL_NET, "GPU %d / HCA %lx connected to CPU %d via C2C link", rank, netId, c);
+    distance = PATH_C2C;
+  }
+
  if (distance > netGdrLevel) {
    INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %d / HCA %lx (distance %d > %d)", rank, netId, distance, netGdrLevel);
    return ncclSuccess;
  }

-  *useGdr = 1;
-  INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %d / HCA %lx (distance %d <= %d), read %d", rank, netId, distance, netGdrLevel, read);
+  // Force PCIe mapping if path goes through PCI on a C2C system
+  if (gpu->paths[CPU][c].type == PATH_C2C && distance != PATH_C2C) *gdrMode = ncclTopoGdrModePci;
+  else *gdrMode = ncclTopoGdrModeDefault;
+
+  INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %d / HCA %lx (distance %d <= %d), read %d mode %s", rank, netId, distance, netGdrLevel, read, ncclTopoGdrModeStr[*gdrMode]);
  return ncclSuccess;
 }

 ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail) {
  int netNum = system->nodes[NET].count;
-  int useGdr = 0;
+  enum ncclTopoGdrMode useGdr = ncclTopoGdrModeDisable;
  *avail = false;
  for (int n = 0; n < netNum; n++) {
    int64_t netId = system->nodes[NET].nodes[n].id;
@@ -492,7 +507,7 @@ ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *a
 NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 0);

 // Determine whether we need to flush the GDR recv buffers
-ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush) {
+ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int64_t netId, int netDev, int rank, int* flush) {
  *flush = 1;
  ncclNetProperties_t props;
  NCCLCHECK(comm->ncclNet->getProperties(netDev, &props));
@@ -506,6 +521,14 @@ ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int*
 #else
  // Flush is required on Ampere and earlier
  if (gpu->gpu.cudaCompCap >= 90) *flush = 0;
+  // On C2C platforms, data could go through a PCI switch while completions and
+  // flags would go through C2C. In that case, force a flush.
+  int c, n;
+  NCCLCHECK(ncclGetLocalCpu(system, g, &c));
+  NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &n));
+  if (gpu->paths[NET][n].type <= PATH_PXB && gpu->paths[CPU][c].type == PATH_C2C) {
+    *flush = 1;
+  }
 #endif
  return ncclSuccess;
 }
@@ -576,7 +599,7 @@ NCCL_PARAM(PxnDisable, "PXN_DISABLE", 1);
 int ncclPxnDisable(struct ncclComm* comm) {
  static int pxnDisable = -1;
  if (pxnDisable == -1) {
-    if (comm && ncclNetVersion(comm) == 4) {
+    if (comm && comm->ncclNetVer == 4) {
      INFO(NCCL_INIT, "PXN Disabled as plugin is v4");
      pxnDisable = 1;
    } else {
@@ -599,9 +622,9 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks,
    int proxyRank;
    NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netId, NULL, &proxyRank));
    if (proxyRank == comm->rank) continue;
-    int useGdr;
+    enum ncclTopoGdrMode useGdr;
    NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->rank, netId, 1, &useGdr));
-    if (useGdr == 0) continue;
+    if (useGdr == ncclTopoGdrModeDisable) continue;
    int found = 0;
    for (int r=0; r<nr; r++) {
      if (ranks[r] == proxyRank) found = 1;
@@ -746,7 +769,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
      }
      if (gpu->paths[NET][n].type < PATH_PHB) {
        // Update path when we dont want to / can't use GPU Direct RDMA.
-        int gdr;
+        enum ncclTopoGdrMode gdr;
        NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].gpu.rank, netNode->id, 0, &gdr));
        if (gdr == 0) {
          // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
@@ -770,7 +793,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
  int myDomain = 0;
  int ngpus = system->nodes[GPU].count;
  int remove = 1;
-  int gdr = 1;
+  enum ncclTopoGdrMode useGdr = ncclTopoGdrModeDefault;
  bool allXgmi = true;
  NCCLCHECK(ncclCalloc(&domains, system->nodes[GPU].count));
  NCCLCHECKGOTO(ncclCalloc(&ids, system->nodes[GPU].count), ret, fail);
@@ -839,10 +862,10 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
  for (int g = 0; g < system->nodes[GPU].count; g++) {
    int64_t netId;
    NCCLCHECKGOTO(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, 0, &netId, nullptr), ret, fail);
-    NCCLCHECKGOTO(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].gpu.rank, netId, 1, &gdr), ret, fail);
-    if (!gdr) break;
+    NCCLCHECKGOTO(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].gpu.rank, netId, 1, &useGdr), ret, fail);
+    if (!useGdr) break;
  }
-  if (gdr && !allXgmi) {
+  if (useGdr && !allXgmi) {
    remove = 0;
    system->type |= RCCL_TOPO_GDR_ALL;
    INFO(NCCL_GRAPH, "GDR is available on all GPUs");
@@ -1014,3 +1037,38 @@ ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* allNvLink
  *allNvLink = maxPath >= PATH_PIX ? 0 : 1;
  return ncclSuccess;
 }
+
+// Check whether we are in a split NVLink situation: exactly two NVLink domains holding more than one GPU each, not
+// connected through NVLink (e.g. QPI). Sets *splitNvLink to 1 in that case and 0 otherwise; it is left untouched if an allocation fails.
+ncclResult_t ncclTopoSplitNvLink(struct ncclTopoSystem* system, int* splitNvLink) {
+  ncclResult_t res = ncclSuccess;
+  int nvlDomains = 0;
+  int *nvlDomain = NULL, *nvlDomainCount = NULL;
+  // Compute NVLink domains: each GPU starts in its own domain, then every later GPU p whose path from g has type PATH_NVL takes g's domain.
+  NCCLCHECKGOTO(ncclCalloc(&nvlDomain, system->nodes[GPU].count), res, exit);
+  for (int g=0; g<system->nodes[GPU].count; g++) nvlDomain[g] = g;
+  for (int g=0; g<system->nodes[GPU].count; g++) {
+    struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+    int domain = nvlDomain[g];
+    for (int p=g+1; p<system->nodes[GPU].count; p++) {
+      if (gpu->paths[GPU][p].type == PATH_NVL) {
+        nvlDomain[p] = domain;
+      }
+    }
+  }
+  // Compute number of GPUs per NVLink domain (indexed by domain id, which is always a GPU index).
+  NCCLCHECKGOTO(ncclCalloc(&nvlDomainCount, system->nodes[GPU].count), res, exit);
+  for (int g=0; g<system->nodes[GPU].count; g++) {
+    nvlDomainCount[nvlDomain[g]]++;
+  }
+  // Count the number of NVLink domains holding more than one GPU; single-GPU domains are ignored.
+  for (int g=0; g<system->nodes[GPU].count; g++) {
+    if (nvlDomainCount[g] > 1) nvlDomains++;
+  }
+  *splitNvLink = nvlDomains == 2 ? 1 : 0;
+
+exit:
+  if(nvlDomain) free(nvlDomain);
+  if(nvlDomainCount) free(nvlDomainCount);
+  return res;
+}
@@ -2075,7 +2075,7 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph*

  for (i = 0; i < sizeof(romeTopoModels)/sizeof(romeTopoModels[0]); i++) {
    bool ignore_cpu = checkOption(romeTopoModels[i].options, "noCpuCheck");
-    if (!ignore_cpu && (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME))
+    if (!ignore_cpu && (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_MODEL_AMD_ROME))
      continue;
    bool ignore_numa = checkOption(romeTopoModels[i].options, "disableNumaMatching");
    if (!ignore_numa && romeTopo.nCpus != romeTopoModels[i].nCpus) continue;
@@ -2225,7 +2225,7 @@ ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
  // only valid on Rome
  int arch, vendor, model;
  NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
-  if (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME)
+  if (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_MODEL_AMD_ROME)
    return ncclSuccess;

  // number of GPUs and NICs on each numa node is used as first screening pattern
@@ -2396,7 +2396,7 @@ ncclResult_t parse4H4P(struct ncclTopoSystem* system, struct ncclTopoGraph* grap
  // only valid on Rome
  int arch, vendor, model;
  NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
-  if (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME)
+  if (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_MODEL_AMD_ROME)
    return ncclSuccess;

  // number of GPUs and NICs on each numa node is used as first screening pattern
@@ -2460,7 +2460,7 @@ ncclResult_t parse4H4P(struct ncclTopoSystem* system, struct ncclTopoGraph* grap
    }
  }
  INFO(NCCL_GRAPH, "%s", line);
-  if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_TYPE_ROME)
+  if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_MODEL_AMD_ROME)
    system->type |= RCCL_TOPO_4P2H_ROME;
  parseOptions(system, rome_model_68.options);
  // create 4P4H based on reference and remapped ids
@@ -483,12 +483,11 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
 // 1. Select NETs starting with those close to GPU(s), based on paths[n].type.
 // 2. add other NETs satisfying typeInter but not already in the list.

-ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) {
+ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCountRet) {
  ncclResult_t ret = ncclSuccess;
  int netCount = 0;
  int localNetCount;
-  int* localNets;
-  NCCLCHECK(ncclCalloc(&localNets, MAXCHANNELS));
+  int localNets[MAXCHANNELS];

  // First add the preferred NICs
  for (int g=0; g<system->nodes[GPU].count; g++) {
@@ -497,8 +496,8 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
    struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
    for (int c = 0; c<MAXCHANNELS; c++) {
      int64_t netId;
-      NCCLCHECKGOTO(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL), ret, fail);
-      NCCLCHECKGOTO(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount), ret, fail);
+      NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL));
+      NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount));
      if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break;
      localNetCount++;
    }
@@ -506,7 +505,7 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
    for (int i=0; i<localNetCount; i++) {
      int n = localNets[i];
      int found = 0;
-      while (nets[found] != n && found<netCount) found++;
+      while (found<netCount && nets[found] != n) found++;
      if (found == netCount) nets[netCount++] = n;
    }
  }
@@ -525,22 +524,17 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
      for (int i=0; i<localNetCount; i++) {
        int n = localNets[i];
        int found = 0;
-        while (nets[found] != n && found<netCount) found++;
+        while (found<netCount && nets[found] != n) found++;
        if (found == netCount) nets[netCount++] = n;
      }
    }
  }

  *netCountRet = netCount;
-exit:
-  free(localNets);
  return ret;
-fail:
-  goto exit;
 }

 ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time) {
-  ncclResult_t ret = ncclSuccess;
  if ((*time) <= 0) return ncclSuccess;
  (*time)--;

@@ -562,7 +556,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
  }
  graph->intra[graph->nChannels*ngpus+step] = gpu->gpu.rank;
  int g = gpu - system->nodes[GPU].nodes;
-  int* nets = NULL;
+  int nets[NCCL_TOPO_MAX_NODES];
  if (step == backToNet) {
    // first get back to NIC
    if (system->nodes[NET].count) {
@@ -570,8 +564,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
      NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex));
      struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex;
      int netCount;
-      NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
-      NCCLCHECKGOTO(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount), ret, fail);
+      NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount));
      for (int i=0; i<netCount; i++) {
        int n = nets[i];
        struct ncclTopoNode* net = system->nodes[NET].nodes+n;
@@ -592,14 +585,14 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
          graph->bwInter /= 2;
        }

-        NCCLCHECKGOTO(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net), ret, fail);
+        NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net));
        graph->bwInter = bwInterSave;
        if (net) {
          graph->inter[graph->nChannels*2+1] = net->id;
-          NCCLCHECKGOTO(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time), ret, fail);
+          NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time));

          if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->bwInter /= 2;
-          NCCLCHECKGOTO(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net), ret, fail);
+          NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net));
          graph->bwInter = bwInterSave;
        }
      }
@@ -638,21 +631,15 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
    // Next path
    NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time));
  }
-exit:
-  if (nets) free(nets);
-  return ret;
-fail:
-  goto exit;
+  return ncclSuccess;
 }

 ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) {
-  ncclResult_t ret = ncclSuccess;
  const int bw = graph->bwInter;
-  int* nets;
-  NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
+  int nets[NCCL_TOPO_MAX_NODES];
  int netCount;
  int graphFound = 0;
-  NCCLCHECKGOTO(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount), ret, fail);
+  NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount));
  for (int i=0; i<netCount; i++) {
    if ((graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) && graphFound) break;
    int n = nets[(graph->nChannels+i)%netCount];
@@ -676,7 +663,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
      // NVLS search only tries to find NIC:GPU combinations to compute the heads.
      if (graph->nChannels < netCount) {
        int gpu;
-        NCCLCHECKGOTO(ncclTopoGetLocalGpu(system, net->id, &gpu), ret, fail);
+        NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu));
        if (gpu != -1) {
          int duplicate = 0;
          // check whether there is duplicate head when one GPU connects with multiple NICs
@@ -687,7 +674,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
            }
          }
          if (!duplicate) {
-            NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu), ret, fail);
+            NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu));
            graphFound = 1;
          }
        }
@@ -696,8 +683,8 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
      if (graph->nChannels > 0) {
        // Try to replay the last channel
        int g;
-        NCCLCHECKGOTO(ncclTopoReplayGetGpu(system, graph, -1, &g), ret, fail);
-        NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g), ret, fail);
+        NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
+        NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
      }
      if (graph->nChannels == 0 || graph->sameChannels == 0) {
        if (graph->nChannels == 0 && system->nodes[NVS].count == 0) {
@@ -708,16 +695,16 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
          for (int i = 0; i<system->nodes[GPU].count; i++) {
            if (paths[i].count <= paths[f].count) {
              // prefer GPU direct RDMA
-              int gdr;
-              NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[i].id, net->id, 0, &gdr));
-              if (paths[i].count < paths[f].count || (paths[i].count == paths[f].count && !f_gdr && gdr)) {
+              enum ncclTopoGdrMode useGdr;
+              NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[i].id, net->id, 0, &useGdr));
+              if (paths[i].count < paths[f].count || (paths[i].count == paths[f].count && !f_gdr && useGdr)) {
                f = i;
-                f_gdr = gdr;
+                f_gdr = useGdr;
              }
            }
          }
          int t = 1 << 10;
-          NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0), ret, fail);
+          NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0));
          if (t == -1) *time = -1;
        }

@@ -737,7 +724,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
          for (int i=0; i<system->nodes[GPU].count; i++) {
            int g = (graph->nChannels+i)%system->nodes[GPU].count;
            if (paths[g].bw == maxBw && paths[g].count == minHops) {
-              NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g), ret, fail);
+              NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
            }
          }
        }
@@ -751,11 +738,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
      }
    }
  }
-exit:
-  free(nets);
-  return ret;
-fail:
-  goto exit;
+  return ncclSuccess;
 }

 /* Search Patterns
@@ -1061,7 +1044,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
        graph->pattern == NCCL_TOPO_PATTERN_RING ? system->hostIdx % 2 : 0));
      int arch, vendor, model;
      NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
-      if (graph->nChannels && arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_TYPE_ROME) {
+      if (graph->nChannels && arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_MODEL_AMD_ROME) {
        system->type |= RCCL_TOPO_4P2H_ROME;
      }
    }
@@ -1107,6 +1090,15 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
    graph->minChannels = graph->maxChannels;
  }

+  int splitNvLink;
+  NCCLCHECK(ncclTopoSplitNvLink(system, &splitNvLink));
+  if (graph->pattern == NCCL_TOPO_PATTERN_RING && splitNvLink) {
+    // We have two sockets with NVLink and a slower link in between (typically QPI).
+    // Tree is likely going to work better but it needs at least 2 channels.
+    // Since Tree needs to have the same number of channels as Ring, also force Ring to use 2 channels.
+    if (graph->maxChannels >= 2 && graph->minChannels == 1) graph->minChannels = 2;
+  }
+
  struct ncclTopoGraph tmpGraph;
  memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph));

@@ -24,11 +24,11 @@

 const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
 #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
-const char* topoLinkTypeStr[] = { "LOC", "XGMI", "",    "PCI",    "",    "",    "", "SYS", "NET" };
-const char* topoPathTypeStr[] = { "LOC", "XGMI", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "DIS" };
+const char* topoLinkTypeStr[] = { "LOC", "XGMI", "",    "C2C", "PCI",    "",    "",    "", "SYS", "NET" };
+const char* topoPathTypeStr[] = { "LOC", "XGMI", "NVB", "C2C", "PIX", "PXB", "PXN", "PHB", "SYS", "DIS" };
 #else
-const char* topoLinkTypeStr[] = { "LOC", "NVL", "",    "PCI",    "",    "",    "", "SYS", "NET" };
-const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "NET", "DIS" };
+const char* topoLinkTypeStr[] = { "LOC", "NVL", "",    "C2C", "PCI",    "",    "",    "", "SYS", "NET" };
+const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "C2C", "PIX", "PXB", "PXN", "PHB", "SYS", "NET", "DIS" };
 #endif

 /******************************************************************/
@@ -51,7 +51,7 @@ ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id)
  return ncclSuccess;
 }

-static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode** cpu) {
+static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode** cpu, struct ncclTopoNode* from) {
  *cpu = NULL;
  if (node->type == CPU) {
    *cpu = node;
@@ -60,9 +60,10 @@ static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode*
  for (int l=0; l<node->nlinks; l++) {
    // Go up the PCI tree to find the CPU. Follow only PCI switches.
    if (node->links[l].type == LINK_PCI
-  && (node->links[l].remNode->type == PCI
-      || node->links[l].remNode->type == CPU)) {
-      NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu));
+	&& node->links[l].remNode != from
+	&& (node->links[l].remNode->type == PCI
+	    || node->links[l].remNode->type == CPU)) {
+      NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu, node));
    }
    if (*cpu != NULL) return ncclSuccess;
  }
@@ -83,13 +84,17 @@ static ncclResult_t ncclTopoGetInterCpuBw(struct ncclTopoNode* cpu, float* bw) {
    return ncclSuccess;
  }
  if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
-    *bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_BW : QPI_BW;
+    *bw =
+      cpu->cpu.model == NCCL_TOPO_CPU_MODEL_INTEL_ERP ? ERP_QPI_BW :
+      cpu->cpu.model == NCCL_TOPO_CPU_MODEL_INTEL_SRP ? SRP_QPI_BW :
+      cpu->cpu.model == NCCL_TOPO_CPU_MODEL_INTEL_SKL ? SKL_QPI_BW :
+      BDW_QPI_BW;
  }
  if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_AMD) {
    *bw = AMD_BW;
  }
  if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
-    *bw = cpu->cpu.model ==  NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_BW : ZPI_BW;
+    *bw = cpu->cpu.model ==  NCCL_TOPO_CPU_MODEL_YONGFENG ? YONGFENG_ZPI_BW : ZPI_BW;
  }
  return ncclSuccess;
 }
@@ -534,19 +539,23 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s
      int familyId, modelId;
      NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
      NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
-      cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW;
+      cpu->cpu.model =
+        (familyId == 6 && modelId >= 0xCF) ? NCCL_TOPO_CPU_MODEL_INTEL_ERP :
+        (familyId == 6 && modelId >= 0x8F) ? NCCL_TOPO_CPU_MODEL_INTEL_SRP :
+        (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_MODEL_INTEL_SKL :
+        NCCL_TOPO_CPU_MODEL_INTEL_BDW;
    } else if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
      int familyId, modelId;
      NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
      NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
-      if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_TYPE_YONGFENG;
+      if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_MODEL_YONGFENG;
    }
    if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_AMD) {
      int familyId, modelId;
      NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
      NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
      // Treat "Milan" also as "Rome"
-      cpu->cpu.model = ((familyId == 143 && modelId >= 49) || familyId == 175) ? NCCL_TOPO_CPU_TYPE_ROME : NCCL_TOPO_CPU_TYPE_ZEN;
+      cpu->cpu.model = ((familyId == 143 && modelId >= 49) || familyId == 175) ? NCCL_TOPO_CPU_MODEL_AMD_ROME : NCCL_TOPO_CPU_MODEL_AMD_ZEN;
    }
  }
  for (int s=0; s<xmlCpu->nSubs; s++) {
@@ -595,7 +604,7 @@ ncclResult_t ncclTopoAddXGMI(struct ncclXmlNode* node, struct ncclTopoSystem* sy
      NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, busId));
    } else if (targetType == CPU) {
      // NVL connection to the local CPU
-      NCCLCHECK(findLocalCpu(gpu, &remote));
+      NCCLCHECK(findLocalCpu(gpu, &remote, NULL));
    } else {
      if (system->nodes[NVS].count == 0) {
        NCCLCHECK(ncclTopoCreateNode(system, &remote, NVS, 0));
@@ -647,7 +656,7 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
      NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, NCCL_TOPO_ID(systemId, busId)));
    } else if (targetType == CPU) {
      // NVL connection to the local CPU
-      NCCLCHECK(findLocalCpu(gpu, &remote));
+      NCCLCHECK(findLocalCpu(gpu, &remote, NULL));
    } else {
      if (system->nodes[NVS].count == 0) {
        NCCLCHECK(ncclTopoCreateNode(system, &remote, NVS, 0));
@@ -725,10 +734,10 @@ ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSystem* sys
    NCCLCHECK(xmlGetAttrInt(node, "bw", &bw));
    double c2cBw = (bw*count)/1000.0;
    struct ncclTopoNode* cpu = NULL;
-    NCCLCHECK(findLocalCpu(gpu, &cpu));
+    NCCLCHECK(findLocalCpu(gpu, &cpu, NULL));
    if (cpu == NULL) return ncclSuccess;
-    NCCLCHECK(ncclTopoConnectNodes(gpu, cpu, LINK_NVL, c2cBw));
-    NCCLCHECK(ncclTopoConnectNodes(cpu, gpu, LINK_NVL, c2cBw));
+    NCCLCHECK(ncclTopoConnectNodes(gpu, cpu, LINK_C2C, c2cBw));
+    NCCLCHECK(ncclTopoConnectNodes(cpu, gpu, LINK_C2C, c2cBw));
  } else {
    if (strcmp(node->name, "cpu") == 0) {
      NCCLCHECK(ncclGetSystemId(system, node, &systemId));
@@ -1048,26 +1057,31 @@ struct ncclXmlNode** physNetNodes, struct ncclXmlNode** netNode, ncclResult_t (*
  // Trigger the merge, then get the new device's properties
  int vDevIndex = 0;
  ncclResult_t ret = makeVDevice(&vDevIndex, vProps);
-  if (ret == ncclInvalidUsage) {
-    WARN("TOPO/NET : Tried merging multiple devices together and failed. Try setting NCCL_NET_MERGE_LEVEL=LOC");
-    NCCLCHECK(ret);
+  if (ret != ncclSuccess) {
+    INFO(NCCL_GRAPH|NCCL_INIT|NCCL_NET, "TOPO/NET : Tried merging multiple devices together and failed. vProps={ndevs=%d, devs=[%d %d %d %d]}. Set NCCL_NET_MERGE_LEVEL=LOC to disable NIC fusion.",
+      vProps->ndevs, vProps->devs[0], vProps->devs[1], vProps->devs[2], vProps->devs[3]);
+    return ret;
  }

  INFO(NCCL_GRAPH, "TOPO/NET : Made vNic %d", vDevIndex);
  return ncclSuccess;
 }

-ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
+ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, const char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
+  ncclResult_t ret = ncclSuccess;
  INFO(NCCL_ENV|NCCL_NET, "TOPO/NET : Force-fusing NICs using NCCL_NET_FORCE_MERGE=%s", str);
+  char* ncStr;
+  NCCLCHECK(ncclCalloc(&ncStr, strlen(str)+1));
+  strcpy(ncStr, str);
  char* semi_token;
-  char* semi = strtok_r(str, ";", &semi_token);
+  char* semi = strtok_r(ncStr, ";", &semi_token);
  while (semi) {
    TRACE(NCCL_NET, "Fusing %s", semi);
    struct netIf userIfs[NCCL_NET_MAX_DEVS_PER_NIC];
    int nUserIfs = parseStringList(semi, userIfs, NCCL_NET_MAX_DEVS_PER_NIC);
    if (nUserIfs == 0) {
      INFO(NCCL_NET, "NET/IB : Invalid NCCL_NET_FORCE_MERGE specified %s. Couldn't parse substring %s. Please provide a semicolon-delimited list of comma-delimited NIC groups.",
-        str, semi);
+        ncStr, semi);
      continue;
    }

@@ -1081,26 +1095,37 @@ ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, char* str,
    if (vProps.ndevs != nUserIfs) {
      WARN("TOPO/NET : Only matched %d devices, %d requested from %s",
        vProps.ndevs, nUserIfs, semi);
-      return ncclInvalidUsage;
+      ret = ncclInvalidUsage;
+      goto fail;
    }

    if (vProps.ndevs > NCCL_NET_MAX_DEVS_PER_NIC) {
      WARN("Specified fused NIC %s which has too many devices (%d). Max %d", semi, vProps.ndevs, NCCL_NET_MAX_DEVS_PER_NIC);
-      return ncclInvalidUsage;
+      ret = ncclInvalidUsage;
+      goto fail;
    }

    struct ncclXmlNode* netNode;
-    NCCLCHECK(ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice));
-
-    // Only set that a device is "placed" after successfully making a vNic (it's possible to exit before this)
-    for (int i = 0; i < vProps.ndevs; i++) {
-      placedDevs[vProps.devs[i]] = 1;
+    ret = ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice);
+    if (ret == ncclSuccess) {
+      // Only set that a device is "placed" after successfully making a vNic (it's possible to exit before this)
+      for (int i = 0; i < vProps.ndevs; i++) {
+        placedDevs[vProps.devs[i]] = 1;
+      }
+    } else {
+      WARN("TOPO/NET : Could not force merge NICs %s. Please specify a valid NCCL_NET_FORCE_MERGE string.", semi);
+      ret = ncclInvalidUsage;
+      goto fail;
    }

    semi = strtok_r(NULL, ";", &semi_token);;
  }

-  return ncclSuccess;
+exit:
+  free(ncStr);
+  return ret;
+fail:
+  goto exit;
 }

 ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLevel, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
@@ -1148,7 +1173,24 @@ ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLe
      }

      struct ncclXmlNode* netNode;
-      NCCLCHECKGOTO(ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice), res, out);
+      ncclResult_t ret = ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice);
+
+      // Merging failed.
+      // Mark all as unplaced and increase their distance to disconnected (PATH_DIS)
+      // Set i to 0 to restart the automatic merging process and ensure all are placed
+      if (ret != ncclSuccess) {
+        INFO(NCCL_GRAPH|NCCL_INIT|NCCL_NET, "Marking physical devices as unplaced, increasing distance and restarting search.");
+        placedDevs[i] = 0;
+        TRACE(NCCL_GRAPH, "Setting dev %d as unplaced, keeping distance -> self as PATH_LOC", i);
+        for (int k = 1; k < vProps.ndevs; k++) {
+          int dev = vProps.devs[k];
+          placedDevs[dev] = 0;
+          paths[i*nPhysDevs + dev] = PATH_DIS;
+          paths[dev*nPhysDevs + i] = PATH_DIS;
+          TRACE(NCCL_GRAPH, "Setting dev %d as unplaced, setting distance -> %d as PATH_DIS", dev, i);
+        }
+        i = 0;
+      }
    }
  }

@@ -1212,16 +1254,16 @@ ncclResult_t ncclTopoMakeVNics(ncclComm_t comm, struct ncclXml* xml, ncclResult_
  // By default, don't merge any devices
  int mergeLevel;
  mergeLevel = PATH_PORT;
-  char* mergeLevelEnv;
-  mergeLevelEnv = getenv("NCCL_NET_MERGE_LEVEL");
-  if (mergeLevelEnv) kvConvertToInt(mergeLevelEnv, &mergeLevel, nicPathKvList);
-  char* forceMerge;
-  forceMerge = getenv("NCCL_NET_FORCE_MERGE");
-  NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs));
-  memset(placedDevs, 0, sizeof(int)*physicalDevs);
+  { // Avoids warnings related to jumping to "out"
+    const char* mergeLevelEnv = ncclGetEnv("NCCL_NET_MERGE_LEVEL");
+    if (mergeLevelEnv) kvConvertToInt(mergeLevelEnv, &mergeLevel, nicPathKvList);
+    const char* forceMerge = ncclGetEnv("NCCL_NET_FORCE_MERGE");
+    NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs));
+    memset(placedDevs, 0, sizeof(int)*physicalDevs);

-  if (forceMerge) {
-    NCCLCHECKGOTO(ncclTopoForceMerge(comm, xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out);
+    if (forceMerge) {
+      NCCLCHECKGOTO(ncclTopoForceMerge(comm, xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out);
+    }
  }
  NCCLCHECKGOTO(ncclTopoAutoMerge(comm, xml, mergeLevel, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out);

@@ -21,9 +21,11 @@
 #define SM86_NVLINK_BW 12.0
 #define SM100_NVLINK_BW 40.0
 #define PCI_BW 12.0           // PCI Gen3 x16
-#define QPI_BW 6.0
 #define AMD_BW 16.0
+#define BDW_QPI_BW 6.0
 #define SKL_QPI_BW 10.0
+#define SRP_QPI_BW 22.0
+#define ERP_QPI_BW 40.0
 #define ZPI_BW 6.0
 #define YONGFENG_ZPI_BW 9.0
 #define P9_BW 32.0
@@ -51,12 +53,13 @@ extern const char* topoNodeTypeStr[];
 #define LINK_LOC 0
 #define LINK_NVL 1
 // Skipping 2 for PATH_NVB
-#define LINK_PCI 3
-// Skipping 4 for PATH_PXB
-// Skipping 5 for PATH_PXN
-// Skipping 6 for PATH_PHB
-#define LINK_SYS 7
-#define LINK_NET 8
+#define LINK_C2C 3
+#define LINK_PCI 4
+// Skipping 5 for PATH_PXB
+// Skipping 6 for PATH_PXN
+// Skipping 7 for PATH_PHB
+#define LINK_SYS 8
+#define LINK_NET 9
 extern const char* topoLinkTypeStr[];

 // Local (myself)
@@ -68,29 +71,32 @@ extern const char* topoLinkTypeStr[];
 // Connection through NVLink using an intermediate GPU
 #define PATH_NVB 2

+// Connection through C2C
+#define PATH_C2C 3
+
 // Connection traversing at most a single PCIe bridge
-#define PATH_PIX 3
+#define PATH_PIX 4

 // Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
-#define PATH_PXB 4
+#define PATH_PXB 5

 // Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations.
-#define PATH_PXN 5
+#define PATH_PXN 6

 // Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
-#define PATH_PHB 6
+#define PATH_PHB 7

 // Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
-#define PATH_SYS 7
+#define PATH_SYS 8

 // Connection through the network
-#define PATH_NET 8
+#define PATH_NET 9

 // New type of path which should precede PATH_PIX
 #define PATH_PORT PATH_NVL

 // Disconnected
-#define PATH_DIS 9
+#define PATH_DIS 10
 extern const char* topoPathTypeStr[];

 struct ncclTopoNode;
@@ -110,9 +116,6 @@ struct ncclTopoLinkList {
  int type;
 };

-#define NCCL_TOPO_CPU_INTEL_BDW 1
-#define NCCL_TOPO_CPU_INTEL_SKL 2
-
 #define NCCL_TOPO_UNDEF (-1)

 #define NCCL_TOPO_ID_LOCAL_ID_MASK 0x00ffffffffffffff
@@ -212,6 +215,7 @@ ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem*
 ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int64_t netId, int* intermediateRank);
 ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* min);
 ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max);
+ncclResult_t ncclTopoSplitNvLink(struct ncclTopoSystem* system, int* splitNvLink);

 #define NCCL_TOPO_XML_MAX_NODES 8192
 #define NCCL_GRAPH_XML_MAX_NODES 8192
@@ -382,6 +382,7 @@ static const double perChMaxTreeBws[][3] = {
 NCCL_PARAM(PatEnable, "PAT_ENABLE", 2);
 static int ncclPatEnable(struct ncclComm* comm) {
  int patEnable = ncclParamPatEnable();
+  if (comm->minCompCap < 60) return 0; // Need SM60 or higher for CUDA atomics
  if (patEnable != 2) return patEnable;
  if (comm->nNodes != comm->nRanks) return 0; // PAT only supports 1 GPU per node
  if (comm->netDeviceType != NCCL_NET_DEVICE_HOST) return 0;   // PAT doesn't support net device offload
@@ -486,7 +487,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
        if (a == NCCL_ALGO_TREE && coll == ncclFuncAllReduce) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), graphs[a]->nChannels*perChMaxTreeLL128Bw);
-        if (a == NCCL_ALGO_TREE && graphs[a]->pattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85;
+        if (a == NCCL_ALGO_TREE && comm->maxTreePattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85;
        if (a == NCCL_ALGO_PAT) busBw *= .75;
        if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0;  // Not used
        if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0;  // Not used
@@ -221,7 +221,6 @@ fail:

 static ncclResult_t doLaunches(struct ncclComm* head) {
  ncclResult_t result = ncclSuccess;
-  struct ncclComm* cliqueComm0 = head->intraComm0;
  struct ncclComm* cliqueHead = head;
  struct ncclComm* cliqueNextHead;
  bool useBarrier = ncclParamLaunchMode == ncclLaunchModeGroup;
@@ -237,7 +236,7 @@ static ncclResult_t doLaunches(struct ncclComm* head) {
      NCCLCHECKGOTO(ncclLaunchPrepare(comm), result, failure);
      if (useBarrier) ncclCommIntraBarrierIn(comm, 1);
      comm = comm->groupNext;
-    } while (comm != nullptr && comm->intraComm0 == cliqueComm0);
+    } while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0);
    cliqueNextHead = comm;

    if (capturingYes && capturingNo) {
@@ -454,38 +453,47 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf

  /* Connect channels at runtime if cumem is supported */
  if (groupCommHeadMain != nullptr) {
-    struct ncclComm* comm = groupCommHeadMain;
+    struct ncclComm* cliqueHead = groupCommHeadMain;
+    struct ncclComm* comm = NULL;
    struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> asyncCollJobs;
    ncclIntruQueueConstruct(&asyncCollJobs);
    do {
-      bool needConnect = false;
-      bool algoNeedConnect[NCCL_NUM_ALGORITHMS];
-      memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS);
+      // We need to preconnect connections for collectives clique by clique to avoid
+      // race condition for split shared comms which can connect the same connections
+      // at the same time.
+      comm = cliqueHead;
+      do {
+        bool needConnect = false;
+        bool algoNeedConnect[NCCL_NUM_ALGORITHMS];
+        memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS);

-      // CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
-      NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail);
+        CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
+        NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail);

-      if (comm->cuMemSupport && needConnect) {
-        struct ncclPreconnectJob* job;
-        NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
-        job->base.func = ncclCollPreconnectFunc;
-        job->base.undo = nullptr;
-        job->base.destructor = free;
-        job->base.state = ncclGroupJobRunning;
-        job->base.abortFlag = comm->abortFlag;
-        job->base.abortFlagDev = comm->abortFlagDev;
-        job->comm = comm;
-        NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail);
-        memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS);
-        ncclIntruQueueEnqueue(&asyncCollJobs, &job->base);
+        if (comm->cuMemSupport && needConnect) {
+          struct ncclPreconnectJob* job;
+          NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
+          job->base.func = ncclCollPreconnectFunc;
+          job->base.undo = nullptr;
+          job->base.destructor = free;
+          job->base.state = ncclGroupJobRunning;
+          job->base.abortFlag = comm->abortFlag;
+          job->base.abortFlagDev = comm->abortFlagDev;
+          job->comm = comm;
+          NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail);
+          memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS);
+          ncclIntruQueueEnqueue(&asyncCollJobs, &job->base);
+        }
+        comm = comm->groupNext;
+      } while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0);
+      // connect
+      NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail);
+      while (!ncclIntruQueueEmpty(&asyncCollJobs)) {
+        struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncCollJobs);
+        if (job->destructor) job->destructor((void*)job);
      }
-      comm = comm->groupNext;
-    } while (comm);
-    NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail);
-    while (!ncclIntruQueueEmpty(&asyncCollJobs)) {
-      struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncCollJobs);
-      if (job->destructor) job->destructor((void*)job);
-    }
+      cliqueHead = comm;
+    } while (cliqueHead != nullptr);

    // done with all buffer allocation, start registration and enqueue
    comm = groupCommHeadMain;
@@ -8,6 +8,7 @@
 #define NCCL_BITOPS_H_

 #include <stdint.h>
+#include <string.h>

 #if !__NVCC__
  #ifndef __host__
@@ -276,13 +277,53 @@ inline __host__ __device__ uint32_t u32fp8Decode(uint8_t x) {
  return u32fpDecode(x, 3);
 }

-inline __host__ __device__ uint64_t getHash(const char* string, int n) {
-  // Based on DJB2a, result = result * 33 ^ char
-  uint64_t result = 5381;
-  for (int c = 0; c < n; c++) {
-    result = ((result << 5) + result) ^ string[c];
+// Folds `size` bytes into the running state acc[2], up to 8 bytes per round. The resulting hash depends on the
+// bytes and on how they are split across eatHash() calls, because every call also mixes in its own `size`.
+inline __host__ __device__ void eatHash(uint64_t acc[2], const void* bytes, size_t size) {
+  char const* ptr = (char const*)bytes;
+  acc[0] ^= size; // with size == 0 this XOR is a no-op and the loop below is skipped, so acc is left unchanged
+  while (size != 0) {
+    // Mix the accumulator bits.
+    acc[0] += acc[1];
+    acc[1] ^= acc[0];
+    acc[0] ^= acc[0] >> 31;
+    acc[0] *= 0x9de62bbc8cef3ce3;
+    acc[1] ^= acc[1] >> 32;
+    acc[1] *= 0x485cd6311b599e79;
+    // Read in a chunk of input.
+    size_t chunkSize = size < sizeof(uint64_t) ? size : sizeof(uint64_t);
+    uint64_t x = 0; // zero-filled so a final chunk shorter than 8 bytes leaves the remaining bytes of x as 0
+    memcpy(&x, ptr, chunkSize); // raw byte copy, so the value of x follows the machine's byte order
+    ptr += chunkSize;
+    size -= chunkSize;
+    // Add to accumulator.
+    acc[0] += x;
   }
-  return result;
+}
+
+template<typename T>
+inline __host__ __device__ void eatHash(uint64_t acc[2], const T* bytes) {
+  eatHash(acc, (const void*)bytes, sizeof(T)); // feeds all sizeof(T) bytes of *bytes in a single call
+}
+
+inline __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) { // Reduces the two-word state to one 64-bit value; acc is only read.
+  uint64_t h = acc[0];
+  h ^= h >> 31;
+  h *= 0xbac3bd562846de6b;
+  h += acc[1]; // the second state word enters after the first shift/multiply round on acc[0]
+  h ^= h >> 32;
+  h *= 0x995a187a14e7b445;
+  return h;
+}
+
+inline __host__ __device__ uint64_t getHash(const void* bytes, size_t size) { // One-shot hash of `size` bytes starting at `bytes`.
+  uint64_t acc[2] = {1, 1}; // fixed initial state
+  eatHash(acc, bytes, size);
+  return digestHash(acc);
+}
+template<typename T>
+inline __host__ __device__ uint64_t getHash(const T* bytes) { // One-shot hash of the sizeof(T) bytes at `bytes`.
+  return getHash((const void*)bytes, sizeof(T));
 }

 #endif
@@ -12,6 +12,7 @@
 #include "nccl.h"
 #include "nccl_common.h"
 #include "device.h"
+
 #define NCCL_MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two.

 // CHUNKSIZE must be a multiple of SLICESIZE
@@ -396,6 +397,42 @@ public:
  ~RingBCAlgorithm() {}
 };

+#if !defined (__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
+// #include <cuda/atomic>
+#endif
+
+// Must be a power of two so that it is evenly divisible by parallelFactor (which is also a power of two).
+#define NCCL_PAT_NWORKERS 512
+
+static constexpr int PatUsed = 0x1,
+                     PatSkipped = 0x2;
+
+struct ncclPatStep {
+  int recvDim, sendDim, recvOffset, sendOffset, stepOffset, postRecv, postSend, nelem, last, flags;
+  size_t inpIx, outIx;
+};
+
+struct ncclPatPeer {
+    uint64_t step;
+    struct ncclConnInfo* conn;
+    struct ncclConnFifo* connFifo;
+    void* buff;
+    uint64_t *headPtr;
+    uint64_t *tailPtr;
+    uint64_t stepCache;
+    long long int accSize;
+    int connStepSize;
+};
+
+#define NCCL_SHMEM_PAT_STEPS 32
+struct ncclPatShmem {
+  struct ncclPatStep patSteps[NCCL_SHMEM_PAT_STEPS];
+  int parallelFactor;
+  long long int localAccSize;
+  struct ncclPatPeer sendDims[32]; // Should cover 2^32 ranks
+  struct ncclPatPeer recvDims[32];
+};
+
 template<typename T>
 class PatRSAlgorithm{
  size_t offset;
@@ -408,18 +445,17 @@ class PatRSAlgorithm{
  int nrPow2;
  int postFreq;
  int lastA;
-
+  int parallelFactor;
  int aggFactor;
  int as; // aggregated steps
  int a; // step inside aggregated step
  int sendSkipped; // number of skipped steps during aggregation
-  int recvSkipped; // number of skipped steps during aggregation
-  int phase2recv;  // receive offset for phase 2
+  int stepOffset;
  int aggDelta;
  int scale;
  int phase;

-  __device__ __host__ int min(int a, int b) {
+  __device__ __host__ ssize_t min(ssize_t a, ssize_t b) {
    return (a<b)?a:b;
  }

@@ -447,16 +483,16 @@ class PatRSAlgorithm{

  __device__ __host__ void resetA() {
    a = 0;
-    sendSkipped = recvSkipped = 0;
+    sendSkipped = stepOffset = 0;
    lastA = aggFactor;
    if (phase >= 2) lastA /= 2*scale;
+    if (phase == 4) lastA = 1;
  }

  __device__ __host__ void reset() {
    nelem = getNelem();
    phase = 0;
    scale = 1;
-    phase2recv = 0;
    as = aggDelta - 1;
    resetA();
  }
@@ -479,8 +515,9 @@ class PatRSAlgorithm{
  }

 public:
-   __device__ __host__ PatRSAlgorithm(int stepSize, int stepDepth, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks):
+   __device__ __host__ PatRSAlgorithm(int stepSize, int stepDepth, int maxParallelFactor, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks):
     offset(offset), end(end), count(count), chunkCount(chunkCount), rank(rank), nranks(nranks) {
+    parallelFactor = maxParallelFactor;
    aggDelta = nrPow2 = (1<<log2Up(nranks));

    aggFactor = 1;
@@ -490,6 +527,7 @@ public:
      aggDelta /= 2;
    }
    postFreq = aggFactor;
+    if (postFreq < parallelFactor) parallelFactor = postFreq;
    int d = stepDepth;
    while (d > 1 && aggFactor < nranks/2) {
      d /= 2;
@@ -500,160 +538,151 @@ public:
    reset();
  }

-  __device__ __host__ void getNextOp(int &recvDim, int &sendDim, size_t &inpIx, size_t &outIx, int &recvOffset, int &sendOffset, int &sendStepOffset, int &nelemOut, int &postRecv, int &postSend, int &last) {
-restart:
-    last = 0;
-    nelemOut = nelem;
-    outIx = offset;
+  __device__ __host__ int getParallelFactor() {
+    return parallelFactor; // set in the constructor from maxParallelFactor, lowered to postFreq when postFreq is smaller
+  }
+
+  __device__ __host__ void getNextOp(struct ncclPatStep* ps) {
+    ps->last = 0;
+    ps->nelem = nelem;
+    ps->outIx = offset;
+    ps->stepOffset = stepOffset;
    int skip = 0;
-    //printf("Phase %d as %d/%d a %d/%d scale %d\n", phase, as, aggDelta, a, lastA, scale);
-    if (phase == 0) {
+    if (a >= lastA) {
+      skip = 1;
+    } else if (phase == 0) {
      int s = mirrorInvert(a, lastA)*aggDelta + as;
      if (s >= nranks) skip = 1;
      int sendDataRank = (rank + s) % nranks;
-      inpIx = sendDataRank * count + offset;
-      recvDim = -1;
-      sendDim = 0;
-      outIx = 0;
-      recvOffset = -1;
-      sendOffset = ((a - sendSkipped)%postFreq) * nelem;
-      sendStepOffset = 0;
-      if ((((a - sendSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) {
-        postSend = 1;
+      ps->inpIx = sendDataRank * count + offset;
+      ps->recvDim = -1;
+      ps->sendDim = 0;
+      ps->outIx = 0;
+      ps->recvOffset = -1;
+      ps->sendOffset = (a%postFreq) * nelem;
+      if (((a%postFreq) + 1 >= postFreq) || (a == lastA-1)) {
+        ps->postSend = 1;
      } else {
-        postSend = 0;
+        ps->postSend = 0;
      }
-      postRecv = 0;
-      if (skip) sendSkipped++;
-      if (++a == lastA) {
-        phase = as == 1 ? (aggFactor > 1 ? 2 : 4) : 1; // If as == 1, switch to phase 2
-        resetA();
-      }
-      if (skip == 0) return;
+      ps->postRecv = 0;
    } else if (phase == 1) {
      int s = mirrorInvert(a, lastA)*aggDelta + as;
      if (s >= nranks) skip = 1;
-      recvDim = firstBitSet(s, nrPow2);
-      sendOffset = ((a - sendSkipped)%postFreq)*nelem;
-      recvOffset = ((a - recvSkipped)%postFreq)*nelem;
-      postSend = 0;
-      if (recvDim == 0) {
-        if ((((a - sendSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) postSend = 1;
-        sendStepOffset = 0;
+      ps->recvDim = firstBitSet(s, nrPow2);
+      ps->sendOffset = (a%postFreq)*nelem;
+      ps->recvOffset = (a%postFreq)*nelem;
+      ps->postSend = 0;
+      if (ps->recvDim == 0 && (((a%postFreq) + 1 >= postFreq) || (a == lastA-1))) ps->postSend = 1;
+      if (((a%postFreq) + 1 >= postFreq) || (a == lastA-1)) {
+        ps->postRecv = 1;
      } else {
-        sendStepOffset = (a - sendSkipped)/postFreq;
+        ps->postRecv = 0;
      }
-      if ((((a - recvSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) {
-        postRecv = 1;
-      } else {
-        postRecv = 0;
-      }
-      s -= (1<<recvDim);
+      s -= (1<<ps->recvDim);
      int recvDataRank = (rank + nranks + s) % nranks;
-      inpIx = recvDataRank * count + offset;
-      sendDim = s ? firstBitSet(s, nrPow2) : -1;
-      if (sendDim == -1) {
-        sendOffset = -1;
-        sendStepOffset = 0;
-      } else if (as - (1<<recvDim) == 0) {
-        if (newPeer(a, aggFactor)) sendSkipped = a;
+      ps->inpIx = recvDataRank * count + offset;
+      ps->sendDim = s ? firstBitSet(s, nrPow2) : -1;
+      if (ps->sendDim == -1) {
+        ps->sendOffset = -1;
+      } else if (as - (1<<ps->recvDim) == 0) {
+        if (newPeer(a, aggFactor)) { sendSkipped = a; ps->stepOffset = stepOffset = 0; }
        int foffset = a - sendSkipped;
-        sendStepOffset = recvDim == 0 ? 0 : foffset/postFreq;
-        sendOffset = (foffset%postFreq)*nelem;
+        ps->sendOffset = (foffset%postFreq)*nelem;
      }
+      int recvDim = ps->recvDim;
      if (s < nranks && skip) {
-        recvDim = -1;
-        recvOffset = -1;
-        postRecv = 0;
+        ps->recvDim = -1;
+        ps->recvOffset = -1;
+        ps->postRecv = 0;
        skip = 0;
      }
-      if (skip || recvDim == -1) recvSkipped++;
-      if (skip) sendSkipped++;
-      if (++a == lastA) {
-        as--;
-        phase = as % 2 == 1 ? 0 : 1;
-        resetA();
-      }
-      if (skip == 0) return;
+      if (recvDim > 0 && (((a-sendSkipped)%postFreq) + 1 >= postFreq) && skip == 0) stepOffset++;
    } else if (phase == 2) {
      int s = (2*mirrorInvert(a, lastA)+1)*scale*aggDelta + 1;
-      postRecv = 0;
+      ps->postRecv = 0;
      if (s >= nranks) skip = 1;
-      recvDim = 0;
-      postSend = a == lastA-1 ? 1 : 0;
+      ps->recvDim = 0;
+      ps->postSend = a == lastA-1 ? 1 : 0;
      s -= 1;
      if (s < nranks && skip) {
-        recvDim = -1;
-        recvOffset = -1;
+        ps->recvDim = -1;
+        ps->recvOffset = -1;
        skip = 0;
      } else if (!skip) {
-        int foffset = phase2recv;
-        phase2recv++;
-        postRecv |= ((foffset+1)%postFreq) == 0 ? 1 : 0;
-        recvOffset = (foffset%postFreq) * nelem;
+        int foffset = a + aggFactor - aggFactor/scale;
+        ps->postRecv |= ((foffset+1)%postFreq) == 0 ? 1 : 0;
+        ps->recvOffset = (foffset%postFreq) * nelem;
      }
      int recvDataRank = (rank + nranks + s) % nranks;
-      inpIx = recvDataRank * count + offset;
-      sendDim = s ? firstBitSet(s, nrPow2) : -1;
-      int foffset = a - sendSkipped;
-      postSend |= ((foffset+1)%postFreq) == 0 ? 1 : 0;
-      sendStepOffset = 0;
-      sendOffset = (foffset%postFreq) * nelem;
-      if (skip || sendDim == -1) sendSkipped++;
-      if (++a == lastA) {
-        phase = 3;
-        resetA();
-      }
-      if (skip == 0) return;
+      ps->inpIx = recvDataRank * count + offset;
+      ps->sendDim = s ? firstBitSet(s, nrPow2) : -1;
+      int foffset = a;
+      ps->postSend |= ((foffset+1)%postFreq) == 0 ? 1 : 0;
+      ps->sendOffset = (foffset%postFreq) * nelem;
    } else if (phase == 3) {
      int s = (2*mirrorInvert(a, lastA)+1)*scale*aggDelta;
-      postRecv = a == lastA-1 ? 1 : 0;
+      ps->postRecv = a == lastA-1 ? 1 : 0;
      if (s >= nranks) skip = 1;
-      recvDim = firstBitSet(s, nrPow2);
-      postSend = 0;
-      s -= (1<<recvDim);
-      int foffset = a - recvSkipped;
-      postRecv |= (foffset+1)%postFreq == 0 ? 1 : 0;
-      recvOffset = (foffset%postFreq) * nelem;
+      ps->recvDim = firstBitSet(s, nrPow2);
+      ps->postSend = 0;
+      s -= (1<<ps->recvDim);
+      int foffset = a;
+      ps->postRecv |= (foffset+1)%postFreq == 0 ? 1 : 0;
+      ps->recvOffset = (foffset%postFreq) * nelem;
      int recvDataRank = (rank + nranks + s) % nranks;
-      inpIx = recvDataRank * count + offset;
-      sendDim = s ? firstBitSet(s, nrPow2) : -1;
+      ps->inpIx = recvDataRank * count + offset;
+      ps->sendDim = s ? firstBitSet(s, nrPow2) : -1;
      if (s < nranks && skip) {
-        recvDim = -1;
-        recvOffset = -1;
-        postRecv = 0;
+        ps->recvDim = -1;
+        ps->recvOffset = -1;
+        ps->postRecv = 0;
        skip = 0;
      }
-      if (newPeer(a, aggFactor/(2*scale))) sendSkipped = a;
+      if (newPeer(a, aggFactor/(2*scale))) { sendSkipped = a; ps->stepOffset = stepOffset = 0; }
      foffset = a - sendSkipped;
-      sendStepOffset = foffset / postFreq; // Accumulate on next steps
-      sendOffset = sendDim >= 0 ? (foffset%postFreq) * nelem : -1;
-      if (skip || recvDim == -1) recvSkipped++;
-      if (skip) sendSkipped++;
-      if (++a == lastA) {
-        scale *= 2;
-        phase = scale < aggFactor ? 2 : 4;
+      if ((foffset%postFreq) + 1 >= postFreq && skip == 0) stepOffset++;
+      ps->sendOffset = ps->sendDim >= 0 ? (foffset%postFreq) * nelem : -1;
+    } else if (phase == 4) {
+      ps->recvDim = 0;
+      ps->sendDim = -1;
+      ps->inpIx = rank * count + offset;
+      ps->recvOffset = ((aggFactor-1)%postFreq) * nelem;
+      ps->sendOffset = -1;
+      ps->postRecv = 1;
+      ps->postSend = 0;
+      offset += chunkCount;
+    }
+    a++;
+    if (a >= lastA && a >= parallelFactor) {
+      int p = phase;
+      if (p == 1) as--;
+      if (p == 3) scale *= 2;
+      phase =
+        p == 0 ? as == 1 ? (aggFactor > 1 ? 2 : 4) : 1 :
+        p == 1 ? as % 2 == 1 ? 0 : 1 :
+        p == 2 ? 3 :
+        p == 3 ? scale < aggFactor ? 2 : 4 :
+        5;
+      if (p == 4) {
+        if (offset >= end) {
+          ps->last = 2;
+        } else {
+          reset();
+        }
+      } else {
        resetA();
      }
-      if (skip == 0) return;
-    } else if (phase == 4) {
-      recvDim = 0;
-      sendDim = -1;
-      inpIx = rank * count + offset;
-      recvOffset = (phase2recv%postFreq) * nelem;
-      sendStepOffset = 0;
-      sendOffset = -1;
-      postRecv = 1;
-      postSend = 0;
-      offset += chunkCount;
-      if (offset >= end) {
-        last = 1;
-      } else {
-        reset();
-      }
-      return;
+    } else if (phase == 4 && offset >= end) {
+      ps->last = 1;
    }
-    goto restart;
+    int flags = PatUsed | (skip ? PatSkipped : 0);
+#if __CUDA_ARCH__ >= 600
+    cuda::atomic_ref<int, cuda::thread_scope_block> a(ps->flags);
+    a.store(flags, cuda::memory_order_release);
+#else
+    ps->flags = flags;
+#endif
  }
 };

@@ -669,14 +698,12 @@ class PatAGAlgorithm{
  int nrPow2;
  int postFreq;
  int lastA;
-
+  int parallelFactor;
  int aggFactor;
  int as; // aggregated steps
  int a; // step inside aggregated step
  int aggDelta;
-
  int scale;
-
  int phase;

  // AS computation
@@ -685,7 +712,7 @@ class PatAGAlgorithm{
  int bitCount[32];
  int bitZeroStep[32];

-  __device__ __host__ int min(int a, int b) {
+  __device__ __host__ ssize_t min(ssize_t a, ssize_t b) {
    return (a<b)?a:b;
  }

@@ -752,8 +779,9 @@ class PatAGAlgorithm{


 public:
-   __device__ __host__ PatAGAlgorithm(int stepSize, int stepDepth, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks):
+   __device__ __host__ PatAGAlgorithm(int stepSize, int stepDepth, int maxParallelFactor, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks):
     offset(offset), end(end), count(count), chunkCount(chunkCount), rank(rank), nranks(nranks) {
+    parallelFactor = maxParallelFactor;
    aggDelta = nrPow2 = (1<<log2Up(nranks));

    aggFactor = 1;
@@ -763,120 +791,120 @@ public:
      aggDelta /= 2;
    }
    postFreq = aggFactor;
+    if (postFreq < parallelFactor) parallelFactor = postFreq;
    int d = stepDepth;
    while (d > 1 && aggFactor < nranks/2) {
      d /= 2;
      aggFactor *= 2;
      aggDelta /= 2;
    }
-    //printf("AggFactor %d PostFreq %d AggDelta %d\n", aggFactor, postFreq, aggDelta);

    asDim = log2Up(aggDelta);
    reset();
  }

-  __device__ __host__ void getNextOp(int &recvDim, int &sendDim, size_t &inpIx, size_t &outIx, int &recvOffset, int &sendOffset, int &recvStepOffset, int &nelemOut, int &postRecv, int &postSend, int &last) {
-restart:
-    //printf("Phase %d as %d/%d a %d/%d scale %d\n", phase, as, aggDelta, a, lastA, scale);
-    last = 0;
-    nelemOut = nelem;
-    inpIx = offset;
+  __device__ __host__ int getParallelFactor() {
+    return parallelFactor; // set in the constructor from maxParallelFactor, lowered to postFreq when postFreq is smaller
+  }
+
+  __device__ __host__ void getNextOp(struct ncclPatStep* ps) {
+    ps->last = 0;
+    ps->nelem = nelem;
+    ps->inpIx = offset;
    int skip = 0;
-    if (phase == 0) {
+    if (a >= lastA) {
+      skip = 1;
+    } else if (phase == 0) {
      int s = a*aggDelta + as;
      if (s >= nranks) skip = 1;
-      int nextSkip = (a+1)*aggDelta + as >= nranks ? 1 : 0;
      int recvDataRank = (rank + s) % nranks;
-      outIx = recvDataRank * count + offset;
-      sendDim = -1;
-      recvDim = 0;
-      inpIx = 0;
-      sendOffset = -1;
-      recvOffset = (a % postFreq) * nelem;
-      recvStepOffset = 0;
-      postRecv = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0;
-      postSend = 0;
-      a++;
-      if (nextSkip) {
-        as = nextAs();
-        if (as == aggDelta/2) {
-          offset += chunkCount;
-          if (offset >= end) {
-            last = 1;
-          } else {
-            reset();
-          }
-          return;
-        }
-        phase = 1;
-        resetA();
-      }
-      if (skip == 0) return;
+      ps->outIx = recvDataRank * count + offset;
+      ps->sendDim = -1;
+      ps->recvDim = 0;
+      ps->inpIx = 0;
+      ps->sendOffset = -1;
+      ps->recvOffset = (a % postFreq) * nelem;
+      ps->stepOffset = 0;
+      ps->postRecv = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0;
+      ps->postSend = 0;
   } else if (phase == 1) {
      int s = a*aggDelta + as;
      if (s >= nranks) skip = 1;
-      sendDim = firstBitSet(s, nrPow2);
-      s -= (1<<sendDim);
+      ps->sendDim = firstBitSet(s, nrPow2);
+      s -= (1<<ps->sendDim);
      int sendDataRank = (rank + nranks + s) % nranks;
-      outIx = sendDataRank * count + offset;
-      recvDim = s ? firstBitSet(s, nrPow2) : -1;
-      sendOffset = recvOffset = (a % postFreq) * nelem;
-      postSend = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0;
-      postRecv = (sendDim == 0) && ((a % postFreq == postFreq-1) || ((a+1)*aggDelta+as-1 >= nranks)) ? 1 : 0;
-      recvStepOffset = (sendDim == 0) ? 0 : a/postFreq;
-      if (recvDim == -1) {
-        recvOffset = -1;
-        postRecv = 0;
-      } else if (as - (1<<sendDim) == 0) {
-        int foffset = (a*aggDelta) >> (recvDim+1);
-        recvOffset = (foffset%postFreq)*nelem;
-        postRecv = (sendDim == 0) && ((foffset % postFreq == postFreq-1) || ((((foffset+1)*2)+1)<<recvDim) >= nranks) ? 1 : 0;
-        recvStepOffset = (sendDim == 0) ? 0 : foffset/postFreq;
+      ps->outIx = sendDataRank * count + offset;
+      ps->recvDim = s ? firstBitSet(s, nrPow2) : -1;
+      ps->sendOffset = ps->recvOffset = (a % postFreq) * nelem;
+      ps->postSend = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0;
+      ps->postRecv = (ps->sendDim == 0) && ((a % postFreq == postFreq-1) || ((a+1)*aggDelta+as-1 >= nranks)) ? 1 : 0;
+      ps->stepOffset = (ps->sendDim == 0) ? 0 : a/postFreq;
+      if (ps->recvDim == -1) {
+        ps->recvOffset = -1;
+        ps->postRecv = 0;
+      } else if (as - (1<<ps->sendDim) == 0) {
+        int foffset = (a*aggDelta) >> (ps->recvDim+1);
+        ps->recvOffset = (foffset%postFreq)*nelem;
+        ps->postRecv = (ps->sendDim == 0) && ((foffset % postFreq == postFreq-1) || ((((foffset+1)*2)+1)<<ps->recvDim) >= nranks) ? 1 : 0;
+        ps->stepOffset = (ps->sendDim == 0) ? 0 : foffset/postFreq;
      }
-      if (s < nranks && sendDim == 0 && skip) {
+      if (s < nranks && ps->sendDim == 0 && skip) {
        // Don't forget to receive at least once even if we don't send afterwards
-        sendDim = -1;
-        sendOffset = -1;
-        postSend = 0;
+        ps->sendDim = -1;
+        ps->sendOffset = -1;
+        ps->postSend = 0;
        skip = 0;
      }
-      if (++a == lastA) {
-        if (as % 2 == 1) {
-          phase = 0;
-        } else {
-          as = nextAs();
-        }
-        resetA();
-      }
-      if (skip == 0) return;
    } else if (phase == 2) {
      int s = (2*a+1)*scale*aggDelta;
-      postSend = (a % postFreq == postFreq-1) || ((2*(a+1)+1)*scale*aggDelta >= nranks) ? 1 : 0;
-      postRecv = 0;
+      ps->postSend = (a % postFreq == postFreq-1) || ((2*(a+1)+1)*scale*aggDelta >= nranks) ? 1 : 0;
+      ps->postRecv = 0;
      if (s >= nranks) skip = 1;
-      sendDim = firstBitSet(s, nrPow2);
-      s -= (1<<sendDim);
-      sendOffset = (a%postFreq) * nelem;
-      recvStepOffset = a / postFreq;
+      ps->sendDim = firstBitSet(s, nrPow2);
+      s -= (1<<ps->sendDim);
+      ps->sendOffset = (a%postFreq) * nelem;
+      ps->stepOffset = a / postFreq;
      int sendDataRank = (rank + nranks + s) % nranks;
-      outIx = sendDataRank * count + offset;
-      recvDim = s ? firstBitSet(s, nrPow2) : -1;
-      if (recvDim == -1) {
-        recvOffset = -1;
+      ps->outIx = sendDataRank * count + offset;
+      ps->recvDim = s ? firstBitSet(s, nrPow2) : -1;
+      if (ps->recvDim == -1) {
+        ps->recvOffset = -1;
      } else {
-        s -= (1<<recvDim);
-        int foffset = (a*2*scale*aggDelta) >> (recvDim+1);
-        recvOffset = (foffset%postFreq)*nelem;
-        recvStepOffset = foffset / postFreq;
+        s -= (1<<ps->recvDim);
+        int foffset = (a*2*scale*aggDelta) >> (ps->recvDim+1);
+        ps->recvOffset = (foffset%postFreq)*nelem;
+        ps->stepOffset = foffset / postFreq;
      }
-      if (++a == lastA) {
-        scale /= 2;
-        phase = scale ? 2 : 1;
+    }
+    a++;
+    if (a >= lastA && a >= parallelFactor) {
+      int p = phase;
+      if (p == 2) scale /= 2;
+      phase =
+        p == 2 ? scale ? 2 : 1 :
+        p == 1 ? as % 2 == 1 ? 0 : 1 :
+        1;
+      if (p == 0 || (p == 1 && as % 2 == 0)) as = nextAs();
+      if (p == 0 && as == aggDelta/2) {
+        offset += chunkCount;
+        if (offset >= end) {
+          ps->last = 2;
+        } else {
+          reset();
+        }
+      } else {
        resetA();
      }
-      if (skip == 0) return;
+    } else if (phase == 0 && as == 1 && offset + chunkCount >= end && a-1 >= ((lastA-1) / parallelFactor) * parallelFactor) {
+      ps->last = 1;
    }
-    goto restart;
+    int flags = PatUsed | (skip ? PatSkipped : 0);
+#if __CUDA_ARCH__ >= 600
+    cuda::atomic_ref<int, cuda::thread_scope_block> a(ps->flags);
+    a.store(flags, cuda::memory_order_release);
+#else
+    ps->flags = flags;
+#endif
  }
 };
 #endif
@@ -139,6 +139,9 @@ struct ncclSharedResources {
  int* tpRankToLocalRank;
  // Internal streams
  struct ncclStrongStream deviceStream, hostStream;
+  int noncapturedRefs; // number of non-captured hostStreamPlanCallback on the stream
+  int persistentRefs;
+  cudaEvent_t launchEvent, scratchEvent;

  /* proxy related shared res */
  struct ncclProxyState* proxyState;
@@ -437,6 +440,7 @@ struct ncclComm {
  // List of destructors to run when comm is destructed
  struct ncclDestructor* destructorHead;

+  struct ncclCudaContext* context;
  struct ncclSharedResources* sharedRes;
  /* map to top parent ranks. */
  int* topParentRanks;
@@ -449,6 +453,7 @@ struct ncclComm {

  int netPluginLoaded;
  ncclNet_t* ncclNet;
+  int ncclNetVer;
  ncclNetDeviceType netDeviceType;
  ncclCollNet_t* ncclCollNet;
  void* bootstrap;
@@ -456,6 +461,7 @@ struct ncclComm {
  struct channelMasks* connectSend;
  struct channelMasks* connectRecv;
  struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS];
+  int maxTreePattern;
  bool initAlgoChannels[NCCL_NUM_ALGORITHMS];
  bool runtimeConn; // if dynamic connection is supported
  bool directMode;
@@ -603,8 +609,7 @@ struct ncclComm {
  struct ncclComm* groupNext;
  // Subset of those in groupNext list. Holds 0x1 if not needing preconnect.
  struct ncclComm* preconnectNext;
-  int persistentRefs; // number of persistent plan-lists capturing this comm
-  int noncapturedRefs; // number of non-captured hostStreamPlanCallback on the stream
+  int localPersistentRefs; // number of persistent plan-lists capturing this comm
  struct P2pSchedulePair { int sendRank; int recvRank; } *p2pSchedule;

  struct ncclKernelPlanner planner;
@@ -669,6 +674,7 @@ struct ncclComm {
  // Profiler plugin
  void* profilerContext;
  uint64_t seqNumber[NCCL_NUM_FUNCTIONS];
+  struct ncclProfilerProxy profiler;

  // buffer registration cache
  struct ncclRegCache regCache;
@@ -163,6 +163,7 @@ struct ncclProxyConnector {

 struct ncclConnector {
  int connected;
+  int hasSeen;
  struct ncclProxyConnector proxyConn;
  struct ncclTransportComm* transportComm;
  void* transportResources;
@@ -256,6 +257,8 @@ struct alignas(16) ncclDevWorkP2p {
  uint8_t sendNetReg:1, recvNetReg:1;
  uint8_t sendIpcReg:1, recvIpcReg:1;

+  uint8_t profilerEnabled:1;
+
  uint8_t sendConnIndex:2, recvConnIndex:2;
 };

@@ -304,7 +307,7 @@ struct alignas(16) ncclDevWorkColl {
  uint32_t nWarps:8;
  uint32_t redOpArgIsPtr:1, regUsed:1, netRegUsed:1, oneNode:1, direct:2, isOneRPN:1, rcclUseOneSlice:1;
  uint32_t root:30, connIndex:2;
-  uint16_t pivotA2ANumBiRings;
+  uint16_t pivotA2ANumBiRings:15, profilerEnabled:1;
  void* recvbuff;
  void* sendbuff;
  uintptr_t sendbuffOffset;
@@ -498,6 +501,7 @@ struct alignas(16) ncclDevChannel {
  struct ncclTree binTree;
  struct ncclNvls nvls;
  uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed
+  uint64_t workCounter;
 };

 struct ncclDevComm {
@@ -523,6 +527,10 @@ struct ncclDevComm {

  int* rankToLocalRank;

+  // Profiler counters
+  uint64_t* workStarted/*[MAXCHANNELS]*/;
+  uint64_t* workCompleted/*[MAXCHANNELS]*/;
+
 #if defined(ENABLE_NPKIT)
  NpKitEventCollectContext* npKitEventCollectContexts;
  uint64_t* cpuTimestamp;
@@ -621,7 +629,7 @@ __host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int

 __host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) {
  // Our collective unroll should move to the same bytes&insns model as NVLS.
-  return cudaArch >= 800 ? 8 : 4;
+  return cudaArch >= 800 ? (cudaArch == 1200 ? 6 : 8) : 4;
 }

 __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; }
@@ -37,17 +37,24 @@ ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm);
 ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank);
 ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank);
 ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret);
-ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int rank, int64_t netId, int read, int* useGdr);
-#define MAX_XGMI_INTER_GPUS 4
-ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int type, int64_t* id, int* dev);
-ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter=MAX_XGMI_INTER_GPUS, int nInter=0, int *inter=nullptr);
-ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush);
+enum ncclTopoGdrMode {
+  ncclTopoGdrModeDisable = 0,
+  ncclTopoGdrModeDefault = 1,
+  ncclTopoGdrModePci = 2,
+  ncclTopoGdrModeNum = 3
+};
+ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int rank, int64_t netId, int read, enum ncclTopoGdrMode* gdrMode);
+ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int64_t netId, int netDev, int rank, int* flush);
 ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail);
 ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net);
 int ncclPxnDisable(struct ncclComm* comm);
 ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
 ncclResult_t ncclGetLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu);

+#define MAX_XGMI_INTER_GPUS 4
+ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int type, int64_t* id, int* dev);
+ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter=MAX_XGMI_INTER_GPUS, int nInter=0, int *inter=nullptr);
+
 // Find CPU affinity
 ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity);

@@ -59,11 +66,13 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
 #define NCCL_TOPO_CPU_VENDOR_AMD 2
 #define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3
 #define NCCL_TOPO_CPU_VENDOR_MIXED 4
-#define NCCL_TOPO_CPU_TYPE_BDW 1
-#define NCCL_TOPO_CPU_TYPE_SKL 2
-#define NCCL_TOPO_CPU_TYPE_ZEN 3
-#define NCCL_TOPO_CPU_TYPE_ROME 4
-#define NCCL_TOPO_CPU_TYPE_YONGFENG 1
+#define NCCL_TOPO_CPU_MODEL_INTEL_BDW 1
+#define NCCL_TOPO_CPU_MODEL_INTEL_SKL 2
+#define NCCL_TOPO_CPU_MODEL_INTEL_SRP 3
+#define NCCL_TOPO_CPU_MODEL_INTEL_ERP 4
+#define NCCL_TOPO_CPU_MODEL_AMD_ZEN 5
+#define NCCL_TOPO_CPU_MODEL_AMD_ROME 6
+#define NCCL_TOPO_CPU_MODEL_YONGFENG 1
 ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
 ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count);
 ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
@@ -108,6 +108,12 @@ inline void ncclGroupCommJoin(struct ncclComm* comm) {
    struct ncclComm** pp = &ncclGroupCommHead;
    while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0)
      pp = &(*pp)->groupNext;
+
+    // Didn't find its clique: insert the comm in ascending commHash order instead.
+    if (*pp == nullptr) {
+      pp = &ncclGroupCommHead;
+      while (*pp != nullptr && (*pp)->commHash < comm->commHash) pp = &(*pp)->groupNext;
+    }
    comm->groupNext = *pp;
    *pp = comm;
    // Comms gets a new memory stack scope upon joining. Each task batched for
@@ -1,610 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef NCCL_NET_H_
-#define NCCL_NET_H_
-
-#include "nccl.h"
-#include "nccl_common.h"
-#include "net_device.h"
-#include <stdint.h>
-
-#define NCCL_NET_HANDLE_MAXSIZE 128
-//Maximum value NCCL can accept for maxP2pBytes and maxCollBytes net properties
-#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L)
-#define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1
-
-#define NCCL_PTR_HOST 0x1
-#define NCCL_PTR_CUDA 0x2
-#define NCCL_PTR_DMABUF 0x4
-
-// Maximum number of requests per comm object
-#define NCCL_NET_MAX_REQUESTS 32
-
-// Max number of ncclNet objects which can live in the same process
-#define NCCL_NET_MAX_PLUGINS 3
-
-#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4
-#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V9
-
-typedef struct {
-  int ndevs;
-  int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9];
-} ncclNetVDeviceProps_v9_t;
-typedef ncclNetVDeviceProps_v9_t ncclNetVDeviceProps_t;
-
-typedef struct {
-  char* name;                      // Used mostly for logging.
-  char* pciPath;                   // Path to the PCI device in /sys.
-  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
-                                   // cards with multiple PCI functions (Physical or virtual).
-  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
-  int regIsGlobal;                 // regMr is not tied to a particular comm
-  int forceFlush;                  // Force a flush on receives
-  int speed;                       // Port speed in Mbps.
-  int port;                        // Port number.
-  float latency;                   // Network latency
-  int maxComms;                    // Maximum number of comms we can create
-  int maxRecvs;                    // Maximum number of grouped receives.
-  ncclNetDeviceType netDeviceType; // Network offload type
-  int netDeviceVersion;            // Version number for network offload
-  ncclNetVDeviceProps_v9_t vProps;
-  size_t maxP2pBytes;              // Max transfer size for point-to-point operations
-  size_t maxCollBytes;             // Max transfer size for collective operations
-} ncclNetProperties_v9_t;
-typedef ncclNetProperties_v9_t ncclNetProperties_t;
-
-typedef struct {
-  // Name of the network (mainly for logs)
-  const char* name;
-  // Initialize the network.
-  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
-  // Return the number of adapters.
-  ncclResult_t (*devices)(int* ndev);
-  // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
-  // Create a receiving object and provide a handle to connect to it. The
-  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
-  // between ranks to create a connection.
-  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
-  // Connect to a handle and return a sending comm object for that peer.
-  // This call must not block for the connection to be established, and instead
-  // should return successfully with sendComm == NULL with the expectation that
-  // it will be called again until sendComm != NULL.
-  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
-  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
-  // Finalize connection establishment after remote peer has called connect.
-  // This call must not block for the connection to be established, and instead
-  // should return successfully with recvComm == NULL with the expectation that
-  // it will be called again until recvComm != NULL.
-  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
-  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
-  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
-  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
-  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
-  /* DMA-BUF support */
-  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
-  ncclResult_t (*deregMr)(void* comm, void* mhandle);
-  // Asynchronous send to a peer.
-  // May return request == NULL if the call cannot be performed (or would block)
-  ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request);
-  // Asynchronous recv from a peer.
-  // May return request == NULL if the call cannot be performed (or would block)
-  ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request);
-  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
-  // visible to the GPU
-  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
-  // Test whether a request is complete. If size is not NULL, it returns the
-  // number of bytes sent/received.
-  ncclResult_t (*test)(void* request, int* done, int* sizes);
-  // Close and free send/recv comm objects
-  ncclResult_t (*closeSend)(void* sendComm);
-  ncclResult_t (*closeRecv)(void* recvComm);
-  ncclResult_t (*closeListen)(void* listenComm);
-
-  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
-  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
-
-  // Notify the plugin that a recv has completed by the device
-  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
-
-  // Create a virtual NIC given the specified properties, which can be accessed at device index d
-  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props);
-} ncclNet_v9_t;
-
-typedef ncclNet_v9_t ncclNet_t;
-
-#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v9
-
-typedef struct {
-  void* mhandle;
-  void* address;
-  size_t size;
-} ncclNetSGE_v9_t;
-
-typedef struct {
-  // Name of the collective network (mainly for logs)
-  const char* name;
-  // Initialize the collective network.
-  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
-  // Return the number of adapters capable of doing collective operations.
-  // If ndev returns 0, all other functions might be set to NULL.
-  ncclResult_t (*devices)(int* ndev);
-  // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
-  // Create a receiving object and provide a handle to connect to it. The
-  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
-  // between ranks to create connections.
-  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
-  // Create a group for collective operations. handles have been created
-  // using listen() above. rank indicates caller's rank in the collective network.
-  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
-  // Returns whether a reduction operation on a data type is supported.
-  // 1 for supported, 0 otherwise.
-  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
-  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
-  ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
-  /* DMA-BUF support */
-  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
-  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
-  // Performs an asynchronous allreduce operation on the collective group.
-  // May return request == NULL if the call cannot be performed (or would block).
-  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count,
-      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
-  ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts,
-                             size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
-                             void* sendMhandle, void** request);
-  ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData,
-                                 size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
-                                 ncclDataType_t dataType, ncclRedOp_t redOp,
-                                 void* recvMhandle, void** request);
-  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
-  // visible to the GPU
-  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
-  // Test whether a request is complete. If size is not NULL, it returns the
-  // number of bytes sent/received.
-  ncclResult_t (*test)(void* request, int* done, int* size);
-  // Close and free collective comm objects
-  ncclResult_t (*closeColl)(void* collComm);
-  ncclResult_t (*closeListen)(void* listenComm);
-
-  // Create a virtual NIC given the specified properties, which can be accessed at device index d
-  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props);
-} ncclCollNet_v9_t;
-
-typedef ncclCollNet_v9_t ncclCollNet_t;
-
-#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v9
-
-typedef struct {
-  char* name;                      // Used mostly for logging.
-  char* pciPath;                   // Path to the PCI device in /sys.
-  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
-                                   // cards with multiple PCI functions (Physical or virtual).
-  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
-  int regIsGlobal;                 // regMr is not tied to a particular comm
-  int speed;                       // Port speed in Mbps.
-  int port;                        // Port number.
-  float latency;                   // Network latency
-  int maxComms;                    // Maximum number of comms we can create
-  int maxRecvs;                    // Maximum number of grouped receives.
-  ncclNetDeviceType netDeviceType; // Network offload type
-  int netDeviceVersion;            // Version number for network offload
-} ncclNetProperties_v8_t;
-
-typedef struct {
-  // Name of the network (mainly for logs)
-  const char* name;
-  // Initialize the network.
-  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
-  // Return the number of adapters.
-  ncclResult_t (*devices)(int* ndev);
-  // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
-  // Create a receiving object and provide a handle to connect to it. The
-  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
-  // between ranks to create a connection.
-  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
-  // Connect to a handle and return a sending comm object for that peer.
-  // This call must not block for the connection to be established, and instead
-  // should return successfully with sendComm == NULL with the expectation that
-  // it will be called again until sendComm != NULL.
-  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
-  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
-  // Finalize connection establishment after remote peer has called connect.
-  // This call must not block for the connection to be established, and instead
-  // should return successfully with recvComm == NULL with the expectation that
-  // it will be called again until recvComm != NULL.
-  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
-  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
-  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
-  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
-  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
-  /* DMA-BUF support */
-  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
-  ncclResult_t (*deregMr)(void* comm, void* mhandle);
-  // Asynchronous send to a peer.
-  // May return request == NULL if the call cannot be performed (or would block)
-  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
-  // Asynchronous recv from a peer.
-  // May return request == NULL if the call cannot be performed (or would block)
-  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
-  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
-  // visible to the GPU
-  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
-  // Test whether a request is complete. If size is not NULL, it returns the
-  // number of bytes sent/received.
-  ncclResult_t (*test)(void* request, int* done, int* sizes);
-  // Close and free send/recv comm objects
-  ncclResult_t (*closeSend)(void* sendComm);
-  ncclResult_t (*closeRecv)(void* recvComm);
-  ncclResult_t (*closeListen)(void* listenComm);
-
-  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
-  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
-
-  // Notify the plugin that a recv has completed by the device
-  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
-} ncclNet_v8_t;
-
-typedef struct {
-  void* mhandle;
-  void* address;
-  uint32_t size;
-} ncclNetSGE_v8_t;
-
-typedef struct {
-  // Name of the collective network (mainly for logs)
-  const char* name;
-  // Initialize the collective network.
-  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
-  // Return the number of adapters capable of doing collective operations.
-  // If ndev returns 0, all other functions might be set to NULL.
-  ncclResult_t (*devices)(int* ndev);
-  // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
-  // Create a receiving object and provide a handle to connect to it. The
-  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
-  // between ranks to create connections.
-  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
-  // Create a group for collective operations. handles have been created
-  // using listen() above. rank indicates caller's rank in the collective network.
-  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
-  // Returns whether a reduction operation on a data type is supported.
-  // 1 for supported, 0 otherwise.
-  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
-  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
-  ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
-  /* DMA-BUF support */
-  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
-  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
-  // Performs an asynchronous allreduce operation on the collective group.
-  // May return request == NULL if the call cannot be performed (or would block).
-  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
-      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
-  ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts,
-                             size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
-                             void* sendMhandle, void** request);
-  ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData,
-                                 size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
-                                 ncclDataType_t dataType, ncclRedOp_t redOp,
-                                 void* recvMhandle, void** request);
-  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
-  // visible to the GPU
-  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
-  // Test whether a request is complete. If size is not NULL, it returns the
-  // number of bytes sent/received.
-  ncclResult_t (*test)(void* request, int* done, int* size);
-  // Close and free collective comm objects
-  ncclResult_t (*closeColl)(void* collComm);
-  ncclResult_t (*closeListen)(void* listenComm);
-} ncclCollNet_v8_t;
-
-typedef struct {
-  char* name;                      // Used mostly for logging.
-  char* pciPath;                   // Path to the PCI device in /sys.
-  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
-                                   // cards with multiple PCI functions (Physical or virtual).
-  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
-  int speed;                       // Port speed in Mbps.
-  int port;                        // Port number.
-  float latency;                   // Network latency
-  int maxComms;                    // Maximum number of comms we can create
-  int maxRecvs;                    // Maximum number of grouped receives.
-  ncclNetDeviceType netDeviceType; // Network offload type
-  int netDeviceVersion;            // Version number for network offload
-} ncclNetProperties_v7_t;
-
-typedef struct {
-  // Name of the network (mainly for logs)
-  const char* name;
-  // Initialize the network.
-  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
-  // Return the number of adapters.
-  ncclResult_t (*devices)(int* ndev);
-  // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
-  // Create a receiving object and provide a handle to connect to it. The
-  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
-  // between ranks to create a connection.
-  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
-  // Connect to a handle and return a sending comm object for that peer.
-  // This call must not block for the connection to be established, and instead
-  // should return successfully with sendComm == NULL with the expectation that
-  // it will be called again until sendComm != NULL.
-  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
-  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm);
-  // Finalize connection establishment after remote peer has called connect.
-  // This call must not block for the connection to be established, and instead
-  // should return successfully with recvComm == NULL with the expectation that
-  // it will be called again until recvComm != NULL.
-  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
-  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm);
-  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
-  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
-  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
-  /* DMA-BUF support */
-  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
-  ncclResult_t (*deregMr)(void* comm, void* mhandle);
-  // Asynchronous send to a peer.
-  // May return request == NULL if the call cannot be performed (or would block)
-  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
-  // Asynchronous recv from a peer.
-  // May return request == NULL if the call cannot be performed (or would block)
-  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
-  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
-  // visible to the GPU
-  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
-  // Test whether a request is complete. If size is not NULL, it returns the
-  // number of bytes sent/received.
-  ncclResult_t (*test)(void* request, int* done, int* sizes);
-  // Close and free send/recv comm objects
-  ncclResult_t (*closeSend)(void* sendComm);
-  ncclResult_t (*closeRecv)(void* recvComm);
-  ncclResult_t (*closeListen)(void* listenComm);
-
-  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
-  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
-
-  // Notify the plugin that a recv has completed by the device
-  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
-} ncclNet_v7_t;
-
-typedef struct {
-  // Name of the collective network (mainly for logs)
-  const char* name;
-  // Initialize the collective network.
-  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
-  // Return the number of adapters capable of doing collective operations.
-  // If ndev returns 0, all other functions might be set to NULL.
-  ncclResult_t (*devices)(int* ndev);
-  // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
-  // Create a receiving object and provide a handle to connect to it. The
-  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
-  // between ranks to create connections.
-  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
-  // Create a group for collective operations. handles have been created
-  // using listen() above. rank indicates caller's rank in the collective network.
-  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
-  // Returns whether a reduction operation on a data type is supported.
-  // 1 for supported, 0 otherwise.
-  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
-  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
-  ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
-  /* DMA-BUF support */
-  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
-  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
-  // Performs an asynchronous allreduce operation on the collective group.
-  // May return request == NULL if the call cannot be performed (or would block).
-  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
-      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
-  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
-  // visible to the GPU
-  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
-  // Test whether a request is complete. If size is not NULL, it returns the
-  // number of bytes sent/received.
-  ncclResult_t (*test)(void* request, int* done, int* size);
-  // Close and free collective comm objects
-  ncclResult_t (*closeColl)(void* collComm);
-  ncclResult_t (*closeListen)(void* listenComm);
-} ncclCollNet_v7_t;
-
-#define NCCL_NET_MAX_REQUESTS_V6 8
-
-// v6 struct for backwards compatibility
-typedef struct {
-  char* name;     // Used mostly for logging.
-  char* pciPath;  // Path to the PCI device in /sys.
-  uint64_t guid;  // Unique identifier for the NIC chip. Important for
-                  // cards with multiple PCI functions (Physical or virtual).
-  int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
-  int speed;      // Port speed in Mbps.
-  int port;       // Port number.
-  float latency;  // Network latency
-  int maxComms;   // Maximum number of comms we can create
-  int maxRecvs;   // Maximum number of grouped receives.
-} ncclNetProperties_v6_t;
-
-typedef struct {
-  // Name of the network (mainly for logs)
-  const char* name;
-  // Initialize the network.
-  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
-  // Return the number of adapters.
-  ncclResult_t (*devices)(int* ndev);
-  // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
-  // Create a receiving object and provide a handle to connect to it. The
-  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
-  // between ranks to create a connection.
-  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
-  // Connect to a handle and return a sending comm object for that peer.
-  // This call must not block for the connection to be established, and instead
-  // should return successfully with sendComm == NULL with the expectation that
-  // it will be called again until sendComm != NULL.
-  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
-  // Finalize connection establishment after remote peer has called connect.
-  // This call must not block for the connection to be established, and instead
-  // should return successfully with recvComm == NULL with the expectation that
-  // it will be called again until recvComm != NULL.
-  ncclResult_t (*accept)(void* listenComm, void** recvComm);
-  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
-  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
-  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
-  /* DMA-BUF support */
-  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
-  ncclResult_t (*deregMr)(void* comm, void* mhandle);
-  // Asynchronous send to a peer.
-  // May return request == NULL if the call cannot be performed (or would block)
-  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
-  // Asynchronous recv from a peer.
-  // May return request == NULL if the call cannot be performed (or would block)
-  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
-  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
-  // visible to the GPU
-  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
-  // Test whether a request is complete. If size is not NULL, it returns the
-  // number of bytes sent/received.
-  ncclResult_t (*test)(void* request, int* done, int* sizes);
-  // Close and free send/recv comm objects
-  ncclResult_t (*closeSend)(void* sendComm);
-  ncclResult_t (*closeRecv)(void* recvComm);
-  ncclResult_t (*closeListen)(void* listenComm);
-} ncclNet_v6_t;
-
-typedef struct {
-  // Name of the collective network (mainly for logs)
-  const char* name;
-  // Initialize the collective network.
-  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
-  // Return the number of adapters capable of doing collective operations.
-  // If ndev returns 0, all other functions might be set to NULL.
-  ncclResult_t (*devices)(int* ndev);
-  // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
-  // Create a receiving object and provide a handle to connect to it. The
-  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
-  // between ranks to create connections.
-  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
-  // Create a group for collective operations. handles have been created
-  // using listen() above. rank indicates caller's rank in the collective network.
-  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
-  // Returns whether a reduction operation on a data type is supported.
-  // 1 for supported, 0 otherwise.
-  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
-  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
-  ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
-  /* DMA-BUF support */
-  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
-  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
-  // Performs an asynchronous allreduce operation on the collective group.
-  // May return request == NULL if the call cannot be performed (or would block).
-  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
-      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
-  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
-  // visible to the GPU
-  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
-  // Test whether a request is complete. If size is not NULL, it returns the
-  // number of bytes sent/received.
-  ncclResult_t (*test)(void* request, int* done, int* size);
-  // Close and free collective comm objects
-  ncclResult_t (*closeColl)(void* collComm);
-  ncclResult_t (*closeListen)(void* listenComm);
-} ncclCollNet_v6_t;
-
-// v5 struct for backwards compatibility
-typedef struct {
-  // Name of the network (mainly for logs)
-  const char* name;
-  // Initialize the network.
-  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
-  // Return the number of adapters.
-  ncclResult_t (*devices)(int* ndev);
-  // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
-  // Create a receiving object and provide a handle to connect to it. The
-  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
-  // between ranks to create a connection.
-  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
-  // Connect to a handle and return a sending comm object for that peer.
-  // This call must not block for the connection to be established, and instead
-  // should return successfully with sendComm == NULL with the expectation that
-  // it will be called again until sendComm != NULL.
-  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
-  // Finalize connection establishment after remote peer has called connect.
-  // This call must not block for the connection to be established, and instead
-  // should return successfully with recvComm == NULL with the expectation that
-  // it will be called again until recvComm != NULL.
-  ncclResult_t (*accept)(void* listenComm, void** recvComm);
-  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
-  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
-  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
-  ncclResult_t (*deregMr)(void* comm, void* mhandle);
-  // Asynchronous send to a peer.
-  // May return request == NULL if the call cannot be performed (or would block)
-  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
-  // Asynchronous recv from a peer.
-  // May return request == NULL if the call cannot be performed (or would block)
-  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
-  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
-  // visible to the GPU
-  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
-  // Test whether a request is complete. If size is not NULL, it returns the
-  // number of bytes sent/received.
-  ncclResult_t (*test)(void* request, int* done, int* sizes);
-  // Close and free send/recv comm objects
-  ncclResult_t (*closeSend)(void* sendComm);
-  ncclResult_t (*closeRecv)(void* recvComm);
-  ncclResult_t (*closeListen)(void* listenComm);
-} ncclNet_v5_t;
-
-// v5 struct for backwards compatibility
-typedef struct {
-  // Name of the collective network (mainly for logs)
-  const char* name;
-  // Initialize the collective network.
-  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
-  // Return the number of adapters capable of doing collective operations.
-  // If ndev returns 0, all other functions might be set to NULL.
-  ncclResult_t (*devices)(int* ndev);
-  // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
-  // Create a receiving object and provide a handle to connect to it. The
-  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
-  // between ranks to create connections.
-  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
-  // Create a group for collective operations. handles have been created
-  // using listen() above. rank indicates caller's rank in the collective network.
-  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
-  // Returns whether a reduction operation on a data type is supported.
-  // 1 for supported, 0 otherwise.
-  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
-  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
-  ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
-  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
-  // Performs an asynchronous allreduce operation on the collective group.
-  // May return request == NULL if the call cannot be performed (or would block).
-  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
-      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
-  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
-  // visible to the GPU
-  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
-  // Test whether a request is complete. If size is not NULL, it returns the
-  // number of bytes sent/received.
-  ncclResult_t (*test)(void* request, int* done, int* size);
-  // Close and free collective comm objects
-  ncclResult_t (*closeColl)(void* collComm);
-  ncclResult_t (*closeListen)(void* listenComm);
-} ncclCollNet_v5_t;
-
-// context passed from RCCL lib to n/w plugin
-typedef struct {
-  // channel id
-  uint32_t chId;
-} ncclNet_ctxt_t;
-
-#endif // end include guard
@@ -1,235 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef NCCL_PROFILER_H_
-#define NCCL_PROFILER_H_
-
-#include <cstdint>
-
-enum {
-  ncclProfileGroup     = (1 << 0),  // group event type
-  ncclProfileColl      = (1 << 1),  // host collective call event type
-  ncclProfileP2p       = (1 << 2),  // host point-to-point call event type
-  ncclProfileProxyOp   = (1 << 3),  // proxy operation event type
-  ncclProfileProxyStep = (1 << 4),  // proxy step event type
-  ncclProfileProxyCtrl = (1 << 5),  // proxy control event type
-};
-
-typedef struct {
-  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
-  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
-  int rank;                     // originating rank
-  union {
-    struct {
-      const char* name;
-      uint64_t commHash;
-      uint64_t seqNumber;
-      const char* func;
-      void const* sendBuff;
-      void* recvBuff;
-      size_t count;
-      int root;
-      const char* datatype;
-      size_t trafficBytes;
-      uint8_t nMaxChannels;
-      uint8_t nWarps;
-      const char* algo;
-      const char* proto;
-    } coll;
-
-    struct {
-      const char* name;
-      uint64_t commHash;
-      const char* func;
-      void* buff;
-      const char* datatype;
-      size_t count;
-      int peer;
-    } p2p;
-
-    struct {
-      pid_t pid;                // pid of the originating process
-      uint8_t channelId;        // channel id for this proxy operation
-      int peer;                 // remote rank for send/recv
-      int nSteps;               // number of steps for this proxy operation
-      int chunkSize;            // amount of data transferred by this proxy operation
-      int isSend;
-    } proxyOp;
-
-    struct {
-      int step;
-    } proxyStep;
-  };
-} ncclProfilerEventDescr_v2_t;
-
-typedef enum {
-  ncclProfilerProxyOpSendPosted,
-  ncclProfilerProxyOpSendRemFifoWait,
-  ncclProfilerProxyOpSendTransmitted,
-  ncclProfilerProxyOpSendDone,
-  ncclProfilerProxyOpRecvPosted,
-  ncclProfilerProxyOpRecvReceived,
-  ncclProfilerProxyOpRecvTransmitted,
-  ncclProfilerProxyOpRecvDone,
-
-  /* Legacy proxy profiler states */
-  ncclProfilerProxyStepSendGPUWait,
-  ncclProfilerProxyStepSendWait,
-  ncclProfilerProxyStepRecvWait,
-  ncclProfilerProxyStepRecvFlushWait,
-  ncclProfilerProxyStepRecvGPUWait,
-
-  /* Legacy proxy control states */
-  ncclProfilerProxyCtrlIdle,
-  ncclProfilerProxyCtrlActive,
-  ncclProfilerProxyCtrlSleep,
-  ncclProfilerProxyCtrlWakeup,
-  ncclProfilerProxyCtrlAppend,
-  ncclProfilerProxyCtrlAppendEnd,
-} ncclProfilerEventState_v2_t;
-
-typedef union {
-  struct {
-    size_t transSize;
-    int steps;
-  } proxyOp;
-
-  struct {
-    int appendedProxyOps;
-  } proxyCtrl;
-} ncclProfilerEventStateArgs_v2_t;
-
-typedef struct {
-  const char* name;
-
-  // init - initialize the profiler plugin
-  // Input
-  //  - context        : opaque profiler context object for separating profiler behavior across comms
-  // Output
-  //  - eActivationMask: bitmask of active events set by the plugin
-  ncclResult_t (*init)(void** context, int* eActivationMask);
-
-  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
-  // Input
-  //  - context: opaque profiler context object
-  //  - eDescr : pointer to ncclProfilerEventDescr_t object
-  // Output
-  //  - eHandle: return event handle for supplied event descriptor object
-  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
-
-  // stopEvent - stop/finalize an event inside and event set
-  // Input
-  //  - eHandle: handle to event object
-  ncclResult_t (*stopEvent)(void* eHandle);
-
-  // recordEventState - record event state transitions and event attribute updates
-  // Input
-  //  - eHandle   : handle to event object created through startEvent
-  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
-  //  - eState    : event state transition
-  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
-
-  // finalize - finalize the profiler plugin
-  // Input
-  //  - context: opaque profiler context object
-  ncclResult_t (*finalize)(void* context);
-} ncclProfiler_v2_t;
-
-typedef ncclProfilerEventDescr_v2_t ncclProfilerEventDescr_t;
-typedef ncclProfilerEventState_v2_t ncclProfilerEventState_t;
-typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_t;
-typedef ncclProfiler_v2_t ncclProfiler_t;
-
-typedef struct {
-  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
-  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
-  int rank;                     // originating rank
-  union {
-    struct {
-      const char* name;
-      uint64_t commHash;
-      uint64_t seqNumber;
-      uint8_t func;
-      void const* sendBuff;
-      void* recvBuff;
-      size_t count;
-      int root;
-      uint8_t datatype;
-      uint32_t op;
-      size_t trafficBytes;
-      uint8_t nMaxChannels;
-      uint8_t nWarps;
-      uint8_t algo;
-      uint8_t proto;
-      int isCollnet;
-      int isNvls;
-    } coll;
-
-    struct {
-      const char* name;
-      uint64_t commHash;
-      uint8_t func;
-      void* buff;
-      uint8_t datatype;
-      size_t count;
-      int peer;
-    } p2p;
-
-    struct {
-      pid_t pid;                // pid of the originating process
-      uint8_t channelId;        // channel id for this proxy operation
-      int peer;                 // remote rank for send/recv
-      int nSteps;               // number of steps for this proxy operation
-      int chunkSize;            // amount of data transferred by this proxy operation
-      int isSend;
-    } proxyOp;
-
-    struct {
-      int step;
-    } proxyStep;
-  };
-} ncclProfilerEventDescr_v1_t;
-
-typedef ncclProfilerEventState_v2_t ncclProfilerEventState_v1_t;
-typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_v1_t;
-
-typedef struct {
-  const char* name;
-
-  // init - initialize the profiler plugin
-  // Input
-  //  - context        : opaque profiler context object for separating profiler behavior across comms
-  // Output
-  //  - eActivationMask: bitmask of active events set by the plugin
-  ncclResult_t (*init)(void** context, int* eActivationMask);
-
-  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
-  // Input
-  //  - context: opaque profiler context object
-  //  - eDescr : pointer to ncclProfilerEventDescr_t object
-  // Output
-  //  - eHandle: return event handle for supplied event descriptor object
-  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr);
-
-  // stopEvent - stop/finalize an event inside and event set
-  // Input
-  //  - eHandle: handle to event object
-  ncclResult_t (*stopEvent)(void* eHandle);
-
-  // recordEventState - record event state transitions and event attribute updates
-  // Input
-  //  - eHandle   : handle to event object created through startEvent
-  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
-  //  - eState    : event state transition
-  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs);
-
-  // finalize - finalize the profiler plugin
-  // Input
-  //  - context: opaque profiler context object
-  ncclResult_t (*finalize)(void* context);
-} ncclProfiler_v1_t;
-
-#endif
@@ -1,149 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
- * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef NCCL_TUNER_H_
-#define NCCL_TUNER_H_
-
-#include "nccl.h"
-#include "nccl_common.h"
-
-// API to be implemented by external tuner
-typedef struct {
-  // Name of the tuner
-  const char* name;
-
-  // Initializes tuner states.
-  // Inputs:
-  //   - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner.
-  //   - nNodes: number of nodes in current communicator.
-  //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
-  // Outputs:
-  //   - context: tuner context object
-  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
-
-  // Gets info (algo, protocol, number of ctas and threads) for a given collective.
-  // Inputs:
-  //   - context: tuner context object
-  //   - collType: collective type , e.g., allreduce, allgather…
-  //   - nBytes: collective size in bytes
-  //   - numPipeOps: number of operations in the group
-  //   - numAlgo: number of algorithms in collCostTable
-  //   - numProto: number of protocols in collCostTable
-  //   - regBuff: can register user buffer
-  //
-  // Outputs:
-  //   - nChannels: number of channels (hence SMs) to be used.
-  //
-  // InOut:
-  //   - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
-  //                    NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
-  //
-  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
-  // default tuning for the given collective.
-  // Also, the plugin is allowed to not set any output, or set only the
-  // algorithm and protocol, but not only the algorithm or only the protocol.
-  // Unset fields will be set automatically by NCCL.
-  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
-                              int numPipeOps, float** collCostTable, int numAlgo, int numProto,
-                              int regBuff, int* nChannels);
-
-  // Terminates the plugin and cleans up any resources that the plugin allocated.
-  // context: tuner context object
-  ncclResult_t (*destroy)(void* context);
-} ncclTuner_v4_t;
-
-typedef ncclTuner_v4_t ncclTuner_t;
-
-#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4"
-
-// API to be implemented by external tuner
-typedef struct {
-  // Name of the tuner
-  const char* name;
-
-  // Initializes tuner states.
-  // Inputs:
-  //   - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner.
-  //   - nNodes: number of nodes in current communicator.
-  //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
-  // Outputs:
-  //   - context: tuner context object
-  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
-
-  // Gets info (algo, protocol, number of ctas and threads) for a given collective.
-  // Inputs:
-  //   - context: tuner context object
-  //   - collType: collective type , e.g., allreduce, allgather…
-  //   - nBytes: collective size in bytes
-  //   - numPipeOps: number of operations in the group
-  //   - numAlgo: number of algorithms in collCostTable
-  //   - numProto: number of protocols in collCostTable
-  //
-  // Outputs:
-  //   - nChannels: number of channels (hence SMs) to be used.
-  //
-  // InOut:
-  //   - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
-  //                    NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
-  //
-  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
-  // default tuning for the given collective.
-  // Also, the plugin is allowed to not set any output, or set only the
-  // algorithm and protocol, but not only the algorithm or only the protocol.
-  // Unset fields will be set automatically by NCCL.
-  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
-                              int numPipeOps, float** collCostTable, int numAlgo, int numProto,
-                              int* nChannels);
-
-  // Terminates the plugin and cleans up any resources that the plugin allocated.
-  // context: tuner context object
-  ncclResult_t (*destroy)(void* context);
-} ncclTuner_v3_t;
-
-// API to be implemented by external tuner
-typedef struct {
-  // Name of the tuner
-  const char* name;
-
-  // Initializes tuner states.
-  // Inputs:
-  //   - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner.
-  //   - nNodes: number of nodes in current communicator.
-  //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
-  // Outputs:
-  //   - context: tuner context object
-  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
-
-  // Gets info (algo, protocol, number of ctas and threads) for a given collective.
-  // Inputs:
-  //   - context: tuner context object
-  //   - collType: collective type , e.g., allreduce, allgather…
-  //   - nBytes: collective size in bytes
-  //   - collNetTypeSupport: whether collnet supports this type
-  //   - nvlsTypeSupport: whether nvlink sharp supports this time
-  //   - numPipeOps: number of operations in the group
-  //
-  // Outputs:
-  //   - algorithm: selected algorithm to be used for the given collective
-  //   - protocol: selected protocol to be used for the give collective
-  //   - nChannels: number of channels (hence SMs) to be used.
-  //
-  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
-  // default tuning for the given collective.
-  // Also, the plugin is allowed to not set any output, or set only the
-  // algorithm and protocol, but not only the algorithm or only the protocol.
-  // Unset fields will be set automatically by NCCL.
-  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
-                              int collNetSupport, int nvlsSupport, int numPipeOps,
-                              int* algorithm, int* protocol, int* nChannels);
-
-  // Terminates the plugin and cleans up any resources that the plugin allocated.
-  // context: tuner context object
-  ncclResult_t (*destroy)(void* context);
-} ncclTuner_v2_t;
-
-#endif
@@ -18,7 +18,6 @@ ncclResult_t ncclNetPluginLoad(struct ncclComm* comm);
 ncclResult_t ncclNetPluginUnload(struct ncclComm* comm);
 ncclResult_t ncclNetInit(struct ncclComm* comm);
 ncclResult_t ncclNetFinalize(struct ncclComm* comm);
-int ncclNetVersion(struct ncclComm* comm);

 // Test whether the current GPU support GPU Direct RDMA.
 ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport);
@@ -26,6 +26,7 @@ typedef struct {

 typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
 typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
-typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t;
+typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t;
+typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t;

 #endif
@@ -37,9 +37,10 @@
 #define NVTX_SID_CommInitRankScalable 17 // same schema as NVTX_SID_CommInitRank
 #define NVTX_SID_CommSplit            18
 #define NVTX_SID_CommFinalize         19
+// When adding new schema IDs, DO NOT re-use/overlap with the enum schema ID below!

 // Define static schema ID for the reduction operation.
-#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 19 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START
+#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP (20 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START)

 extern const nvtxDomainHandle_t ncclNvtxDomainHandle;

@@ -0,0 +1,60 @@
+/*************************************************************************
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_NET_H_
+#define NCCL_NET_H_
+
+#include "nccl.h"
+#include "nccl_common.h"
+#include "net_device.h"
+#include <stdint.h>
+
+#define NCCL_NET_HANDLE_MAXSIZE 128
+//Maximum value NCCL can accept for maxP2pBytes and maxCollBytes net properties
+#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L)
+#define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1
+
+#define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two.
+#define MAX_COLLNET_SIZE (512*1024*1024L) //Set for initial collnet plugins when size was not dynamically queried
+
+#define NCCL_PTR_HOST 0x1
+#define NCCL_PTR_CUDA 0x2
+#define NCCL_PTR_DMABUF 0x4
+
+// Maximum number of requests per comm object
+#define NCCL_NET_MAX_REQUESTS 32
+
+// Max number of ncclNet objects which can live in the same process
+#define NCCL_NET_MAX_PLUGINS 3
+
+// NCCL core profiler callback for network defined events instrumentation
+typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData);
+
+#include "net/net_v10.h"
+#include "net/net_v9.h"
+#include "net/net_v8.h"
+#include "net/net_v7.h"
+#include "net/net_v6.h"
+
+typedef ncclNet_v10_t ncclNet_t;
+typedef ncclCollNet_v10_t ncclCollNet_t;
+typedef ncclNetSGE_v10_t ncclNetSGE_t;
+typedef ncclNetProperties_v10_t ncclNetProperties_t;
+typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t;
+typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t;
+
+#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V10
+
+#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v10
+#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v10
+
+// Context passed from the RCCL library to the network plugin.
+typedef struct {
+  // Channel id.
+  uint32_t chId;
+} ncclNet_ctxt_t;
+
+#endif // end include guard
@@ -0,0 +1,69 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_PROFILER_H_
+#define NCCL_PROFILER_H_
+
+enum {
+  ncclProfileGroup     = (1 << 0),  // group event type
+  ncclProfileColl      = (1 << 1),  // host collective call event type
+  ncclProfileP2p       = (1 << 2),  // host point-to-point call event type
+  ncclProfileProxyOp   = (1 << 3),  // proxy operation event type
+  ncclProfileProxyStep = (1 << 4),  // proxy step event type
+  ncclProfileProxyCtrl = (1 << 5),  // proxy control event type
+  ncclProfileKernelCh  = (1 << 6),  // kernel channel event type
+  ncclProfileNetPlugin = (1 << 7),  // network plugin-defined events
+};
+
+typedef enum {
+  ncclProfilerProxyOpSendPosted,
+  ncclProfilerProxyOpSendRemFifoWait,
+  ncclProfilerProxyOpSendTransmitted,
+  ncclProfilerProxyOpSendDone,
+  ncclProfilerProxyOpRecvPosted,
+  ncclProfilerProxyOpRecvReceived,
+  ncclProfilerProxyOpRecvTransmitted,
+  ncclProfilerProxyOpRecvDone,
+
+  /* Legacy proxy profiler states */
+  ncclProfilerProxyStepSendGPUWait,
+  ncclProfilerProxyStepSendWait,
+  ncclProfilerProxyStepRecvWait,
+  ncclProfilerProxyStepRecvFlushWait,
+  ncclProfilerProxyStepRecvGPUWait,
+
+  /* Legacy proxy control states */
+  ncclProfilerProxyCtrlIdle,
+  ncclProfilerProxyCtrlActive,
+  ncclProfilerProxyCtrlSleep,
+  ncclProfilerProxyCtrlWakeup,
+  ncclProfilerProxyCtrlAppend,
+  ncclProfilerProxyCtrlAppendEnd,
+} ncclProfilerEventState_t;
+
+typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
+typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
+typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
+
+#include <cstdint>
+#include "profiler/profiler_v3.h"
+#include "profiler/profiler_v2.h"
+#include "profiler/profiler_v1.h"
+
+typedef ncclProfiler_v3_t ncclProfiler_t;
+typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t;
+typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t;
+
+#define NCCL_PROFILER_NET_VER_BITS  (16)
+#define NCCL_PROFILER_NET_VER_MASK  (~0U >> NCCL_PROFILER_NET_VER_BITS)
+#define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS)
+
+typedef enum {
+  NCCL_PROFILER_NET_TYPE_IB   = (1U << NCCL_PROFILER_NET_VER_BITS),
+  NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS),
+} ncclProfilerNetType;
+
+#endif
@@ -0,0 +1,22 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_TUNER_H_
+#define NCCL_TUNER_H_
+
+#include "nccl.h"
+#include "nccl_common.h"
+
+#include "tuner/tuner_v4.h"
+#include "tuner/tuner_v3.h"
+#include "tuner/tuner_v2.h"
+
+typedef ncclTuner_v4_t ncclTuner_t;
+
+#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4"
+
+#endif
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_V10_H_
+#define NET_V10_H_
+
+#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4
+
+typedef struct {
+  int ndevs;
+  int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10];
+} ncclNetVDeviceProps_v10_t;
+
+#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1
+
+typedef struct {
+  // Plugin-specific TC value
+  int trafficClass;
+} ncclNetCommConfig_v10_t;
+
+
+typedef struct {
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int regIsGlobal;                 // regMr is not tied to a particular comm
+  int forceFlush;                  // Force a flush on receives
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+  ncclNetVDeviceProps_v10_t vProps;
+  size_t maxP2pBytes;              // Max transfer size for point-to-point operations
+  size_t maxCollBytes;             // Max transfer size for collective operations
+} ncclNetProperties_v10_t;
+
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+
+  // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
+  // what index this new vNIC exists at
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props);
+} ncclNet_v10_t;
+
+typedef struct {
+  void* mhandle;
+  void* address;
+  size_t size;
+} ncclNetSGE_v10_t;
+
+typedef struct {
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. handles have been created
+  // using listen() above. rank indicates caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Returns whether a reduction operation on a data type is supported.
+  // 1 for supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Performs an asynchronous allreduce operation on the collective group.
+  // May return request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+  ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v10_t* recvParts,
+                             size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+                             void* sendMhandle, void** request);
+  ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v10_t* sendParts, void* recvData,
+                                 size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+                                 ncclDataType_t dataType, ncclRedOp_t redOp,
+                                 void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free collective comm objects
+  ncclResult_t (*closeColl)(void* collComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Create a virtual NIC given the specified properties, which can be accessed at device index d
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props);
+} ncclCollNet_v10_t;
+
+#endif // end include guard
@@ -0,0 +1,113 @@
+/*************************************************************************
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NET_V6_H_
+#define NET_V6_H_
+
+#define NCCL_NET_MAX_REQUESTS_V6 8
+
+// v6 device properties struct, kept for backwards compatibility (the type taken by the v6 getProperties).
+typedef struct {
+  char* name;     // Used mostly for logging.
+  char* pciPath;  // Path to the PCI device in /sys.
+  uint64_t guid;  // Unique identifier for the NIC chip. Important for
+                  // cards with multiple PCI functions (physical or virtual).
+  int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int speed;      // Port speed in Mbps.
+  int port;       // Port number.
+  float latency;  // Network latency. NOTE(review): the unit is not stated in this header - confirm.
+  int maxComms;   // Maximum number of comms we can create.
+  int maxRecvs;   // Maximum number of grouped receives.
+} ncclNetProperties_v6_t;
+
+typedef struct {  // v6 network plugin interface: a name plus a table of function pointers
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with *sendComm == NULL with the expectation that
+  // it will be called again until *sendComm != NULL.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with *recvComm == NULL with the expectation that
+  // it will be called again until *recvComm != NULL.
+  ncclResult_t (*accept)(void* listenComm, void** recvComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. Note: size is an int here but a size_t in regMrDmaBuf.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  /* DMA-BUF support: registration variant that additionally takes an offset and a file descriptor (fd) */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return *request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return *request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_v6_t;
+
+typedef struct {  // v6 collective network plugin interface: a name plus a table of function pointers
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. The handles have been created
+  // using listen() above. rank indicates the caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Returns whether a reduction operation on a data type is supported.
+  // *supported is set to 1 if supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. Note: size is an int here but a size_t in regMrDmaBuf.
+  ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
+  /* DMA-BUF support: registration variant that additionally takes an offset and a file descriptor (fd) */
+  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Performs an asynchronous allreduce operation on the collective group.
+  // May return *request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free collective comm objects
+  ncclResult_t (*closeColl)(void* collComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclCollNet_v6_t;
+
+#endif
@@ -0,0 +1,120 @@
+/*************************************************************************
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NET_V7_H_
+#define NET_V7_H_
+
+typedef struct {                   // v7 device properties (the type taken by the v7 getProperties)
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency. NOTE(review): unit not stated here - confirm.
+  int maxComms;                    // Maximum number of comms we can create.
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type.
+  int netDeviceVersion;            // Version number for network offload.
+} ncclNetProperties_v7_t;
+
+typedef struct {  // v7 network plugin interface: a name plus a table of function pointers
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with *sendComm == NULL with the expectation that
+  // it will be called again until *sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with *recvComm == NULL with the expectation that
+  // it will be called again until *recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. Note: size is an int here but a size_t in regMrDmaBuf.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  /* DMA-BUF support: registration variant that additionally takes an offset and a file descriptor (fd) */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return *request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return *request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has been completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+} ncclNet_v7_t;
+
+typedef struct {  // v7 collective network plugin interface: a name plus a table of function pointers
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. The handles have been created
+  // using listen() above. rank indicates the caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Returns whether a reduction operation on a data type is supported.
+  // *supported is set to 1 if supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. Note: size is an int here but a size_t in regMrDmaBuf.
+  ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
+  /* DMA-BUF support: registration variant that additionally takes an offset and a file descriptor (fd) */
+  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Performs an asynchronous allreduce operation on the collective group.
+  // May return *request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free collective comm objects
+  ncclResult_t (*closeColl)(void* collComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclCollNet_v7_t;
+
+#endif
@@ -0,0 +1,134 @@
+/*************************************************************************
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NET_V8_H_
+#define NET_V8_H_
+
+typedef struct {                   // v8 device properties (the type taken by the v8 getProperties)
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int regIsGlobal;                 // regMr is not tied to a particular comm.
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency. NOTE(review): unit not stated here - confirm.
+  int maxComms;                    // Maximum number of comms we can create.
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type.
+  int netDeviceVersion;            // Version number for network offload.
+} ncclNetProperties_v8_t;
+
+typedef struct {  // v8 network plugin interface: a name plus a table of function pointers
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with *sendComm == NULL with the expectation that
+  // it will be called again until *sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with *recvComm == NULL with the expectation that
+  // it will be called again until *recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. size is a size_t in this version.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support: registration variant that additionally takes an offset and a file descriptor (fd) */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return *request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return *request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has been completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+} ncclNet_v8_t;
+
+typedef struct {  // Element type of the recvParts/sendParts arrays taken by the v8 iallgather/ireducescatter
+  void* mhandle;  // NOTE(review): presumably a handle obtained from regMr/regMrDmaBuf - confirm
+  void* address;
+  uint32_t size;  // 32-bit in v8
+} ncclNetSGE_v8_t;
+
+typedef struct {  // v8 collective network plugin interface: a name plus a table of function pointers
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. The handles have been created
+  // using listen() above. rank indicates the caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Returns whether a reduction operation on a data type is supported.
+  // *supported is set to 1 if supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support: registration variant that additionally takes an offset and a file descriptor (fd) */
+  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Performs an asynchronous allreduce operation on the collective group.
+  // May return *request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+  ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts,
+                             size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+                             void* sendMhandle, void** request);
+  ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData,
+                                 size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+                                 ncclDataType_t dataType, ncclRedOp_t redOp,
+                                 void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free collective comm objects
+  ncclResult_t (*closeColl)(void* collComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclCollNet_v8_t;
+
+#endif
@@ -0,0 +1,152 @@
+/*************************************************************************
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NET_V9_H_
+#define NET_V9_H_
+
+#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4
+
+typedef struct {  // Virtual NIC properties: the argument type of the v9 makeVDevice
+  int ndevs;  // NOTE(review): presumably the number of valid entries in devs - confirm
+  int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9];  // Fixed capacity of NCCL_NET_MAX_DEVS_PER_NIC_V9 (4) entries
+} ncclNetVDeviceProps_v9_t;
+
+typedef struct {                   // v9 device properties (the type taken by the v9 getProperties)
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int regIsGlobal;                 // regMr is not tied to a particular comm.
+  int forceFlush;                  // Force a flush on receives.
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency. NOTE(review): unit not stated here - confirm.
+  int maxComms;                    // Maximum number of comms we can create.
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type.
+  int netDeviceVersion;            // Version number for network offload.
+  ncclNetVDeviceProps_v9_t vProps; // Virtual NIC properties (same type as taken by makeVDevice).
+  size_t maxP2pBytes;              // Max transfer size for point-to-point operations.
+  size_t maxCollBytes;             // Max transfer size for collective operations.
+} ncclNetProperties_v9_t;
+
+typedef struct {  // v9 network plugin interface: a name plus a table of function pointers
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with *sendComm == NULL with the expectation that
+  // it will be called again until *sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v9_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with *recvComm == NULL with the expectation that
+  // it will be called again until *recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v9_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support: registration variant that additionally takes an offset and a file descriptor (fd) */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer. size is a size_t in this version.
+  // May return *request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer. sizes is a size_t array in this version.
+  // May return *request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU. Note: sizes is an int* here, unlike the size_t* taken by irecv.
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received. Note: sizes is an int* here as well.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has been completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+
+  // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and report
+  // the index at which the new vNIC exists to the caller through d
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v9_t* props);
+} ncclNet_v9_t;
+
+typedef struct {  // Element type of the recvParts/sendParts arrays taken by the v9 iallgather/ireducescatter
+  void* mhandle;  // NOTE(review): presumably a handle obtained from regMr/regMrDmaBuf - confirm
+  void* address;
+  size_t size;    // size_t in v9
+} ncclNetSGE_v9_t;
+
+typedef struct {  // v9 collective network plugin interface: a name plus a table of function pointers
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. The handles have been created
+  // using listen() above. rank indicates the caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Returns whether a reduction operation on a data type is supported.
+  // *supported is set to 1 if supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support: registration variant that additionally takes an offset and a file descriptor (fd) */
+  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Performs an asynchronous allreduce operation on the collective group. count is a size_t in this version.
+  // May return *request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+  ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts,
+                             size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+                             void* sendMhandle, void** request);
+  ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData,
+                                 size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+                                 ncclDataType_t dataType, ncclRedOp_t redOp,
+                                 void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU. Note: size is still an int here.
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free collective comm objects
+  ncclResult_t (*closeColl)(void* collComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Create a virtual NIC given the specified properties, which can be accessed at device index *d
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v9_t* props);
+} ncclCollNet_v9_t;
+
+#endif // end include guard
@@ -0,0 +1,18 @@
+/*************************************************************************
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_PLUGIN_H_
+#define NCCL_PLUGIN_H_
+
+#include "nccl.h"
+
+void* ncclOpenNetPluginLib(const char* name);       // NOTE(review): presumably returns a library handle, NULL on failure - confirm against the implementation
+void* ncclOpenTunerPluginLib(const char* name);     // Tuner-plugin counterpart of ncclOpenNetPluginLib (same signature)
+void* ncclOpenProfilerPluginLib(const char* name);  // Profiler-plugin counterpart of ncclOpenNetPluginLib (same signature)
+void* ncclGetNetPluginLib(void);                    // NOTE(review): presumably the handle of an already opened net plugin library - confirm
+ncclResult_t ncclClosePluginLib(void* handle);      // NOTE(review): handle is presumably a value returned by one of the functions above - confirm
+
+#endif
@@ -0,0 +1,13 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NET_IB_H_
+#define NET_IB_H_
+
+#include "nccl_profiler.h"
+#include "net_ib_v1.h"
+
+#endif
@@ -0,0 +1,34 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NET_IB_V1_H_
+#define NET_IB_V1_H_
+
+#define NCCL_PROFILER_NET_IB_VER 1
+
+enum {  // NOTE(review): presumably the values stored in ncclProfilerNetIbDescr_v1_t.type - confirm
+  ncclProfileQp = (1 << 0),
+};
+
+// The data structure version is encoded in the plugin identifier bitmask and
+// passed to NCCL core through the profiler callback. NCCL copies the plugin
+// identifier into the event descriptor before calling the profiler startEvent
+// function. The profiler should inspect the plugin id to find out the source
+// plugin as well as the version of the event struct.
+typedef struct {
+  uint8_t type;        // event type (plugin defined)
+  union {              // anonymous union; qp is its only member in v1
+    struct {
+      int device;      // network device id
+      uint64_t wr_id;  // work request id
+      int opcode;      // ibv opcode
+      int qpNum;       // QP number
+      size_t length;   // work request data length
+    } qp;
+  };
+} ncclProfilerNetIbDescr_v1_t;
+
+#endif
@@ -0,0 +1,13 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NET_SOCKET_H_
+#define NET_SOCKET_H_
+
+#include "nccl_profiler.h"
+#include "net_socket_v1.h"
+
+#endif
@@ -0,0 +1,32 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NET_SOCKET_V1_H_
+#define NET_SOCKET_V1_H_
+
+#define NCCL_PROFILER_NET_SOCKET_VER 1
+
+enum {
+  ncclProfileSocket = (1 << 0),
+};
+
+// The data structure version is encoded in the plugin identifier bitmask and
+// passed to NCCL core through the profiler callback. NCCL copies the plugin
+// identifier in the event descriptor before calling the profiler startEvent
+// function. The profiler should inspect the plugin id to find out the source
+// plugin as well as the version of the event struct
+typedef struct {
+  uint8_t type;        // event type (plugin defined)
+  union {
+    struct {
+      int fd;
+      int op;
+      size_t length;
+    } sock;
+  };
+} ncclProfilerNetSockDescr_v1_t;
+
+#endif
@@ -0,0 +1,107 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V1_H_
+#define PROFILER_V1_H_
+
+typedef struct {
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll, this is the group)
+  int rank;                     // originating rank
+  union {
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      uint8_t func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      uint8_t datatype;
+      uint32_t op;
+      size_t trafficBytes;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      uint8_t algo;
+      uint8_t proto;
+      int isCollnet;
+      int isNvls;
+    } coll;
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint8_t func;
+      void* buff;
+      uint8_t datatype;
+      size_t count;
+      int peer;
+    } p2p;
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+  };
+} ncclProfilerEventDescr_v1_t;
+
+typedef union {
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v1_t;
+
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v1_t;
+
+#endif
@@ -0,0 +1,104 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V2_H_
+#define PROFILER_V2_H_
+
+typedef struct {
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll, this is the group)
+  int rank;                     // originating rank
+  union {
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      size_t trafficBytes;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+    } coll;
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+    } p2p;
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+  };
+} ncclProfilerEventDescr_v2_t;
+
+typedef union {
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v2_t;
+
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v2_t;
+
+#endif
@@ -0,0 +1,112 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V3_H_
+#define PROFILER_V3_H_
+
+typedef struct {
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll, this is the group)
+  int rank;                     // originating rank
+  union {
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+    } coll;
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+    } p2p;
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+
+    struct {
+      uint8_t channelId;
+    } kernelCh;
+
+    struct {
+      int64_t id;
+      void* data;
+    } netPlugin;
+  };
+} ncclProfilerEventDescr_v3_t;
+
+typedef union {
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v3_t;
+
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v3_t;
+
+#endif
@@ -0,0 +1,53 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef TUNER_V2_H_
+#define TUNER_V2_H_
+
+// API to be implemented by external tuner
+typedef struct {
+  // Name of the tuner
+  const char* name;
+
+  // Initializes tuner states.
+  // Inputs:
+  //   - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
+  //   - nNodes: number of nodes in current communicator.
+  //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+  // Outputs:
+  //   - context: tuner context object
+  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
+
+  // Gets info (algo, protocol, number of ctas and threads) for a given collective.
+  // Inputs:
+  //   - context: tuner context object
+  //   - collType: collective type, e.g., allreduce, allgather…
+  //   - nBytes: collective size in bytes
+  //   - collNetSupport: whether collnet supports this type
+  //   - nvlsSupport: whether nvlink sharp supports this type
+  //   - numPipeOps: number of operations in the group
+  //
+  // Outputs:
+  //   - algorithm: selected algorithm to be used for the given collective
+  //   - protocol: selected protocol to be used for the given collective
+  //   - nChannels: number of channels (hence SMs) to be used.
+  //
+  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+  // default tuning for the given collective.
+  // Also, the plugin is allowed to not set any output, or set only the
+  // algorithm and protocol, but not only the algorithm or only the protocol.
+  // Unset fields will be set automatically by NCCL.
+  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
+                              int collNetSupport, int nvlsSupport, int numPipeOps,
+                              int* algorithm, int* protocol, int* nChannels);
+
+  // Terminates the plugin and cleans up any resources that the plugin allocated.
+  // context: tuner context object
+  ncclResult_t (*destroy)(void* context);
+} ncclTuner_v2_t;
+
+#endif
@@ -0,0 +1,55 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef TUNER_V3_H_
+#define TUNER_V3_H_
+
+// API to be implemented by external tuner
+typedef struct {
+  // Name of the tuner
+  const char* name;
+
+  // Initializes tuner states.
+  // Inputs:
+  //   - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
+  //   - nNodes: number of nodes in current communicator.
+  //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+  // Outputs:
+  //   - context: tuner context object
+  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
+
+  // Gets info (algo, protocol, number of ctas and threads) for a given collective.
+  // Inputs:
+  //   - context: tuner context object
+  //   - collType: collective type, e.g., allreduce, allgather…
+  //   - nBytes: collective size in bytes
+  //   - numPipeOps: number of operations in the group
+  //   - numAlgo: number of algorithms in collCostTable
+  //   - numProto: number of protocols in collCostTable
+  //
+  // Outputs:
+  //   - nChannels: number of channels (hence SMs) to be used.
+  //
+  // InOut:
+  //   - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
+  //                    NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
+  //
+  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+  // default tuning for the given collective.
+  // Also, the plugin is allowed to not set any output, or set only the
+  // algorithm and protocol, but not only the algorithm or only the protocol.
+  // Unset fields will be set automatically by NCCL.
+  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
+                              int numPipeOps, float** collCostTable, int numAlgo, int numProto,
+                              int* nChannels);
+
+  // Terminates the plugin and cleans up any resources that the plugin allocated.
+  // context: tuner context object
+  ncclResult_t (*destroy)(void* context);
+} ncclTuner_v3_t;
+
+#endif
@@ -0,0 +1,56 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef TUNER_V4_H_
+#define TUNER_V4_H_
+
+// API to be implemented by external tuner
+typedef struct {
+  // Name of the tuner
+  const char* name;
+
+  // Initializes tuner states.
+  // Inputs:
+  //   - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
+  //   - nNodes: number of nodes in current communicator.
+  //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+  // Outputs:
+  //   - context: tuner context object
+  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
+
+  // Gets info (algo, protocol, number of ctas and threads) for a given collective.
+  // Inputs:
+  //   - context: tuner context object
+  //   - collType: collective type, e.g., allreduce, allgather…
+  //   - nBytes: collective size in bytes
+  //   - numPipeOps: number of operations in the group
+  //   - numAlgo: number of algorithms in collCostTable
+  //   - numProto: number of protocols in collCostTable
+  //   - regBuff: can register user buffer
+  //
+  // Outputs:
+  //   - nChannels: number of channels (hence SMs) to be used.
+  //
+  // InOut:
+  //   - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
+  //                    NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
+  //
+  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+  // default tuning for the given collective.
+  // Also, the plugin is allowed to not set any output, or set only the
+  // algorithm and protocol, but not only the algorithm or only the protocol.
+  // Unset fields will be set automatically by NCCL.
+  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
+                              int numPipeOps, float** collCostTable, int numAlgo, int numProto,
+                              int regBuff, int* nChannels);
+
+  // Terminates the plugin and cleans up any resources that the plugin allocated.
+  // context: tuner context object
+  ncclResult_t (*destroy)(void* context);
+} ncclTuner_v4_t;
+
+#endif
@@ -17,6 +17,18 @@ struct ncclTaskP2p;
 struct ncclInfo;
 struct ncclComm;
 struct ncclProxyOp;
+struct ncclProxyConnector;
+
+struct ncclProfilerProxy {
+  bool initialized;
+  uint64_t* workStarted/*[MAXCHANNELS]*/;
+  uint64_t* workCompleted/*[MAXCHANNELS]*/;
+  uint64_t workCounter[MAXCHANNELS]; // host work counter
+  struct ncclProxyConnector sendProxyConn[MAXCHANNELS];
+  struct ncclProxyConnector recvProxyConn[MAXCHANNELS];
+};
+
+extern int ncclProfilerEventMask;

 // Plugin Init/Finalize Wrappers
 ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm);
@@ -44,6 +56,10 @@ ncclResult_t ncclProfilerStopProxyStepEvent(int sub, struct ncclProxyArgs* args,
 ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle);
 ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle);

+// Kernel Channel Start/Stop Event Wrappers
+ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s);
+ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s);
+
 // Record Event Wrappers
 ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState);
 ncclResult_t ncclProfilerRecordProxyStepEventState(int sub, struct ncclProxyArgs* args, int stepId, ncclProfilerEventState_t eState);
@@ -51,5 +67,10 @@ ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, n

 // Profiler utility functions
 ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op);
+bool ncclProfilerNeedsProxy(struct ncclComm* comm, struct ncclProxyOp* op);
+bool ncclProfilerPluginLoaded(void);
+
+// Profiler callback for network plugin
+ncclResult_t ncclProfilerCallback(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData);

 #endif
@@ -34,7 +34,8 @@ typedef enum : uint8_t {
  ncclPatternPatUp,
  ncclPatternPatDown,
  ncclPatternSend,
-  ncclPatternRecv
+  ncclPatternRecv,
+  ncclPatternProfiler,
 } ncclPattern_t;

 enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
@@ -93,12 +94,19 @@ struct ncclProxyOp {
    struct ncclTaskP2p* p2p;
  } task;

+  // Profiler work counter increment flag. Set to 'true' if the profiler work counter for this channel needs increment.
+  // Always 'true' for collective operations. Grouped p2p operations are fused into one <send, recv> pair in the GPU kernel,
+  // meaning the GPU profiler code increments the work counter for the pair rather than the individual p2p. For this
+  // reason, the incWorkCounter flag is used to avoid incrementing the work counter twice in the host code. This is done
+  // by setting incWorkCounter to 'true' only for one of the p2ps in the pair during enqueue.
+  bool incWorkCounter;
  int eActivationMask;
  void* taskEventHandle;
  int rank;
  int peer;
  pid_t pid;
  void* profilerContext;
+  uint64_t workCounter;

  struct ncclProxyOp *enqNext;
 };
@@ -135,12 +143,15 @@ struct ncclProxySubArgs {
  // Profiler plugin
  int eActivationMask;
  int rank;
+  uint64_t profilerSteps;
  pid_t pid;
  void* profilerContext;
  void* taskEventHandle;
  void* opEventHandle;
+  void* kernelEventHandle;
  void* stepEventHandles[NCCL_STEPS];
  size_t transSize;
+  uint64_t workCounter;

  void* recvRequestsCache[NCCL_STEPS];
  int recvRequestsSubCount;
@@ -15,6 +15,8 @@ struct rasRankInit {
  pid_t pid;
  int cudaDev;
  int nvmlDev;
+  uint64_t hostHash;
+  uint64_t pidHash;
 };

 ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank);
@@ -42,7 +42,7 @@ struct ncclReg {
  uintptr_t baseAddr;
  size_t baseSize;
  CUdeviceptr regAddr;
-  size_t regSize;
+  size_t regUCSize, regMCSize;
  int dev;
  CUmemGenericAllocationHandle mcHandle;
  uintptr_t caddrs[NCCL_MAX_LOCAL_RANKS]; /* use to check if NVLS buffers match among intra-node ranks */
@@ -14,7 +14,6 @@ struct shmCuIpc {
    CUmemFabricHandle handle;
    CUmemGenericAllocationHandle data;
  };
-  int tpProxyRank;
  void *ptr;
  size_t size;
 };
@@ -30,8 +29,8 @@ struct shmIpcDesc {

 typedef struct shmIpcDesc ncclShmIpcDesc_t;

-ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *descOut, void **hptr, void **dptr);
-ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut);
+ncclResult_t ncclShmAllocateShareableBuffer(size_t size, bool legacy, ncclShmIpcDesc_t *descOut, void **hptr, void **dptr);
+ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, int proxyRank, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut);
 ncclResult_t ncclShmIpcClose(ncclShmIpcDesc_t *desc);

 #endif
@@ -96,5 +96,5 @@ ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size);
 ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int sendSize, struct ncclSocket* recvSock, void* recvPtr, int recvSize);
 ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking);
 ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how);
-ncclResult_t ncclSocketClose(struct ncclSocket* sock);
+ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait = false);
 #endif
@@ -10,13 +10,24 @@
 #include "nccl.h"
 #include "checks.h"

+#include <cuda.h>
+#include <cuda_runtime.h>
 #include <stdint.h>

+// ncclCudaContext: wraps a CUDA context with per-context state.
+struct ncclCudaContext;
+
+// Get a ncclCudaContext to track the currently active CUDA context.
+ncclResult_t ncclCudaContextTrack(struct ncclCudaContext** out);
+// Drop reference.
+void ncclCudaContextDrop(struct ncclCudaContext* cxt);
+
 /* ncclCudaGraph: Wraps a cudaGraph_t so that we can support pre-graph CUDA runtimes
 * easily.
 */
 struct ncclCudaGraph {
 #if ROCM_VERSION >= 60100
+  cudaStream_t origin;
  cudaGraph_t graph;
  unsigned long long graphId;
 #endif
@@ -25,6 +36,7 @@ struct ncclCudaGraph {
 inline struct ncclCudaGraph ncclCudaGraphNone() {
  struct ncclCudaGraph tmp;
  #if ROCM_VERSION >= 60100
+    tmp.origin = nullptr;
    tmp.graph = nullptr;
    tmp.graphId = ULLONG_MAX;
  #endif
@@ -33,7 +45,7 @@ inline struct ncclCudaGraph ncclCudaGraphNone() {

 inline bool ncclCudaGraphValid(struct ncclCudaGraph graph) {
  #if ROCM_VERSION >= 60100
-    return graph.graph != nullptr;
+    return graph.graphId != ULLONG_MAX;
  #else
    return false;
  #endif
@@ -57,84 +69,69 @@ ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t
 * streams unfit for the use of serializing access to a persistent resource.
 * Strong streams have been introduced to address this need.
 *
- * - All updates to a strong stream must be enclosed by a Acquire/Release pair.
+ * All updates to a strong stream must be enclosed by an Acquire/Release pair.
 *
- * - The Acquire, Release, and all updates take a ncclCudaGraph parameter
- *   indicating the currently capturing graph (or none). This parameter must be
- *   the same for the entire sequence of {Acquire; ...; Release}.
+ * Acquire retrieves a "work" stream (cudaStream_t) which may be used to add
+ * work.
 *
- * - An {Acquire; ...; Release} sequence must not be concurrent with any
- *   other operations against the strong stream including graph launches which
- *   reference this stream.
+ * Release publishes the work stream's work into the strong stream. The Release
+ * must be issued by the same thread that did the Acquire.
 */
 struct ncclStrongStream;

 ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss);
 ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss);

-// Acquire-fence the strong stream.
+// Acquire the strong stream. Upon return `*workStream` will be usable to add work.
+// `concurrent` indicates if other threads may be using the strong stream.
 ncclResult_t ncclStrongStreamAcquire(
-  struct ncclCudaGraph graph, struct ncclStrongStream* ss
+  struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent, cudaStream_t* workStream
 );

-// Acquire-fence the strong stream assuming no graph is capturing. This permits
-// the caller to enqueue directly to the `ss->cudaStream` member using native CUDA
-// calls. Strong stream still must be released via:
-//   ncclStrongStreamRelease(ncclCudaGraphNone(), ss);
-ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss);
-
-// Release-fence of the strong stream.
-ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss);
-
-// Add a host launch to the stream.
-ncclResult_t ncclStrongStreamLaunchHost(
-  struct ncclCudaGraph graph, struct ncclStrongStream* ss,
-  cudaHostFn_t fn, void* arg
-);
-// Add a kernel launch to the stream.
-ncclResult_t ncclStrongStreamLaunchKernel(
-  struct ncclCudaGraph graph, struct ncclStrongStream* ss,
-  void* fn, dim3 grid, dim3 block, void** args, size_t sharedMemBytes
+// Get the workStream for an already acquired strong stream.
+// `concurrent` indicates if other threads may be using the strong stream.
+ncclResult_t ncclStrongStreamAcquiredWorkStream(
+  struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent, cudaStream_t* workStream
 );

-// Cause `a` to wait for the current state `b`. Both `a` and `b` must be acquired.
-// `b_subsumes_a` indicates that all work in `a` is already present in `b`, thus
-// we want to fast-forward `a` to be a clone of `b`. Knowing this permits the
-// implementation to induce few graph dependencies.
-ncclResult_t ncclStrongStreamWaitStream(
-  struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b, bool b_subsumes_a=false
-);
-// `b` must be capturing within `graph`.
-ncclResult_t ncclStrongStreamWaitStream(
-  struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b, bool b_subsumes_a=false
-);
-// `a` must be capturing within `graph`.
-ncclResult_t ncclStrongStreamWaitStream(
-  struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b, bool b_subsumes_a=false
+// Release of the strong stream.
+// `concurrent` indicates if other threads may be using the strong stream.
+ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent);
+
+ncclResult_t ncclStreamWaitStream(
+  cudaStream_t a, cudaStream_t b, cudaEvent_t scratchEvent
 );

+// Like cudaStreamWaitEvent except `e` must be strictly ahead of everything in `s`.
+ncclResult_t ncclStreamAdvanceToEvent(struct ncclCudaGraph g, cudaStream_t s, cudaEvent_t e);
+
 // Synchrnoization does not need the strong stream to be acquired.
 ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss);

 ////////////////////////////////////////////////////////////////////////////////

-struct ncclStrongStreamGraph; // internal to ncclStrongStream
+struct ncclStrongStreamCapture; // internal to ncclStrongStream

 struct ncclStrongStream {
-  // Used when not graph capturing.
-  cudaStream_t cudaStream;
+  // The stream to use for non-captured work.
+  cudaStream_t liveStream;
+  void* liveAcquiredBy;
 #if ROCM_VERSION >= 60100
+  // Whether this stream has ever appeared in a graph capture.
+  bool everCaptured;
+  pthread_mutex_t lock;
+  struct ncclStrongStreamCapture* captureHead;
  // The event used to establish order between graphs and streams. During acquire
  // this event is waited on, during release it is recorded to.
  cudaEvent_t serialEvent;
-  // This stream ever appeared in a graph capture.
-  bool everCaptured;
-  // Tracks whether serialEvent needs to be recorded to upon Release().
-  bool serialEventNeedsRecord;
-  struct ncclStrongStreamGraph* graphHead;
-#else
-  cudaEvent_t scratchEvent;
 #endif
 };

+struct ncclCudaContext {
+  struct ncclCudaContext* next;
+  CUcontext hcontext;
+  int refCount;
+  struct ncclStrongStream launchOrder;
+};
+
 #endif
@@ -19,6 +19,7 @@
 #define TRANSPORT_SHM 1
 #define TRANSPORT_NET 2
 #define TRANSPORT_COLLNET 3
+#define TRANSPORT_PROFILER 4

 #include "proxy.h"
 #include "comm.h"
@@ -27,6 +28,7 @@ extern struct ncclTransport p2pTransport;
 extern struct ncclTransport shmTransport;
 extern struct ncclTransport netTransport;
 extern struct ncclTransport collNetTransport;
+extern struct ncclTransport profilerTransport;

 extern struct ncclTransport* ncclTransports[];
 // Forward declarations
@@ -50,8 +52,10 @@ struct ncclNvlsSharedRes {
  CUmulticastObjectProp signalProp;
  CUmemAccessDesc accessDesc;
  int dev;
-  size_t buffSize;
-  size_t creditSize;
+  size_t creditUCSize;
+  size_t creditMCSize;
+  size_t buffUCSize;
+  size_t buffMCSize;
  CUmemGenericAllocationHandle mcBuffHandle; // Multicast handle for NVLS buffer
  CUmemGenericAllocationHandle mcCreditHandle; // Multicast handle for NVLS credit buffer
  char* mcBuff; // Multicast NVLS buffer address
@@ -108,7 +112,7 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm);
 ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm);
 ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, int* nCleanupQueueElts);
 ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv);
-ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size);
+ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t ucsize, size_t mcsize);
 ncclResult_t ncclNvlsFree(struct ncclComm* comm);

 enum { collNetRecv=0, collNetSend=1 };
@@ -87,17 +87,6 @@ NCCL_PARAM(RuntimeConnect, "RUNTIME_CONNECT", 1);
 struct allocationTracker allocTracker[MAX_ALLOC_TRACK_NGPU] = {};
 static ncclResult_t commReclaim(ncclComm_t comm);

-static uint64_t hashUniqueId(ncclUniqueId const &id) {
-  char const *bytes = (char const*)&id;
-  uint64_t h = 0xdeadbeef;
-  for(int i=0; i < (int)sizeof(ncclUniqueId); i++) {
-    h ^= h >> 32;
-    h *= 0x8db3db47fa2994ad;
-    h += bytes[i];
-  }
-  return h;
-}
-
 //RCCL runtime param to set Unroll Factor
 RCCL_PARAM(UnrollFactor, "UNROLL_FACTOR", 0);

@@ -131,7 +120,7 @@ ncclResult_t commSetUnrollFactor(struct ncclComm* comm) {

 #ifdef ENABLE_MSCCLPP
 size_t std::hash<ncclUniqueId>::operator ()(const ncclUniqueId& uniqueId) const noexcept {
-  return (size_t)hashUniqueId(uniqueId);
+  return (size_t)getHash(uniqueId.internal, NCCL_UNIQUE_ID_BYTES);
 }

 bool operator ==(const ncclUniqueId& a, const ncclUniqueId& b) {
@@ -237,7 +226,7 @@ ncclResult_t ncclGetUniqueId_impl(ncclUniqueId* out) {
  // copy to avoid alignment mismatch
  memcpy(out, &handle, sizeof(handle));
  Recorder::instance().record(rrGetUniqueId, -1, -1, out);
-  TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(*out));
+  TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)getHash(out->internal, NCCL_UNIQUE_ID_BYTES));
  return ncclSuccess;
 }

@@ -485,6 +474,8 @@ static ncclResult_t commFree(ncclComm_t comm) {
      free(comm->sharedRes->tpRankToLocalRank);
      NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->hostStream));
      NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->deviceStream));
+      CUDACHECK(cudaEventDestroy(comm->sharedRes->launchEvent));
+      CUDACHECK(cudaEventDestroy(comm->sharedRes->scratchEvent));
      NCCLCHECK(ncclProxyDestroy(comm));
      free(comm->sharedRes);
    }
@@ -524,6 +515,9 @@ static ncclResult_t commFree(ncclComm_t comm) {
  NCCLCHECK(ncclProfilerPluginFinalize(comm));
  NCCLCHECK(ncclNetFinalize(comm));
  NCCLCHECK(ncclNetPluginUnload(comm));
+
+  ncclCudaContextDrop(comm->context);
+
  free(comm);

  return ncclSuccess;
@@ -570,17 +564,12 @@ ncclResult_t ncclCommEnsureReady(ncclComm_t comm) {
    ncclGroupJobAbort(comm->groupJob);
  } else {
    NCCLCHECK(ncclCommGetAsyncError(comm, &ret));
-    if (ret != ncclSuccess) {
-      /* if ret is not ncclInProgress, we just keep it. */
+    if (ret == ncclInProgress) {
      WARN("Attempt to use communicator before the previous operation returned ncclSuccess");
-      if (ret == ncclInProgress) ret = ncclInvalidArgument;
+      ret = ncclInvalidArgument;
      goto exit;
    }
-    /* if there is linked group job, we should complete it. */
-    if (comm->groupJob) {
-      NCCLCHECK(ncclGroupJobComplete(comm->groupJob));
-      comm->groupJob = NULL;
-    }
+    /* if ret is not ncclInProgress, we just keep it. */
  }

 exit:
@@ -625,6 +614,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
  comm->lastStream = nullptr;
  CUDACHECK(cudaGetDevice(&comm->cudaDev));

+  NCCLCHECK(ncclCudaContextTrack(&comm->context));
+
  NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
  char busId[]="0000:00:00.0";
  NCCLCHECK(int64ToBusId(comm->busId, busId));
@@ -688,6 +679,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
    NCCLCHECK(ncclCalloc(&sharedRes->tpRankToLocalRank, comm->nRanks));
    NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->deviceStream));
    NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->hostStream));
+    CUDACHECK(cudaEventCreateWithFlags(&sharedRes->launchEvent, cudaEventDisableTiming));
+    CUDACHECK(cudaEventCreateWithFlags(&sharedRes->scratchEvent, cudaEventDisableTiming));
    comm->sharedRes = sharedRes;
    sharedRes->refCount = 1;
  } else {
@@ -730,13 +723,14 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
  struct ncclDevCommAndChannels *devCommAndChans = NULL;
  struct ncclNvmlCCStatus ccStatus;
  bool ccEnable = false;
+  cudaStream_t deviceStream;

-  NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), ret, fail);
-  NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail);
+  NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), ret, fail);
+  NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, deviceStream), ret, fail);
  ncclCommPushCudaFree(comm, devCommAndChans);
-  NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.rankToLocalRank, comm->nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail);
+  NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.rankToLocalRank, comm->nRanks, deviceStream), ret, fail);
  ncclCommPushCudaFree(comm, tmpCommAndChans.comm.rankToLocalRank);
-  NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.rankToLocalRank, comm->rankToLocalRank, comm->nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail);
+  NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.rankToLocalRank, comm->rankToLocalRank, comm->nRanks, deviceStream), ret, fail);
  comm->devComm = &devCommAndChans->comm;
  tmpCommAndChans.comm.rank = comm->rank;
  tmpCommAndChans.comm.nRanks = nRanks;
@@ -759,12 +753,22 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
  if (ccEnable) {
    comm->workFifoBytes = 0;
  } else {
-    comm->workFifoBytes = ncclParamWorkFifoBytes();
-    if (0 != (comm->workFifoBytes & (comm->workFifoBytes-1))) {
-      WARN("NCCL_WORK_FIFO_BYTES=%d is being ignored because it is not a power of 2.", comm->workFifoBytes);
-      comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT;
+    int64_t workFifoBytesParam = ncclParamWorkFifoBytes();
+    if (workFifoBytesParam == -1) { // -1 selects the built-in default sizing below
+      if (comm->MNNVL && (comm->compCap >= 100)) {
+        // WAR: Disable work fifo for Blackwell all2all hang issue on MNNVL
+        INFO(NCCL_INIT, "Disabling work fifo");
+        comm->workFifoBytes = 0;
+      } else {
+        comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT;
+      }
+    } else {
+      if (0 != (workFifoBytesParam & (workFifoBytesParam-1))) {
+        WARN("NCCL_WORK_FIFO_BYTES=%ld is being ignored because it is not a power of 2.", workFifoBytesParam);
+        workFifoBytesParam = NCCL_WORK_FIFO_BYTES_DEFAULT; // really ignore it: the clamp below must not re-apply the rejected value
+      }
+      comm->workFifoBytes = std::min<uint64_t>(workFifoBytesParam, 1ul<<30);
    }
-    comm->workFifoBytes = std::min(comm->workFifoBytes, 1u<<30);
  }
 #else
  comm->workFifoBytes = ncclParamWorkFifoBytes();
@@ -797,10 +801,18 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
  comm->workFifoConsumedLeast = 0;
  tmpCommAndChans.comm.workConsumed = comm->workFifoConsumed;

+  // Alloc profiler counters for the kernel
+  NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->profiler.workStarted, MAXCHANNELS), ret, fail);
+  NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->profiler.workCompleted, MAXCHANNELS), ret, fail);
+  tmpCommAndChans.comm.workStarted = comm->profiler.workStarted;
+  tmpCommAndChans.comm.workCompleted = comm->profiler.workCompleted;
+  ncclCommPushCudaHostFree(comm, comm->profiler.workStarted);
+  ncclCommPushCudaHostFree(comm, comm->profiler.workCompleted);
+
  if (comm->collNetDenseToUserRank != nullptr) {
-    NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.collNetDenseToUserRank, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail);
+    NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.collNetDenseToUserRank, nRanks, deviceStream), ret, fail);
    ncclCommPushCudaFree(comm, tmpCommAndChans.comm.collNetDenseToUserRank);
-    NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.collNetDenseToUserRank, comm->collNetDenseToUserRank, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail);
+    NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.collNetDenseToUserRank, comm->collNetDenseToUserRank, nRanks, deviceStream), ret, fail);
  }

  for (int c=0; c < MAXCHANNELS; c++) {
@@ -814,7 +826,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
    tmpCommAndChans.channels[c].nvls = comm->channels[c].nvls;

    if (comm->channels[c].ring.userRanks != nullptr) {
-      NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail);
+      NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, deviceStream), ret, fail);
    }
  }

@@ -839,10 +851,10 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
  tmpCommAndChans.comm.faults = comm->faults;
 #endif

-  NCCLCHECKGOTO(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail);
+  NCCLCHECKGOTO(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, deviceStream), ret, fail);
 exit:
+  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false));
  NCCLCHECK(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream));
-  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream));
  return ret;
 fail:
  goto exit;
@@ -1507,6 +1519,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
      graphs[a]->typeInter = std::max(allGather3Data[i].graphInfo[a].typeInter, graphs[a]->typeInter);
      graphs[a]->crossNic = std::max(allGather3Data[i].graphInfo[a].crossNic, graphs[a]->crossNic);
    }
+    comm->maxTreePattern = std::max(comm->maxTreePattern, allGather3Data[i].graphInfo[NCCL_ALGO_TREE].pattern);
  }
  if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->collNetSupport = 0;
  if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = comm->nvlsChannels = 0;
@@ -1932,12 +1945,12 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
    timers[TIMER_INIT_ALLOC] = clockNano();
    NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail);
    timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC];
-    // obtain a unique hash for the comm, re-using part of the parent's hash, commHash is a 64bit struct (=16 hex),
-    // add unique split counter and the color
-    ncclUniqueId tmpId;
-    memset(&tmpId,0,sizeof(ncclUniqueId));// must set 0 here to avoid undefined bits
-    snprintf((char*)&tmpId, NCCL_UNIQUE_ID_BYTES, "%016lx-%d-%d", job->parent->commHash, job->splitCount, job->color);
-    comm->commHash = getHash(tmpId.internal, NCCL_UNIQUE_ID_BYTES);
+    // child hash obtained from (parent hash, split count, color)
+    uint64_t hacc[2] = {1, 1};
+    eatHash(hacc, &job->parent->commHash);
+    eatHash(hacc, &job->splitCount);
+    eatHash(hacc, &job->color);
+    comm->commHash = digestHash(hacc);
    INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p splitCount %d color %d key %d- Init START", job->funcName,
         comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key);
    timers[TIMER_INIT_BOOTSTRAP] = clockNano();
@@ -1950,8 +1963,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
    NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail);
    timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC];
    // obtain a unique hash using the first commId
-    comm->commHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES);
-    commIdHash = hashUniqueId(job->commId[0]);
+    comm->commHash = commIdHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES);
    INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", job->funcName,
         comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash);
    timers[TIMER_INIT_BOOTSTRAP] = clockNano();
@@ -1991,12 +2003,12 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
        auto& mscclppUniqueId = mscclpp_uniqueIdMap[*job->commId];
        if (comm->localRank == 0 && !mapContainsId) {
          NCCLCHECKGOTO(mscclpp_ncclGetUniqueId(&mscclppUniqueId), res, fail);
-          TRACE_CALL("mscclpp_ncclGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(mscclppUniqueId));
+          TRACE_CALL("mscclpp_ncclGetUniqueId(0x%llx)", (unsigned long long)getHash(mscclppUniqueId.internal, NCCL_UNIQUE_ID_BYTES));
        }

        NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, &mscclppUniqueId, sizeof(mscclppUniqueId)), res, fail);
        unsigned long long mscclppUniqueIdHash; (void)mscclppUniqueIdHash;
-        TRACE_CALL("bootstrapIntraNodeBroadcast(rank=%d, nranks=%d, root=%d, bcastData=hash:0x%llx)", comm->localRank, comm->localRanks, 0, (mscclppUniqueIdHash = (unsigned long long)hashUniqueId(mscclppUniqueId)));
+        TRACE_CALL("bootstrapIntraNodeBroadcast(rank=%d, nranks=%d, root=%d, bcastData=hash:0x%llx)", comm->localRank, comm->localRanks, 0, (mscclppUniqueIdHash = (unsigned long long)getHash(mscclppUniqueId.internal, NCCL_UNIQUE_ID_BYTES)));
        mscclpp_uniqueIdReverseMap[mscclppUniqueId].insert(*job->commId);

        comm->mscclpp_threshold = rcclParamMscclppThreshold();
@@ -2228,6 +2240,7 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) {
  NCCL_CONFIG_DEFAULT(internalConfigPtr, maxCTAs, NCCL_CONFIG_UNDEF_INT, MAXCHANNELS, "Max CTAs", "%d");
  NCCL_CONFIG_DEFAULT(internalConfigPtr, netName, NCCL_CONFIG_UNDEF_PTR, NULL, "Net name", "%s");
  NCCL_CONFIG_DEFAULT(internalConfigPtr, splitShare, NCCL_CONFIG_UNDEF_INT, 0, "Split share", "%d");
+  NCCL_CONFIG_DEFAULT(internalConfigPtr, trafficClass, NCCL_CONFIG_UNDEF_INT, NCCL_CONFIG_UNDEF_INT, "Traffic class", "%d");

  /* assign config to communicator */
  comm->config.blocking = internalConfigPtr->blocking;
@@ -2236,6 +2249,7 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) {
  comm->config.maxCTAs = internalConfigPtr->maxCTAs;
  comm->config.netName = internalConfigPtr->netName;
  comm->config.splitShare = internalConfigPtr->splitShare;
+  comm->config.trafficClass = internalConfigPtr->trafficClass;

  NCCLCHECKGOTO(envConfigOverride(comm), ret, fail);

@@ -2260,6 +2274,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId
  const char* commIdEnv = NULL;
  ncclComm_t comm = NULL;
  struct ncclCommInitRankAsyncJob* job = NULL;
+  bool launchedJob = false;
  // first call ncclInit, this will setup the environment
  NCCLCHECKGOTO(ncclInit(), res, fail);

@@ -2313,6 +2328,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId
    // start the bootstrap root before bootstrapping, use only the first handle
    NCCLCHECKGOTO(bootstrapCreateRoot((struct ncclBootstrapHandle*)&job->commId[0], true), res, fail);
  }
+  launchedJob = true;
  NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, NULL, ncclCommInitJobFree, comm), res, fail);

 exit:
@@ -2321,7 +2337,7 @@ exit:
  NCCLCHECK(Recorder::instance().record(rrCommInitDev, nranks, myrank, commId, comm, cudaDev));
  return ncclGroupErrCheck(res);
 fail:
-  if (job) ncclCommInitJobFree(job);
+  if (job && !launchedJob) ncclCommInitJobFree(job);
  if (comm) {
    free(comm->abortFlag);
    if (comm->abortFlagDev) (void)ncclCudaHostFree((void*)comm->abortFlagDev);
@@ -2520,7 +2536,7 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) {
    NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), ret, fail);
    NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail);
    // And keep polling until all graphs referencing us die.
-    while (comm->persistentRefs != 0) {
+    while (comm->localPersistentRefs != 0) {
      NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/true), ret, fail);
    }
    while (!ncclIntruQueueEmpty(&comm->legacyRegCleanupQueue)) {
@@ -2613,7 +2629,6 @@ exit:
  }
  return ret;
 fail:
-  free(job);
  if (comm && !comm->config.blocking) (void) ncclCommSetAsyncError(comm, ret);
  goto exit;
 }
@@ -2896,6 +2911,11 @@ ncclResult_t ncclCommGetAsyncError_impl(ncclComm_t comm, ncclResult_t *asyncErro

  *asyncError = __atomic_load_n(&comm->asyncResult, __ATOMIC_ACQUIRE);
  if (*asyncError == ncclSuccess && comm->proxyState) *asyncError = __atomic_load_n(&comm->proxyState->asyncResult, __ATOMIC_ACQUIRE);
+  /* if there is linked group job, we should complete it. */
+  if (*asyncError == ncclSuccess && comm->groupJob) {
+    NCCLCHECK(ncclGroupJobComplete(comm->groupJob));
+    comm->groupJob = NULL;
+  }
  return ncclSuccess;
 }

@@ -2949,16 +2969,13 @@ ncclResult_t  ncclMemAlloc_impl(void **ptr, size_t size) {

 #if CUDART_VERSION >= 12010
  size_t memGran = 0;
-  size_t mcGran = 0;
  CUdevice currentDev;
  CUmemAllocationProp memprop = {};
-  CUmulticastObjectProp mcprop = {};
  CUmemAccessDesc accessDesc = {};
  CUmemGenericAllocationHandle handle;
  int cudaDev;
  int flag;
  int dcnt;
-  int mcSupport = 0;

  if (ptr == NULL || size == 0) goto fallback;

@@ -2968,6 +2985,7 @@ ncclResult_t  ncclMemAlloc_impl(void **ptr, size_t size) {
  CUCHECK(cuDeviceGet(&currentDev, cudaDev));

  if (ncclCuMemEnable()) {
+    size_t handleSize = size;
    int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
    // Query device to see if FABRIC handle support is available
    flag = 0;
@@ -2983,40 +3001,25 @@ ncclResult_t  ncclMemAlloc_impl(void **ptr, size_t size) {
    if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1;
    CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
    CUDACHECK(cudaGetDeviceCount(&dcnt));
-
-    if (CUPFN(cuMulticastCreate) != NULL) CUCHECK(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, currentDev));
-    if (mcSupport) {
-      /* mc property */
-      mcprop.size = size;
-      /* device cnt is a dummy value right now, it might affect mc granularity in the future. */
-      mcprop.numDevices = dcnt;
-      mcprop.handleTypes = requestedHandleTypes;
-      mcprop.flags = 0;
-      CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
-
-      /* only size needs to be aligned to mcGran */
-      ALIGN_SIZE(size, mcGran);
-    } else {
-      ALIGN_SIZE(size, memGran);
-    }
+    ALIGN_SIZE(handleSize, memGran);

    if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC) {
      /* First try cuMemCreate() with FABRIC handle support and then remove if it fails */
-      CUresult err = CUPFN(cuMemCreate(&handle, size, &memprop, 0));
+      CUresult err = CUPFN(cuMemCreate(&handle, handleSize, &memprop, 0));
      if (err == CUDA_ERROR_NOT_PERMITTED || err == CUDA_ERROR_NOT_SUPPORTED) {
        requestedHandleTypes &= ~CU_MEM_HANDLE_TYPE_FABRIC;
        memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes;
        /* Allocate the physical memory on the device */
-        CUCHECK(cuMemCreate(&handle, size, &memprop, 0));
+        CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0));
      }
    } else {
      /* Allocate the physical memory on the device */
-      CUCHECK(cuMemCreate(&handle, size, &memprop, 0));
+      CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0));
    }
    /* Reserve a virtual address range */
-    CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, size, memGran, 0, 0));
+    CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, handleSize, memGran, 0, 0));
    /* Map the virtual address range to the physical allocation */
-    CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
+    CUCHECK(cuMemMap((CUdeviceptr)*ptr, handleSize, 0, handle, 0));
    /* Now allow RW access to the newly mapped memory */
    for (int i = 0; i < dcnt; ++i) {
      int p2p = 0;
@@ -3024,7 +3027,7 @@ ncclResult_t  ncclMemAlloc_impl(void **ptr, size_t size) {
        accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        accessDesc.location.id = i;
        accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-        CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
+        CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, handleSize, &accessDesc, 1));
      }
      if (0 == p2p && i != cudaDev) INFO(NCCL_ALLOC, "P2P not supported between GPU%d and GPU%d", cudaDev, i);
    }
@@ -4,6 +4,7 @@
 * See LICENSE.txt for license information
 ************************************************************************/

+#include "alloc.h"
 #include "nccl.h"
 #include "debug.h"
 #include "param.h"
@@ -67,6 +68,36 @@ int ncclCuMemHostEnable() {
      ncclCumemHostEnable = paramValue;
    else
      ncclCumemHostEnable = (cudaDriverVersion >= 12060) ? 1 : 0;
+    if (ncclCumemHostEnable) {
+      // Verify that host allocations actually work.  Docker in particular is known to disable "get_mempolicy",
+      // causing such allocations to fail (this can be fixed by invoking Docker with "--cap-add SYS_NICE").
+      int cudaDev;
+      CUdevice currentDev;
+      int cpuNumaNodeId = -1;
+      CUmemAllocationProp prop = {};
+      size_t granularity = 0;
+      size_t size;
+      CUmemGenericAllocationHandle handle;
+      CUDACHECK(cudaGetDevice(&cudaDev));
+      CUCHECK(cuDeviceGet(&currentDev, cudaDev));
+      CUCHECK(cuDeviceGetAttribute(&cpuNumaNodeId, CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID, currentDev));
+      if (cpuNumaNodeId < 0) cpuNumaNodeId = 0;
+      prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
+      prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+      prop.requestedHandleTypes = ncclCuMemHandleType;
+      prop.location.id = cpuNumaNodeId;
+      CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+      size = 1;
+      ALIGN_SIZE(size, granularity);
+      if (CUPFN(cuMemCreate(&handle, size, &prop, 0)) != CUDA_SUCCESS) {
+        INFO(NCCL_INIT, "cuMem host allocations do not appear to be working; falling back to a /dev/shm/ based "
+             "implementation. This could be due to the container runtime disabling NUMA support. "
+             "To disable this warning, set NCCL_CUMEM_HOST_ENABLE=0");
+        ncclCumemHostEnable = 0;
+      } else {
+        CUCHECK(cuMemRelease(handle));
+      }
+    }
  }
  return ncclCumemHostEnable;
 error:
@@ -169,7 +169,7 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen,
  } control_un;

  struct cmsghdr *cmptr;
-  char dummy_buffer[1];
+  char dummy_buffer[1] = {'\0'};
  struct sockaddr_un cliaddr;

  // Construct client address to send this shareable handle to
@@ -190,6 +190,7 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen,
  TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d fd %d to UDS socket %s", hdr, hdrLen, sendFd, temp);

  if (sendFd != -1) {
+    memset(&control_un, '\0', sizeof(control_un));
    msg.msg_control = control_un.control;
    msg.msg_controllen = sizeof(control_un.control);

@@ -32,6 +32,7 @@ void setEnvFile(const char* fileName) {
  size_t n = 0;
  ssize_t read;
  while ((read = getline(&line, &n, file)) != -1) {
+    if (line[0] == '#') continue;
    if (line[read-1] == '\n') line[read-1] = '\0';
    int s=0; // Env Var Size
    while (line[s] != '\0' && line[s] != '=') s++;
@@ -176,6 +176,7 @@ static int findInterfaces(const char* prefixList, char* names, union ncclSocketA
      strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize);
      // Store the IP address
      int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
+      memset(addrs+found, '\0', sizeof(*addrs));
      memcpy(addrs+found, interface->ifa_addr, salen);
      found++;
    }
@@ -919,9 +920,17 @@ ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how) {
  return ncclSuccess;
 }

-ncclResult_t ncclSocketClose(struct ncclSocket* sock) {
+ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait) {
  if (sock != NULL) {
    if (sock->state > ncclSocketStateNone && sock->state < ncclSocketStateNum && sock->fd >= 0) {
+      if (wait) {
+        char data;
+        int closed = 0;
+        do {
+          int offset = 0;
+          if (ncclSocketProgress(NCCL_SOCKET_RECV, sock, &data, sizeof(char), &offset, &closed) != ncclSuccess) break;
+        } while (closed == 0);
+      }
      /* shutdown() is needed to send FIN packet to proxy thread; shutdown() is not affected
       * by refcount of fd, but close() is. close() won't close a fd and send FIN packet if
       * the fd is duplicated (e.g. fork()). So shutdown() guarantees the correct and graceful
@@ -9,28 +9,61 @@
 #include "checks.h"
 #include "param.h"

-// Tracks the chain of graph nodes for a given graph captured identified by
-// its graph id. This state has to live for as long as captured work is being
-// submitted. CUDA doesn't have mechanism to inform us when the user ends capture
-// so the best we can do is get notified when the graph is destroyed.
-struct ncclStrongStreamGraph {
-  struct ncclStrongStreamGraph* next;
-  // Atomically exchanged to false by both the main thread or the graph destructor
-  // callback. The last to arrive deletes the node.
-  bool alive;
+// Tracks the work captured for a given graph, identified by its graph id.
+struct ncclStrongStreamCapture {
+  struct ncclStrongStreamCapture* next; // Next record in the list headed by ncclStrongStream::captureHead.
+  cudaGraph_t graph;
  unsigned long long graphId;
-  // For each graph we track the "tip" of the chain of graph nodes. A linear
-  // chain would always have just one node at its tip, but since we have to merge
-  // in chains from other streams (via ncclStrongStreamWaitStream) some spots
-  // in the chain can be wider than a single node and thus need a list, so we
-  // maintain a dynamically sized array of tip nodes.
-  int tipCount, tipCapacity;
-  cudaGraphNode_t* tipNodes;
+  cudaStream_t captureStream; // Destroyed per record by ncclStrongStreamDestruct.
+  cudaGraphNode_t lastRecord;
+  void* acquiredBy; // NOTE(review): looks like the per-capture counterpart of ncclStrongStream::liveAcquiredBy — confirm
 };

-static void ncclStrongStreamGraphDelete(struct ncclStrongStreamGraph* g) {
-  free(g->tipNodes);
-  free(g);
+////////////////////////////////////////////////////////////////////////////////
+
+static ncclCudaContext* cxtListHead = nullptr;
+static pthread_mutex_t cxtListLock = PTHREAD_MUTEX_INITIALIZER;
+
+ncclResult_t ncclCudaContextTrack(struct ncclCudaContext** out) {
+  // Returns in *out the ref-counted tracking record of the calling thread's current context, creating
+  // it on first use. On failure *out is nullptr and the error code is returned (it used to be dropped).
+  ncclResult_t result = ncclSuccess;
+  CUcontext hcontext = nullptr; // stays null, not garbage, if the query below fails
+  cuCtxGetCurrent(&hcontext);
+
+  pthread_mutex_lock(&cxtListLock);
+  struct ncclCudaContext* p = cxtListHead;
+  while (p != nullptr && p->hcontext != hcontext) p = p->next;
+  if (p != nullptr) {
+    p->refCount += 1; // context already tracked: share the existing record
+    goto leave;
+  }
+  p = (struct ncclCudaContext*)calloc(1, sizeof(struct ncclCudaContext));
+  if (p == nullptr) { WARN("Failed to allocate ncclCudaContext"); result = ncclSystemError; goto leave; }
+  // Construct before publishing so a half-built record never becomes reachable from the list.
+  NCCLCHECKGOTO(ncclStrongStreamConstruct(&p->launchOrder), result, leave);
+  p->refCount = 1;
+  p->hcontext = hcontext;
+  p->next = cxtListHead;
+  cxtListHead = p;
+leave:
+  if (result != ncclSuccess) { free(p); p = nullptr; } // failures only happen before the record is linked
+  pthread_mutex_unlock(&cxtListLock);
+  *out = p;
+  return result;
+}
+
+void ncclCudaContextDrop(struct ncclCudaContext* cxt) { // Releases one reference taken by ncclCudaContextTrack.
+  if (cxt == nullptr) return; // tolerate communicators whose context was never tracked
+  pthread_mutex_lock(&cxtListLock);
+  if (0 == --cxt->refCount) { // last reference: unlink the record and destroy it
+    struct ncclCudaContext** pp = &cxtListHead;
+    while (*pp != cxt) pp = &(*pp)->next; // relies on cxt being linked, which holds for every tracked record
+    *pp = cxt->next; // remove from list
+    // Destroy resources held in cxt; a void function cannot report failure, so this is best effort.
+    (void)ncclStrongStreamDestruct(&cxt->launchOrder);
+    free(cxt);
+  }
+  pthread_mutex_unlock(&cxtListLock);
 }

 ////////////////////////////////////////////////////////////////////////////////
@@ -40,13 +73,14 @@ ncclResult_t ncclCudaGetCapturingGraph(
  ) {
 #if ROCM_VERSION >= 60100
  hipStreamCaptureStatus status;
-  unsigned long long gid;
-  CUDACHECK(hipStreamGetCaptureInfo_v2(stream, &status, &gid, &graph->graph, nullptr, nullptr));
+  CUDACHECK(hipStreamGetCaptureInfo_v2(stream, &status, &graph->graphId, &graph->graph, nullptr, nullptr));
  if (status != hipStreamCaptureStatusActive) {
+    graph->origin = nullptr;
    graph->graph = nullptr;
-    gid = ULLONG_MAX;
+    graph->graphId = ULLONG_MAX;
+  } else {
+    graph->origin = stream;
  }
-  graph->graphId = gid;
 #endif
  return ncclSuccess;
 }
@@ -68,315 +102,250 @@ ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t
 ////////////////////////////////////////////////////////////////////////////////

 ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss) { // Initializes *ss: creates its live stream and, on ROCm >= 6.1, the capture bookkeeping.
-  CUDACHECK(cudaStreamCreateWithFlags(&ss->cudaStream, cudaStreamNonBlocking));
+  CUDACHECK(cudaStreamCreateWithFlags(&ss->liveStream, cudaStreamNonBlocking)); // Stream handed out by acquire for uncaptured work.
   #if ROCM_VERSION >= 60100
-    CUDACHECK(cudaEventCreateWithFlags(&ss->serialEvent, cudaEventDisableTiming));
     ss->everCaptured = false;
-    ss->serialEventNeedsRecord = false;
-    ss->graphHead = nullptr;
-  #else
-    CUDACHECK(cudaEventCreateWithFlags(&ss->scratchEvent, cudaEventDisableTiming));
+    ss->captureHead = nullptr; // No per-graph capture entries yet.
+    pthread_mutex_init(&ss->lock, nullptr); // Taken around capture-list accesses when callers pass concurrent=true.
+    CUDACHECK(cudaEventCreateWithFlags(&ss->serialEvent, cudaEventDisableTiming)); // Event used to order live and captured work when graph mixing is enabled.
   #endif
   return ncclSuccess;
 }

-static void graphDestructor(void* arg) {
-  struct ncclStrongStreamGraph* g = (struct ncclStrongStreamGraph*)arg;
-  if (false == __atomic_exchange_n(&g->alive, false, __ATOMIC_ACQ_REL)) {
-    // Last to arrive deletes list node.
-    ncclStrongStreamGraphDelete(g);
-  }
-}
-
 ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss) { // Destroys the live stream and, on ROCm >= 6.1, every capture entry, the serial event and the mutex of *ss.
-  CUDACHECK(cudaStreamDestroy(ss->cudaStream));
+  CUDACHECK(cudaStreamDestroy(ss->liveStream));
   #if ROCM_VERSION >= 60100
-    CUDACHECK(cudaEventDestroy(ss->serialEvent));
-    // Delete list of per-graph chains.
-    struct ncclStrongStreamGraph* g = ss->graphHead;
-    while (g != nullptr) {
-      struct ncclStrongStreamGraph* next = g->next;
-      if (false == __atomic_exchange_n(&g->alive, false, __ATOMIC_ACQ_REL)) {
-        // Last to arrive deletes list node.
-        ncclStrongStreamGraphDelete(g);
-      }
-      g = next;
+    struct ncclStrongStreamCapture* cap = ss->captureHead; // Walk the capture list, destroying each entry's stream and freeing the entry.
+    while (cap) {
+      struct ncclStrongStreamCapture* next = cap->next; // Saved first because cap is freed below.
+      CUDACHECK(cudaStreamDestroy(cap->captureStream)); // NOTE(review): if CUDACHECK returns on error (as it appears to), the remaining entries, the event and the mutex are left undestroyed.
+      free(cap);
+      cap = next;
     }
-  #else
-    CUDACHECK(cudaEventDestroy(ss->scratchEvent));
+    CUDACHECK(cudaEventDestroy(ss->serialEvent));
+    pthread_mutex_destroy(&ss->lock);
   #endif
   return ncclSuccess;
 }

 NCCL_PARAM(GraphMixingSupport, "GRAPH_MIXING_SUPPORT", 0) // Read below through ncclParamGraphMixingSupport(); presumably the last argument is the default value.
+NCCL_PARAM(LaunchRaceFatal, "LAUNCH_RACE_FATAL", 1); // Read through ncclParamLaunchRaceFatal(); when nonzero, a detected launch race makes release return ncclInvalidUsage.
+constexpr char const* launchRaceFatalMsg = "Fatal: host threads racing to launch NCCL on same device."; // Text passed to WARN when that race is detected.

-static void ensureTips(struct ncclStrongStreamGraph* g, int n) {
-  if (g->tipCapacity < n) {
-    g->tipNodes = (cudaGraphNode_t*)realloc(g->tipNodes, n*sizeof(cudaGraphNode_t));
-    g->tipCapacity = n;
-  }
-}
+static __thread char threadIdMarker; // Thread-local byte; only its address is used, never its value.
+static void* localThreadId() { return &threadIdMarker; } // Address of the calling thread's marker, used as an opaque identity for the acquiring thread.

 ncclResult_t ncclStrongStreamAcquire( // Acquires *ss for the given capture state and returns in *workStream the stream the caller should enqueue on.
-    struct ncclCudaGraph graph, struct ncclStrongStream* ss
+   struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent, // concurrent: take ss->lock around capture-list accesses.
+   cudaStream_t* workStream
   ) {
   #if ROCM_VERSION >= 60100
     bool mixing = ncclParamGraphMixingSupport();
-    if (graph.graph == nullptr) {
-      if (mixing && ss->everCaptured) {
-        CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0));
-        ss->serialEventNeedsRecord = false;
+    if (graph.graphId == ULLONG_MAX) { // ULLONG_MAX is the graphId stored when the origin stream is not capturing.
+      *workStream = ss->liveStream;
+      ss->liveAcquiredBy = localThreadId(); // Compared against the releasing thread in ncclStrongStreamRelease.
+      if (mixing && __atomic_load_n(&ss->everCaptured, __ATOMIC_RELAXED)) {
+        CUDACHECK(cudaStreamWaitEvent(ss->liveStream, ss->serialEvent, 0)); // Order live work after the last serialEvent record.
       }
     } else {
-      ss->everCaptured = true;
-      // Find the current graph in our list of graphs if it exists.
-      struct ncclStrongStreamGraph** pg = &ss->graphHead;
-      struct ncclStrongStreamGraph* g;
-      while (*pg != nullptr) {
-        g = *pg;
-        if (g->graphId == graph.graphId) {
-          // Move to front of list so that operations after acquire don't have to search the list.
-          *pg = g->next;
-          g->next = ss->graphHead;
-          ss->graphHead = g;
+      bool firstCapture = !ss->everCaptured; // True only when no captured acquire has flagged this strong stream before.
+      __atomic_store_n(&ss->everCaptured, true, __ATOMIC_RELAXED);
+
+      ncclResult_t ret = ncclSuccess;
+      if (concurrent) pthread_mutex_lock(&ss->lock);
+
+      // Look for capture in our list of active captures.
+      struct ncclStrongStreamCapture** pcap = &ss->captureHead;
+      struct ncclStrongStreamCapture* cap;
+      struct ncclStrongStreamCapture* spare = nullptr;
+      while (*pcap != nullptr) {
+        cap = *pcap;
+        if (cap->graphId == graph.graphId) { // Capture node already exists.
+          *workStream = cap->captureStream;
+          cap->acquiredBy = localThreadId();
+          if (concurrent) pthread_mutex_unlock(&ss->lock);
           return ncclSuccess;
-        } else if (false == __atomic_load_n(&g->alive, __ATOMIC_ACQUIRE)) {
-          // Unrelated graph that has been destroyed. Remove and delete.
-          *pg = g->next;
-          ncclStrongStreamGraphDelete(g);
         } else {
-          pg = &g->next;
+          cudaStreamCaptureStatus status;
+          CUDACHECKGOTO(cudaStreamIsCapturing(cap->captureStream, &status), ret, do_unlock); // NOTE(review): on failure any `spare` already unlinked above is neither relinked nor freed.
+          if (status == cudaStreamCaptureStatusActive) {
+            pcap = &cap->next; // Active capture doesn't match, on to next.
+          } else { // Capture no longer active
+            *pcap = cap->next; // Remove from current list
+            if (spare == nullptr) { // Keep one spare to reuse below.
+              spare = cap;
+            } else {
+              cudaStreamDestroy(cap->captureStream);
+              free(cap);
+            }
+          }
         }
       }
-
-      // This is a new graph so add to the list.
-      g = (struct ncclStrongStreamGraph*)malloc(sizeof(struct ncclStrongStreamGraph));
-      g->graphId = graph.graphId;
-      g->tipNodes = nullptr;
-      g->tipCapacity = 0;
-      g->tipCount = 0;
-      g->next = ss->graphHead;
-      ss->graphHead = g;
-      g->alive = true;
-      NCCLCHECK(ncclCudaGraphAddDestructor(graph, graphDestructor, (void*)g));
-
-      if (mixing && ss->serialEventNeedsRecord) {
-        // Can only be here if previous release was for uncaptured work that
-        // elided updating the event because no capture had yet occurred.
-        CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0));
-        CUDACHECK(cudaEventRecord(ss->serialEvent, ss->cudaStream));
+      // No matching capture, need a new entry.
+      cap = spare;
+      if (cap == nullptr) {
+        cap = (struct ncclStrongStreamCapture*)calloc(1, sizeof(struct ncclStrongStreamCapture)); // NOTE(review): result is not checked for nullptr.
+        CUDACHECKGOTO(cudaStreamCreateWithFlags(&cap->captureStream, cudaStreamNonBlocking), ret, do_unlock); // NOTE(review): on failure the freshly allocated cap is not freed.
       }
-      ss->serialEventNeedsRecord = false;
+      cap->graphId = graph.graphId;
+      cap->lastRecord = nullptr; // No serialEvent record node has been added for this capture yet.
+      cap->acquiredBy = localThreadId();
+      // Push to capturing list.
+      cap->next = ss->captureHead;
+      ss->captureHead = cap;

-      // First node in the chain must be a wait on the serialEvent.
+    do_unlock: // Reached on success and from the CUDACHECKGOTO failures above; `ret` tells which.
+      if (concurrent) pthread_mutex_unlock(&ss->lock);
+      if (ret != ncclSuccess) return ret;
+
+      *workStream = cap->captureStream;
+
+      // Bring captureStream into the graph but without any dependencies.
+      cudaEvent_t scratch;
+      CUDACHECK(cudaEventCreateWithFlags(&scratch, cudaEventDisableTiming));
+      CUDACHECK(cudaEventRecord(scratch, graph.origin)); // graph.origin is the stream whose capture produced `graph`.
+      CUDACHECK(cudaStreamWaitEvent(cap->captureStream, scratch, 0));
+      CUDACHECK(cudaEventDestroy(scratch));
+      CUDACHECK(cudaStreamUpdateCaptureDependencies(cap->captureStream, nullptr, 0, cudaStreamSetCaptureDependencies)); // Replace the dependency set with an empty one.
+
+      if (mixing && firstCapture) {
+        CUDACHECK(cudaEventRecord(ss->serialEvent, ss->liveStream)); // On the first capture, record the live stream's current position into serialEvent.
+      }
       if (mixing) {
-        ensureTips(g, 1);
-        CUDACHECK(cudaGraphAddEventWaitNode(&g->tipNodes[0], graph.graph, nullptr, 0, ss->serialEvent));
-        g->tipCount = 1;
-      } else {
-        g->tipCount = 0;
+        // First dependency is to wait on serialEvent
+        CUDACHECK(cudaStreamWaitEvent(cap->captureStream, ss->serialEvent, 0));
       }
     }
   #endif // NOTE(review): there is no #else branch, so *workStream is not written when ROCM_VERSION < 60100.
   return ncclSuccess;
 }

-ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss) {
+// Looks up, without acquiring anything, the stream that a previous
+// ncclStrongStreamAcquire handed out for `graph`, and stores it in *workStream.
+//   graph      - capture state; graphId == ULLONG_MAX means "not capturing".
+//   ss         - the strong stream that was acquired.
+//   concurrent - take ss->lock while walking the capture list.
+//   workStream - out: ss->liveStream when not capturing, otherwise the capture
+//                stream of the list entry whose graphId matches.
+// Returns ncclSuccess, or ncclInternalError (leaving *workStream untouched) when
+// no capture entry exists for graph.graphId.
+ncclResult_t ncclStrongStreamAcquiredWorkStream(
+    struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent,
+    cudaStream_t* workStream
+  ) {
   #if ROCM_VERSION >= 60100
-    bool mixing = ncclParamGraphMixingSupport();
-    if (mixing && ss->everCaptured) {
-      CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0));
+    if (graph.graphId == ULLONG_MAX) {
+      *workStream = ss->liveStream;
+    } else {
+      if (concurrent) pthread_mutex_lock(&ss->lock);
+      struct ncclStrongStreamCapture* cap = ss->captureHead;
+      // Stop at the end of the list instead of dereferencing nullptr when the
+      // graph was never acquired on this strong stream.
+      while (cap != nullptr && cap->graphId != graph.graphId) cap = cap->next;
+      if (cap != nullptr) *workStream = cap->captureStream;
+      if (concurrent) pthread_mutex_unlock(&ss->lock);
+      if (cap == nullptr) {
+        WARN("Strong stream has no acquired capture for graph id=%llu.", graph.graphId);
+        return ncclInternalError;
+      }
     }
-    ss->serialEventNeedsRecord = true; // Assume the caller is going to add work to stream.
+  #else
+    *workStream = ss->liveStream; // Terminating ';' was missing, which broke this branch at compile time.
   #endif
   return ncclSuccess;
 }

-static ncclResult_t checkGraphId(struct ncclStrongStreamGraph* g, unsigned long long id) {
-  if (g == nullptr || g->graphId != id) {
-    WARN("Expected graph id=%llu was not at head of strong stream's internal list.", id);
-    return ncclInternalError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss) {
+ncclResult_t ncclStrongStreamRelease( // Ends an acquire of *ss: records serialEvent after the acquired work and checks that the releasing thread is the acquiring one.
+    struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent // concurrent: take ss->lock while walking the capture list.
+  ) {
   #if ROCM_VERSION >= 60100
     bool mixing = ncclParamGraphMixingSupport();
-    if (mixing && ss->serialEventNeedsRecord) {
-      if (graph.graph == nullptr) {
-        if (ss->everCaptured) {
-          CUDACHECK(cudaEventRecord(ss->serialEvent, ss->cudaStream));
-          ss->serialEventNeedsRecord = false;
+    if (mixing) { // With graph mixing disabled the whole body, including the race check, is skipped.
+      if (graph.graphId == ULLONG_MAX) {
+        if (__atomic_load_n(&ss->everCaptured, __ATOMIC_RELAXED)) {
+          CUDACHECK(cudaEventRecord(ss->serialEvent, ss->liveStream));
+        }
+        if (ss->liveAcquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) { // Releasing thread differs from the one recorded at acquire.
+          WARN("%s", launchRaceFatalMsg);
+          return ncclInvalidUsage;
         }
       } else {
-        struct ncclStrongStreamGraph* g = ss->graphHead;
-        NCCLCHECK(checkGraphId(g, graph.graphId));
-        ensureTips(g, 1);
-        CUDACHECK(cudaGraphAddEventRecordNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, ss->serialEvent));
-        g->tipCount = 1;
-        ss->serialEventNeedsRecord = false;
+        if (concurrent) pthread_mutex_lock(&ss->lock);
+        struct ncclStrongStreamCapture* cap = ss->captureHead;
+        while (cap->graphId != graph.graphId) cap = cap->next; // NOTE(review): no end-of-list check; relies on a matching entry having been added by acquire.
+        if (concurrent) pthread_mutex_unlock(&ss->lock);
+
+        // Add event record node with dependencies added further down.
+        cudaGraphNode_t recordNode;
+        CUDACHECK(cudaGraphAddEventRecordNode(&recordNode, graph.graph, nullptr, 0, ss->serialEvent));
+
+        // Make this record order after previous record on this stream.
+        if (cap->lastRecord != nullptr) {
+          CUDACHECK(cudaGraphAddDependencies(graph.graph, &cap->lastRecord, &recordNode, 1));
+        }
+        cap->lastRecord = recordNode;
+
+        // Get current nodes from work stream so we can add them as dependencies.
+        cudaStreamCaptureStatus status;
+        cudaGraphNode_t const* nodes;
+        size_t count = 0;
+        cudaError_t res = hipStreamGetCaptureInfo_v2(cap->captureStream, &status, nullptr, nullptr, &nodes, &count);
+
+        #if CUDART_VERSION >= 12030
+        if (res == cudaErrorLossyQuery) { // CUDA is telling us the dependencies have edge annotations.
+          cudaGraphEdgeData const* edges;
+          CUDACHECK(cudaStreamGetCaptureInfo_v3(cap->captureStream, &status, nullptr, nullptr, &nodes, &edges, &count));
+          for (int i=0; i < (int)count; i++) {
+            CUDACHECK(cudaGraphAddDependencies_v2(graph.graph, &nodes[i], &recordNode, &edges[i], 1));
+          }
+        }
+        #else
+        if (false) {} // Placeholder so the `else` below has an `if` to attach to when the v3 path is compiled out.
+        #endif
+        else {
+          CUDACHECK(res /* = cudaStreamGetCaptureInfo_v2(...)*/);
+          for (int i=0; i < (int)count; i++) {
+            CUDACHECK(cudaGraphAddDependencies(graph.graph, &nodes[i], &recordNode, 1)); // recordNode runs after every node the work stream currently depends on.
+          }
+        }
+
+        if (cap->acquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) { // Same race check as the uncaptured path, against the capture entry's acquirer.
+          WARN("%s", launchRaceFatalMsg);
+          return ncclInvalidUsage;
+        }
       }
     }
   #endif
   return ncclSuccess;
 }

-ncclResult_t ncclStrongStreamLaunchHost(
-    struct ncclCudaGraph graph, struct ncclStrongStream* ss, cudaHostFn_t fn, void* arg
-  ) {
-  #if ROCM_VERSION >= 60100
-    if (graph.graph == nullptr) {
-      CUDACHECK(cudaLaunchHostFunc(ss->cudaStream, fn, arg));
-    } else {
-      cudaHostNodeParams p;
-      p.fn = fn;
-      p.userData = arg;
-      struct ncclStrongStreamGraph* g = ss->graphHead;
-      NCCLCHECK(checkGraphId(g, graph.graphId));
-      ensureTips(g, 1);
-      CUDACHECK(cudaGraphAddHostNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, &p));
-      g->tipCount = 1;
-    }
-    ss->serialEventNeedsRecord = true;
-  #else
-    CUDACHECK(cudaLaunchHostFunc(ss->cudaStream, fn, arg));
-  #endif
+ncclResult_t ncclStreamWaitStream(cudaStream_t a, cudaStream_t b, cudaEvent_t scratchEvent) { // Makes stream `a` wait for the work currently enqueued on stream `b`, using the caller-supplied event.
+  CUDACHECK(cudaEventRecord(scratchEvent, b)); // Mark b's current position; this overwrites any earlier record of scratchEvent.
+  CUDACHECK(cudaStreamWaitEvent(a, scratchEvent, 0)); // Later work on `a` waits for that mark.
   return ncclSuccess;
 }

-ncclResult_t ncclStrongStreamLaunchKernel(
-    struct ncclCudaGraph graph, struct ncclStrongStream* ss,
-    void* fn, dim3 grid, dim3 block, void* args[], size_t sharedMemBytes
-  ) {
-  #if ROCM_VERSION >= 60100
-    if (graph.graph == nullptr) {
-      CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->cudaStream));
-    } else {
-      cudaKernelNodeParams p;
-      p.func = fn;
-      p.gridDim = grid;
-      p.blockDim = block;
-      p.kernelParams = args;
-      p.sharedMemBytes = sharedMemBytes;
-      p.extra = nullptr;
-      struct ncclStrongStreamGraph* g = ss->graphHead;
-      NCCLCHECK(checkGraphId(g, graph.graphId));
-      ensureTips(g, 1);
-      CUDACHECK(cudaGraphAddKernelNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, &p));
-      g->tipCount = 1;
-    }
-    ss->serialEventNeedsRecord = true;
-  #else
-    CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->cudaStream));
-  #endif
-  return ncclSuccess;
-}
+ncclResult_t ncclStreamAdvanceToEvent(struct ncclCudaGraph g, cudaStream_t s, cudaEvent_t e) { // Makes stream `s` wait on event `e`; when capturing, s's capture dependencies are set to those a wait on `e` produces.
+  if (g.graphId == ULLONG_MAX) { // Not capturing: a plain stream wait is enough.
+    CUDACHECK(cudaStreamWaitEvent(s, e, 0));
+  } else {
+    cudaStream_t tmp; // Temporary stream used only to query the dependencies implied by waiting on `e`.
+    CUDACHECK(cudaStreamCreateWithFlags(&tmp, cudaStreamNonBlocking));
+    CUDACHECK(cudaStreamWaitEvent(tmp, e, 0));

-// Merge node list `b` into list `a` but don't add duplicates.
-static void mergeTips(struct ncclStrongStreamGraph* a, cudaGraphNode_t const* bNodes, int bn) {
-  int an = a->tipCount;
-  ensureTips(a, an + bn);
-  for (int bi=0; bi < bn; bi++) {
-    for (int ai=0; ai < an; ai++) {
-      if (a->tipNodes[ai] == bNodes[bi]) goto next_b;
+    cudaStreamCaptureStatus status;
+    cudaGraphNode_t const* nodes;
+    size_t count = 0;
+    cudaError_t res = hipStreamGetCaptureInfo_v2(tmp, &status, nullptr, nullptr, &nodes, &count); // Fetch tmp's current dependency nodes.
+
+    #if CUDART_VERSION >= 12030
+    if (res == cudaErrorLossyQuery) { // CUDA is telling us the dependencies have edge annotations.
+      cudaGraphEdgeData const* edges;
+      CUDACHECK(cudaStreamGetCaptureInfo_v3(tmp, &status, nullptr, nullptr, &nodes, &edges, &count));
+      CUDACHECK(cudaStreamUpdateCaptureDependencies_v2(s, (cudaGraphNode_t*)nodes, edges, count, cudaStreamSetCaptureDependencies));
     }
-    a->tipNodes[a->tipCount++] = bNodes[bi];
-  next_b:;
+    #else
+    if (false) {} // Placeholder so the `else` below has an `if` to attach to when the v3 path is compiled out.
+    #endif
+    else {
+      CUDACHECK(res /* = cudaStreamGetCaptureInfo_v2(...)*/);
+      CUDACHECK(cudaStreamUpdateCaptureDependencies(s, (cudaGraphNode_t*)nodes, count, cudaStreamSetCaptureDependencies)); // "Set" mode replaces s's dependency set rather than adding to it.
+    }
+
+    CUDACHECK(cudaStreamDestroy(tmp)); // NOTE(review): not reached if an earlier CUDACHECK returns on error, in which case tmp is leaked.
   }
-}
-
-ncclResult_t ncclStrongStreamWaitStream(
-    struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b,
-    bool b_subsumes_a
-  ) {
-  #if ROCM_VERSION >= 60100
-    if (graph.graph == nullptr) {
-      if (b->serialEventNeedsRecord) {
-        b->serialEventNeedsRecord = false;
-        CUDACHECK(cudaEventRecord(b->serialEvent, b->cudaStream));
-      }
-      CUDACHECK(cudaStreamWaitEvent(a->cudaStream, b->serialEvent, 0));
-    } else {
-      struct ncclStrongStreamGraph* ag = a->graphHead;
-      NCCLCHECK(checkGraphId(ag, graph.graphId));
-      struct ncclStrongStreamGraph* bg = b->graphHead;
-      NCCLCHECK(checkGraphId(bg, graph.graphId));
-      if (b_subsumes_a) ag->tipCount = 0;
-      mergeTips(ag, bg->tipNodes, bg->tipCount);
-    }
-    a->serialEventNeedsRecord = true;
-  #else
-    CUDACHECK(cudaEventRecord(b->scratchEvent, b->cudaStream));
-    CUDACHECK(cudaStreamWaitEvent(a->cudaStream, b->scratchEvent, 0));
-  #endif
-  return ncclSuccess;
-}
-
-ncclResult_t ncclStrongStreamWaitStream(
-    struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b,
-    bool b_subsumes_a
-  ) {
-  #if ROCM_VERSION >= 60100
-    if (graph.graph == nullptr) {
-      // It is ok to use a->serialEvent to record b since we'll be setting
-      // a->serialEventNeedsRecord so the event won't be considered accurate
-      // until re-recorded.
-      CUDACHECK(cudaEventRecord(a->serialEvent, b));
-      CUDACHECK(cudaStreamWaitEvent(a->cudaStream, a->serialEvent, 0));
-    } else {
-      cudaStreamCaptureStatus status;
-      unsigned long long bGraphId;
-      cudaGraphNode_t const* bNodes;
-      size_t bCount = 0;
-      CUDACHECK(hipStreamGetCaptureInfo_v2(b, &status, &bGraphId, nullptr, &bNodes, &bCount));
-      if (status != cudaStreamCaptureStatusActive || graph.graphId != bGraphId) {
-        WARN("Stream is not being captured by the expected graph.");
-        return ncclInvalidUsage;
-      }
-      struct ncclStrongStreamGraph* ag = a->graphHead;
-      NCCLCHECK(checkGraphId(ag, graph.graphId));
-      if (b_subsumes_a) ag->tipCount = 0;
-      mergeTips(ag, bNodes, bCount);
-    }
-    a->serialEventNeedsRecord = true;
-  #else
-    CUDACHECK(cudaEventRecord(a->scratchEvent, b));
-    CUDACHECK(cudaStreamWaitEvent(a->cudaStream, a->scratchEvent, 0));
-  #endif
-  return ncclSuccess;
-}
-
-ncclResult_t ncclStrongStreamWaitStream(
-    struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b,
-    bool b_subsumes_a
-  ) {
-  #if ROCM_VERSION >= 60100
-    if (graph.graph == nullptr) {
-      if (b->serialEventNeedsRecord) {
-        b->serialEventNeedsRecord = false;
-        CUDACHECK(cudaEventRecord(b->serialEvent, b->cudaStream));
-      }
-      CUDACHECK(cudaStreamWaitEvent(a, b->serialEvent, 0));
-    } else {
-      struct ncclStrongStreamGraph* bg = b->graphHead;
-      NCCLCHECK(checkGraphId(bg, graph.graphId));
-      CUDACHECK(hipStreamUpdateCaptureDependencies(a, bg->tipNodes, bg->tipCount,
-        b_subsumes_a ? cudaStreamSetCaptureDependencies : cudaStreamAddCaptureDependencies
-      ));
-    }
-  #else
-    CUDACHECK(cudaEventRecord(b->scratchEvent, b->cudaStream));
-    CUDACHECK(cudaStreamWaitEvent(a, b->scratchEvent, 0));
-  #endif
   return ncclSuccess;
 }

 ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss) { // Blocks the calling host thread until ss->liveStream has finished its queued work.
   #if ROCM_VERSION >= 60100
-    CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0));
-    ss->serialEventNeedsRecord = false;
+    CUDACHECK(cudaStreamWaitEvent(ss->liveStream, ss->serialEvent, 0)); // Make the live stream also wait on the last serialEvent record before synchronizing.
   #endif
-  CUDACHECK(cudaStreamSynchronize(ss->cudaStream));
+  CUDACHECK(cudaStreamSynchronize(ss->liveStream));
   return ncclSuccess;
 }
@@ -1,267 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
- * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include <dlfcn.h>
-#include <errno.h>
-#include <stdlib.h>
-
-#include "checks.h"
-#include "debug.h"
-#include "tuner.h"
-
-pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER;
-static int tunerPluginRefCount;
-static void* tunerPluginLib = nullptr;
-static ncclTuner_v4_t* tunerSymbol = nullptr;
-static ncclTuner_v3_t* ncclTuner_v3 = nullptr;
-static ncclTuner_v2_t* ncclTuner_v2 = nullptr;
-static ncclTuner_v4_t ncclTuner_v2_as_v4;
-static ncclTuner_v4_t ncclTuner_v3_as_v4;
-
-static int hasNvlsSupport(float** collCostTable) {
-  // Requirements for support of different algorithms:
-  //
-  // - NVLS intra-node: nvlsSupport
-  // - NVLS intra+inter-node: collNetSupport
-  // - NVLSTree intra-node: always disabled
-  // - NVLSTree inter-node: nvlsSupport
-  // - Collnet* inter-node: collNetSupport
-  //
-  // nvlsSupport = 1 if either NVLS or NVLS_TREE entries in the cost table are not -1
-  float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
-  return (table[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE || table[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) ? 1 : 0;
-}
-
-static int hasCollNetSupport(float** collCostTable) {
-  float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
-  return (table[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] == NCCL_ALGO_PROTO_IGNORE) ? 0 : 1;
-}
-
-static ncclResult_t ncclTuner_v3_as_v4_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, int regBuff __attribute__((unused)), int* nChannels) {
-  NCCLCHECK(ncclTuner_v3->getCollInfo(context, collType, nBytes, numPipeOps, collCostTable, numAlgo, numProto,  nChannels));
-  return ncclSuccess;
-}
-
-static ncclResult_t ncclTuner_v3_as_v4_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) {
-  NCCLCHECK(ncclTuner_v3->init(nRanks, nNodes, logFunction, context));
-  ncclTuner_v3_as_v4.name = ncclTuner_v3->name;
-  ncclTuner_v3_as_v4.getCollInfo = ncclTuner_v3_as_v4_getCollInfo;
-  ncclTuner_v3_as_v4.destroy = ncclTuner_v3->destroy;
-  return ncclSuccess;
-}
-
-static ncclResult_t ncclTuner_v2_as_v4_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo __attribute__((unused)), int numProto __attribute__((unused)), int regBuff __attribute__((unused)), int* nChannels) {
-  int algorithm = NCCL_ALGO_UNDEF;
-  int protocol = NCCL_PROTO_UNDEF;
-  int nvlsSupport = hasNvlsSupport(collCostTable);
-  int collNetSupport = hasCollNetSupport(collCostTable);
-  NCCLCHECK(ncclTuner_v2->getCollInfo(context, collType, nBytes, collNetSupport, nvlsSupport, numPipeOps, &algorithm, &protocol, nChannels));
-  // set time to 0 below to make sure this algorithm/protocol is selected later on
-  if (algorithm >= 0 && algorithm < NCCL_NUM_ALGORITHMS && protocol >= 0 && protocol < NCCL_NUM_PROTOCOLS) {
-    float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
-    if (table[algorithm][protocol] != NCCL_ALGO_PROTO_IGNORE) table[algorithm][protocol] = 0.0;
-  }
-  return ncclSuccess;
-}
-
-static ncclResult_t ncclTuner_v2_as_v4_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) {
-  NCCLCHECK(ncclTuner_v2->init(nRanks, nNodes, logFunction, context));
-  ncclTuner_v2_as_v4.name = ncclTuner_v2->name;
-  ncclTuner_v2_as_v4.getCollInfo = ncclTuner_v2_as_v4_getCollInfo;
-  ncclTuner_v2_as_v4.destroy = ncclTuner_v2->destroy;
-  return ncclSuccess;
-}
-
-#define MAX_STR_LEN 255
-
-static void* tryOpenLib(const char* name, int* err, char* errStr) {
-  *err = 0;
-  if (nullptr == name || strlen(name) == 0) {
-    return nullptr;
-  }
-
-  if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) {
-    name = nullptr;
-  }
-
-  void *handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL);
-  if (nullptr == handle) {
-    strncpy(errStr, dlerror(), MAX_STR_LEN);
-    errStr[MAX_STR_LEN] = '\0';
-    // "handle" and "name" won't be NULL at the same time.
-    // coverity[var_deref_model]
-    if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) {
-      *err = ENOENT;
-    }
-  }
-  return handle;
-}
-
-static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) {
-  if (openErr == ENOENT) {
-    snprintf(nameList, *nameListLen, " %s", name);
-    nameList += strlen(name) + 1;
-    *nameListLen -= strlen(name) + 1;
-    return nameList;
-  }
-  INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: %s", openErrStr);
-  return nameList;
-}
-
-static void* openTunerPluginLib(char* couldNotFindNames, int len) {
-  int openErr;
-  void *pluginLib;
-  char tunerPluginLibName[PATH_MAX];
-  char openErrStr[MAX_STR_LEN + 1] = { 0 };
-  const char *envTunerPluginName = getenv("NCCL_TUNER_PLUGIN");
-  if (envTunerPluginName && strlen(envTunerPluginName)) {
-    INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: NCCL_TUNER_PLUGIN set to %s", envTunerPluginName);
-    snprintf(tunerPluginLibName, PATH_MAX, "%s", envTunerPluginName);
-    pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr);
-    if (pluginLib) {
-      INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName);
-      return pluginLib;
-    }
-    couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName);
-
-    snprintf(tunerPluginLibName, PATH_MAX, "librccl-tuner-%s.so", envTunerPluginName);
-    pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr);
-    if (pluginLib) {
-      INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName);
-      return pluginLib;
-    }
-    couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName);
-  } else {
-    snprintf(tunerPluginLibName, PATH_MAX, "librccl-tuner.so");
-    pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr);
-    if (pluginLib) {
-      return pluginLib;
-    }
-    couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName);
-  }
-
-  const char *envNetPluginName = getenv("NCCL_NET_PLUGIN");
-  if (envNetPluginName && strlen(envNetPluginName)) {
-    // Users are allowed to pack tuner into the net plugin
-    snprintf(tunerPluginLibName, PATH_MAX, "%s", envNetPluginName);
-    pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr);
-    if (pluginLib) {
-      INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName);
-      return pluginLib;
-    }
-    couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName);
-
-    snprintf(tunerPluginLibName, PATH_MAX, "librccl-net-%s.so", envNetPluginName);
-    pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr);
-    if (pluginLib) {
-      INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName);
-      return pluginLib;
-    }
-    couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName);
-  } else {
-    snprintf(tunerPluginLibName, PATH_MAX, "librccl-net.so");
-    pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr);
-    if (pluginLib) {
-      return pluginLib;
-    }
-    couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName);
-  }
-  tunerPluginLibName[0] = '\0';
-  return nullptr;
-}
-
-enum {
-  tunerPluginLoadFailed  = -1,
-  tunerPluginLoadReady   =  0,
-  tunerPluginLoadSuccess =  1,
-};
-
-#define MAX_PLUGIN_LOAD 4
-
-static int status = tunerPluginLoadReady;
-
-ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) {
-  // Initialize to nullptr by default if plugin tuner cannot be loaded.
-  char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 };
-  comm->tuner = nullptr;
-  if (tunerPluginLoadFailed == status) {
-    return ncclSuccess;
-  }
-
-  pthread_mutex_lock(&tunerPluginLock);
-  if (tunerPluginLoadFailed == status) {
-    goto exit;
-  }
-
-  if (tunerPluginLoadSuccess == status) {
-    comm->tuner = tunerSymbol;
-    ++tunerPluginRefCount;
-    goto exit;
-  }
-
-  tunerPluginLib = openTunerPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX);
-  if (nullptr == tunerPluginLib) {
-    if (strlen(couldNotFindNames)) {
-      INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Could not find:%s. Using internal tuner plugin.", couldNotFindNames);
-    } else {
-      INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using internal tuner plugin.");
-    }
-    goto fail;
-  }
-
-  tunerSymbol = (ncclTuner_v4_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v4");
-  if (tunerSymbol == nullptr) {
-    INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.");
-    ncclTuner_v3 = (ncclTuner_v3_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v3");
-    if (ncclTuner_v3 == nullptr) {
-      INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.");
-      ncclTuner_v2 = (ncclTuner_v2_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v2");
-      if (ncclTuner_v2 == nullptr) {
-        INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.");
-        dlclose(tunerPluginLib);
-        goto fail;
-      } else {
-        ncclTuner_v2_as_v4.init = ncclTuner_v2_as_v4_init;
-        ncclTuner_v2_as_v4.name = ncclTuner_v2->name;
-        tunerSymbol = &ncclTuner_v2_as_v4;
-      }
-    } else {
-      ncclTuner_v3_as_v4.init = ncclTuner_v3_as_v4_init;
-      ncclTuner_v3_as_v4.name = ncclTuner_v3->name;
-      tunerSymbol = &ncclTuner_v3_as_v4;
-    }
-  }
-
-  INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", tunerSymbol->name);
-  comm->tuner = tunerSymbol;
-  ++tunerPluginRefCount;
-  status = tunerPluginLoadSuccess;
-  comm->tunerPluginLoaded = 1;
-
-exit:
-  pthread_mutex_unlock(&tunerPluginLock);
-  return ncclSuccess;
-fail:
-  tunerPluginLib = nullptr;
-  status = tunerPluginLoadFailed;
-  goto exit;
-}
-
-ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm) {
-  pthread_mutex_lock(&tunerPluginLock);
-  if (comm->tunerPluginLoaded && 0 == (--tunerPluginRefCount)) {
-    INFO(NCCL_TUNING, "TUNER/Plugin: Closing tuner: '%s'", tunerSymbol->name);
-    dlclose(tunerPluginLib);
-    tunerPluginLib = nullptr;
-    tunerSymbol = nullptr;
-    comm->tuner = nullptr;
-    status = tunerPluginLoadReady;
-    comm->tunerPluginLoaded = 0;
-  }
-  pthread_mutex_unlock(&tunerPluginLock);
-  return ncclSuccess;
-}
@@ -82,6 +82,7 @@ typedef struct ncclConfig_v21700 {
  int maxCTAs;                 /*!< Maximum number of cooperative thread arrays (blocks) */
  const char *netName;         /*!< Force NCCL to use a specfic network */
  int splitShare;              /*!< Allow communicators to share resources */
+  int trafficClass;            /*!< Traffic class*/
 } ncclConfig_t;

 /* Config initializer must be assigned to initialize config structure when it is created.
@@ -95,7 +96,8 @@ typedef struct ncclConfig_v21700 {
  NCCL_CONFIG_UNDEF_INT,                            /* minCTAs */        \
  NCCL_CONFIG_UNDEF_INT,                            /* maxCTAs */        \
  NCCL_CONFIG_UNDEF_PTR,                            /* netName */        \
-  NCCL_CONFIG_UNDEF_INT                             /* splitShare */     \
+  NCCL_CONFIG_UNDEF_INT,                            /* splitShare */     \
+  NCCL_CONFIG_UNDEF_INT,                            /* trafficClass */   \
 }
 /*! @} */

--- a/Показать больше
+++ b/Показать больше