Merge remote-tracking branch 'nccl/master' into develop

2026-01-20 13:01:49 -06:00
@@ -3,6 +3,6 @@
 /coverage/
 build/
 ext/
-
+src/transport/net_ib_rocm.cc
 # Visual Studio Code
-.vscode
+.vscode
@@ -2,16 +2,30 @@

 Full documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs.io)

+## Unreleased - RCCL 2.28.3 for ROCm 7.11
+
+### Known issues
+* AllGather regression for small message sizes (less than 1 MB) due to the Direct algorithm.
+* ROCTx feature needs to be verified.
+* Profiler plugin needs to be verified.
+
+### Changed
+* Compatibility with NCCL 2.28.3.
+* The MSCCL feature is now disabled by default. The `--disable-msccl-kernel` build flag is replaced with `--enable-msccl-kernel` in the `rccl/install.sh` script.
+* MSCCL and NPKIT are deprecated and will be removed in a future release of RCCL.
+
 ## Unreleased - RCCL 2.27.7 for ROCm 7.2.0

 ### Changed
-
 * RCCL error messages have been made more verbose in several cases. RCCL now prints out fatal error messages by default. Fatal error messages can be suppressed by setting `NCCL_DEBUG=NONE`.
 * Disabled `reduceCopyPacks` pipelining for `gfx950`.
 * Experimental support for traffic shaping using warp specialization (also known as WarpSpeed) is now available for the Ring algorithm.
 * Enabling WarpSpeed in auto mode using RCCL_WARP_SPEED_AUTO optimizes performance and reduces the CU count by 50% on a single node for AllReduce, AllGather from 64MB, and ReduceScatter from 256MB.
 * The following configuration knobs control WarpSpeed behavior for debugging purposes: `RCCL_WARP_SPEED_ENABLE`, `RCCL_UNROLL_FACTOR`, `RCCL_WARP_SPEED_CU_COUNT`, and `RCCL_THREADS_PER_BLOCK`. Note that the effective unroll factor is calculated as 2 raised to the value of `RCCL_UNROLL_FACTOR`.

+### Known issues
+* AllToAllv/AllToAll hangs when run on a single GPU.
+
 ## Unreleased - RCCL 2.27.7 for ROCm 7.1.1

 ### Changed
@@ -26,7 +26,7 @@ option(BUILD_TESTS                             "Build unit test programs"
 option(COLLTRACE                               "Collective Trace Option"                       ON)
 option(DUMP_ASM                                "Disassemble and dump"                          OFF)
 option(ENABLE_CODE_COVERAGE                    "Enable code coverage"                          OFF)
-option(ENABLE_MSCCL_KERNEL                     "Enable MSCCL while compiling"                  ON)
+option(ENABLE_MSCCL_KERNEL                     "Enable MSCCL while compiling"                  OFF)
 option(ENABLE_MSCCLPP                          "Enable MSCCL++"                                OFF)
 option(ENABLE_MSCCLPP_CLIP                     "Enable MSCCL++ CLIP"                           OFF)
 option(ENABLE_MSCCLPP_EXECUTOR                 "Enable MSCCL++ Executor"                       OFF)
@@ -463,10 +463,12 @@ configure_file(src/nccl.h.in ${PROJECT_BINARY_DIR}/include/nccl.h)      # Used b
 set(SRC_FILES
  src/allocator.cc
  src/bootstrap.cc
+  src/ce_coll.cc
  src/channel.cc
  src/collectives.cc
  src/commDump.cc
  src/debug.cc
+  src/dev_runtime.cc
  src/enqueue.cc
  src/group.cc
  src/init.cc
@@ -475,7 +477,7 @@ set(SRC_FILES
  src/msccl.cc
  src/proxy.cc
  src/rccl_wrap.cc
-  src/symmetric.cc
+  src/sym_kernels.cc
  src/transport.cc
  src/device/all_gather.h
  src/device/all_reduce.h
@@ -526,6 +528,7 @@ set(SRC_FILES
  src/include/BfdBacktrace.hpp
  src/include/bitops.h
  src/include/bootstrap.h
+  src/include/ce_coll.h
  src/include/channel.h
  src/include/checks.h
  src/include/collectives.h
@@ -535,6 +538,7 @@ set(SRC_FILES
  src/include/cpuset.h
 # src/include/cudawrap.h
  src/include/debug.h
+  src/include/dev_runtime.h
  src/include/device.h
  src/include/enqueue.h
  src/include/gdrwrap.h
@@ -549,6 +553,7 @@ set(SRC_FILES
  src/include/ipcsocket.h
  src/include/mnnvl.h
  src/include/nccl_common.h
+  src/include/nccl_device.h
  src/include/net_device.h
  src/include/net.h
  src/include/nvmlwrap.h
@@ -569,12 +574,13 @@ set(SRC_FILES
  src/include/rocmwrap.h
  src/include/roctx.h
  src/include/recorder.h
+  src/include/scheduler.h
  src/include/shm.h
  src/include/shmutils.h
  src/include/signals.h
  src/include/socket.h
  src/include/strongstream.h
-  src/include/symmetric.h
+  src/include/sym_kernels.h
  src/include/timer.h
  src/include/transport.h
  src/include/trees.h
@@ -592,6 +598,23 @@ set(SRC_FILES
  src/include/msccl/msccl_setup.h
  src/include/msccl/msccl_status.h
  src/include/msccl/msccl_struct.h
+  src/include/nccl_device/comm.h
+  src/include/nccl_device/coop.h
+  src/include/nccl_device/core.h
+  src/include/nccl_device/ll_a2a.h
+  src/include/nccl_device/mem_barrier.h
+  src/include/nccl_device/ptr.h
+  src/include/nccl_device/utility.h
+  src/include/nccl_device/impl/comm__funcs.h
+  src/include/nccl_device/impl/comm__types.h
+  src/include/nccl_device/impl/core__funcs.h
+  src/include/nccl_device/impl/core__types.h
+  src/include/nccl_device/impl/ll_a2a__funcs.h
+  src/include/nccl_device/impl/ll_a2a__types.h
+  src/include/nccl_device/impl/mem_barrier__funcs.h
+  src/include/nccl_device/impl/mem_barrier__types.h
+  src/include/nccl_device/impl/ptr__funcs.h
+  src/include/nccl_device/impl/ptr__types.h
  src/include/npkit/npkit.h
  src/include/npkit/npkit_event.h
  src/include/npkit/npkit_struct.h
@@ -639,6 +662,7 @@ set(SRC_FILES
  src/include/plugin/net/net_v8.h
  src/include/plugin/net/net_v9.h
  src/include/plugin/net/net_v10.h
+  src/include/plugin/net/net_v11.h
  src/include/plugin/profiler/net_ib_v1.h
  src/include/plugin/profiler/net_ib.h
  src/include/plugin/profiler/net_socket_v1.h
@@ -647,9 +671,11 @@ set(SRC_FILES
  src/include/plugin/profiler/profiler_v2.h
  src/include/plugin/profiler/profiler_v3.h
  src/include/plugin/profiler/profiler_v4.h
+  src/include/plugin/profiler/profiler_v5.h
  src/include/plugin/tuner/tuner_v2.h
  src/include/plugin/tuner/tuner_v3.h
  src/include/plugin/tuner/tuner_v4.h
+  src/include/plugin/tuner/tuner_v5.h
  src/misc/alt_rsmi.cc
  src/misc/archinfo.cc
  src/misc/argcheck.cc
@@ -682,6 +708,9 @@ set(SRC_FILES
  src/misc/msccl/msccl_setup.cc
  src/misc/msccl/msccl_status.cc
  src/misc/proxy_trace/proxy_trace.cc
+  src/nccl_device/core.cc
+  src/nccl_device/ll_a2a.cc
+  src/nccl_device/mem_barrier.cc
  src/plugin/net.cc
  src/plugin/plugin_open.cc
  src/plugin/profiler.cc
@@ -691,13 +720,16 @@ set(SRC_FILES
  src/plugin/net/net_v8.cc
  src/plugin/net/net_v9.cc
  src/plugin/net/net_v10.cc
+  src/plugin/net/net_v11.cc
  src/plugin/profiler/profiler_v1.cc
  src/plugin/profiler/profiler_v2.cc
  src/plugin/profiler/profiler_v3.cc
  src/plugin/profiler/profiler_v4.cc
+  src/plugin/profiler/profiler_v5.cc
  src/plugin/tuner/tuner_v2.cc
  src/plugin/tuner/tuner_v3.cc
  src/plugin/tuner/tuner_v4.cc
+  src/plugin/tuner/tuner_v5.cc
  src/ras/client.cc
  src/ras/client_support.cc
  src/ras/collectives.cc
@@ -708,6 +740,7 @@ set(SRC_FILES
  src/register/coll_reg.cc
  src/register/register.cc
  src/register/sendrecv_reg.cc
+  src/scheduler/symmetric_sched.cc
  src/transport/coll_net.cc
  src/transport/generic.cc
  src/transport/net.cc
@@ -880,6 +913,7 @@ target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/device)
 target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/device/network/unpack)
 target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include)
 target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/mlx5)
+target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/nccl_device)
 target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/ionic)
 target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/plugin)
 target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/gensrc)
@@ -899,6 +933,7 @@ if(COLLTRACE)
  target_compile_definitions(rccl PRIVATE ENABLE_COLLTRACE)
 endif()
 if(ENABLE_MSCCL_KERNEL)
+  message(WARNING "MSCCL is deprecated and will be removed in a future version of RCCL.")
  target_compile_definitions(rccl PRIVATE COMPILE_MSCCL_KERNEL)
 endif()
 if(ENABLE_MSCCLPP)
@@ -939,6 +974,7 @@ endif()
 # NPKit flags
 ## May be better to move these to a separate file
 if(ENABLE_NPKIT)
+  message(WARNING "NPKit is deprecated and will be removed in a future version of RCCL. Please consider using alternative profiling tools.")
  target_compile_definitions(rccl PRIVATE ENABLE_NPKIT)
  target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
  target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
@@ -42,7 +42,7 @@ RCCL build & installation helper script
       --debug                 Build debug library
       --enable_backtrace      Build with custom backtrace support
       --disable-colltrace     Build without collective trace
-       --disable-msccl-kernel  Build without MSCCL kernels
+       --enable-msccl-kernel   Build with MSCCL kernels
       --enable-mscclpp        Build with MSCCL++ support
       --enable-mscclpp-clip   Build MSCCL++ with clip wrapper on bfloat16 and half addition routines
       --disable-roctx         Build without ROCTX logging
@@ -246,4 +246,12 @@ execute_process(
  COMMAND bash -c "sed -i 's/ncclNetIb/rocmNetIb/g' ${ROCM_NETIB_FILE}"
  WORKING_DIRECTORY ${RCCL_SRC_DIR}
 )
+execute_process(
+  COMMAND bash -c "sed -i 's/ncclIbFinalize/rocmNetIbFinalize/g' ${ROCM_NETIB_FILE}"
+  WORKING_DIRECTORY ${RCCL_SRC_DIR}
+)
+execute_process(
+  COMMAND bash -c "sed -i 's/ncclIbSetNetAttr/rocmNetIbSetNetAttr/g' ${ROCM_NETIB_FILE}"
+  WORKING_DIRECTORY ${RCCL_SRC_DIR}
+)
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
@@ -60,36 +60,36 @@ of newer ones.
 The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions
 from old API versions. It also provides error codes in `err.h`.

-# API (v10)
+# API (v11)

-Below is the main `ncclNet_v10` struct. Each function is explained in later sections.
+Below is the main `ncclNet_v11` struct. Each function is explained in later sections.

 ```
 typedef struct {
  // Name of the network (mainly for logs)
  const char* name;
  // Initialize the network.
-  ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
+  ncclResult_t (*init)(void** ctx, uint64_t commId, ncclNetCommConfig_v11_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
  // Return the number of adapters.
  ncclResult_t (*devices)(int* ndev);
  // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props);
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props);
  // Create a receiving object and provide a handle to connect to it. The
  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
  // between ranks to create a connection.
-  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  ncclResult_t (*listen)(void* ctx, int dev, void* handle, void** listenComm);
  // Connect to a handle and return a sending comm object for that peer.
  // This call must not block for the connection to be established, and instead
  // should return successfully with sendComm == NULL with the expectation that
  // it will be called again until sendComm != NULL.
  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
-  ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm);
+  ncclResult_t (*connect)(void* ctx, int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v11_t** sendDevComm);
  // Finalize connection establishment after remote peer has called connect.
  // This call must not block for the connection to be established, and instead
  // should return successfully with recvComm == NULL with the expectation that
  // it will be called again until recvComm != NULL.
  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
-  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm);
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v11_t** recvDevComm);
  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
@@ -191,6 +191,12 @@ This will allow the plugin to discover network devices and make sure they are us
 `init` function does not return `ncclSuccess`, then NCCL will not use the plugin and fall back on
 internal ones.

+Every call to `init` returns an opaque context that the plugin uses internally to allocate resources
+and manage state. This context is passed to the other net plugin calls that create further resources,
+such as `listen` and `connect`. Each context is uniquely associated with one communicator, identified
+by the `commId` argument. The network can also be initialized with a per-communicator configuration
+using the `config` argument.
+
 To allow the plugin logs to integrate into the NCCL logs seemlessly, NCCL provides a logging
 function to `init`. This function is typically used to allow for `INFO` and `WARN` macros within
 the plugin code adding the following definitions:
@@ -282,7 +288,7 @@ side.
 `listen`

 To create a connection, NCCL will start by calling `listen` on the receiver side. This function
-takes a device number as input argument, and should return a local `listenComm` object, and a
+takes the opaque plugin context returned by `init` and a device number as input arguments, and should return a local `listenComm` object, and a
 `handle` to pass to the other side, so that the sender side can connect to the receiver.

 The `handle` is a buffer of size `NCCL_NET_HANDLE_MAXSIZE` and is provided by NCCL.
@@ -304,7 +310,8 @@ the `listen` call previously. If the sender did not connect yet, `accept` should
 should return `ncclSuccess`, setting `recvComm` to `NULL`. NCCL will call `accept` again until it
 succeeds.

-The `connect` API takes a `ncclNetCommConfig_t`, which contains a trafficClass field.
+The `connect` API takes the opaque plugin context returned by `init`. The plugin context can reference
+the `ncclNetCommConfig_t` that was passed to `init`, which contains a `trafficClass` field.
 This field can be used by the network plugin to specify the QoS level of the connection. By default,
 `trafficClass` is set to -1 but can be configured by the application during communicator initialization
 to select a plugin-supported QoS level.
@@ -0,0 +1,19 @@
+set(SRC_FILES
+    ${CMAKE_CURRENT_SOURCE_DIR}/plugin.c
+)
+
+# Create shared library
+add_library(nccl-net-example SHARED ${SRC_FILES})
+
+# Set include directories
+target_include_directories(nccl-net-example PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/nccl
+)
+
+# Set output name to match Makefile
+set_target_properties(nccl-net-example PROPERTIES
+    OUTPUT_NAME "nccl-net-example"
+    PREFIX "lib"
+    POSITION_INDEPENDENT_CODE ON
+    LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/test/unit/plugins
+)
@@ -22,7 +22,9 @@

 // Maximum number of requests per comm object
 #define NCCL_NET_MAX_REQUESTS 32
+#define NCCL_NET_MAX_DEVS_PER_NIC 4

+#include "net_v11.h"
 #include "net_v10.h"
 #include "net_v9.h"
 #include "net_v8.h"
@@ -33,9 +35,9 @@
 #include "net_v3.h"
 #include "net_v2.h"

-typedef ncclNet_v10_t ncclNet_t;
-typedef ncclNetProperties_v10_t ncclNetProperties_t;
-typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t;
-typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t;
+typedef ncclNet_v11_t ncclNet_t;
+typedef ncclNetProperties_v11_t ncclNetProperties_t;
+typedef ncclNetVDeviceProps_v11_t ncclNetVDeviceProps_t;
+typedef ncclNetCommConfig_v11_t ncclNetCommConfig_t;

 #endif // end include guard
@@ -12,7 +12,7 @@

 // Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin
 // version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
-#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7  
+#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7

 typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType;

@@ -27,6 +27,7 @@ typedef struct {
 typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
 typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
 typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t;
-typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t;
+typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_v11_t;
+typedef ncclNetDeviceHandle_v11_t ncclNetDeviceHandle_t;

 #endif
@@ -5,10 +5,9 @@
 #ifndef NET_V10_H_
 #define NET_V10_H_

-#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4
 typedef struct {
  int ndevs;
-  int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10];
+  int devs[NCCL_NET_MAX_DEVS_PER_NIC];
 } ncclNetVDeviceProps_v10_t;


@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_V11_H_
+#define NET_V11_H_
+
+typedef struct {
+  int ndevs;
+  int devs[NCCL_NET_MAX_DEVS_PER_NIC];
+} ncclNetVDeviceProps_v11_t;
+
+#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1
+
+typedef struct {
+  // Plugin-specific TC value
+  int trafficClass;
+} ncclNetCommConfig_v11_t;
+
+
+typedef struct {
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int regIsGlobal;                 // regMr is not tied to a particular comm
+  int forceFlush;                  // Force a flush on receives
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+  ncclNetVDeviceProps_v11_t vProps;
+  size_t maxP2pBytes;              // Max transfer size for point-to-point operations
+  size_t maxCollBytes;             // Max transfer size for collective operations
+  int maxMultiRequestSize;         // Maximum number of requests supported in a single multi-request.
+} ncclNetProperties_v11_t;
+
+typedef struct {
+  int32_t maxConcurrentPeers;
+  int32_t minConcurrentPeers;
+  int32_t maxFlowsPerPeer;
+  int32_t minFlowsPerPeer;
+} ncclNetCommAttr_v11_t;
+
+typedef struct {
+  ncclNetCommAttr_v11_t sendCommAttr;
+  ncclNetCommAttr_v11_t recvCommAttr;
+  uint32_t op;
+  uint32_t algo;
+  uint32_t proto;
+} ncclNetAttr_v11_t;
+
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network. Returns an opaque plugin context in *ctx for the communicator identified by commId.
+  ncclResult_t (*init)(void** ctx, uint64_t commId, ncclNetCommConfig_v11_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(void* ctx, int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(void* ctx, int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v11_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v11_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+
+  // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
+  // what index this new vNIC exists at
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v11_t* props);
+  // Finalize the network for the given plugin context (the ctx returned by init).
+  ncclResult_t (*finalize)(void* ctx);
+  // Pass network attributes (netAttr) to the plugin for the given context. NOTE(review): exact semantics are not documented here - confirm.
+  ncclResult_t (*setNetAttr)(void* ctx, ncclNetAttr_v11_t* netAttr);
+} ncclNet_v11_t;
+
+#endif // end include guard
@@ -5,10 +5,9 @@
 #ifndef NET_V9_H_
 #define NET_V9_H_

-#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4
 typedef struct {
  int ndevs;
-  int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9];
+  int devs[NCCL_NET_MAX_DEVS_PER_NIC];
 } ncclNetVDeviceProps_v9_t;

 typedef struct {
@@ -11,7 +11,7 @@

 int max_requests = NCCL_NET_MAX_REQUESTS;

-__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; }
+__hidden ncclResult_t pluginInit(void** ctx, uint64_t commId, ncclNetCommConfig_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; }
 __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; }
 __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; }
 __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; }
@@ -51,8 +51,8 @@ __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) {
  return ncclSuccess;
 }

-__hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; }
-__hidden ncclResult_t pluginConnect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginListen(void* ctx, int dev, void* handle, void** listenComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginConnect(void* ctx, int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; }
 __hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** recvDevComm) { return ncclInternalError; }
 __hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) { return ncclInternalError; }
 __hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; }
@@ -67,10 +67,11 @@ __hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalE
 __hidden ncclResult_t pluginIrecvConsumed(void* recvComm, int n, void* request) { return ncclInternalError; }
 __hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_mhandle) { return ncclInternalError; }
 __hidden ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { return ncclInternalError; }
+__hidden ncclResult_t pluginFinalize(void* ctx) { return ncclSuccess; }

 #define PLUGIN_NAME "Plugin"

-const ncclNet_v10_t ncclNetPlugin_v10 = {
+const ncclNet_v11_t ncclNetPlugin_v11 = {
  .name = PLUGIN_NAME,
  .init = pluginInit,
  .devices = pluginDevices,
@@ -91,18 +92,84 @@ const ncclNet_v10_t ncclNetPlugin_v10 = {
  .getDeviceMr = pluginGetDeviceMr,
  .irecvConsumed = pluginIrecvConsumed,
  .makeVDevice   = pluginMakeVDevice,
+  .finalize = pluginFinalize,
 };

+__hidden ncclResult_t pluginInit_v10(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; }
+__hidden ncclResult_t pluginGetProperties_v10(int dev, ncclNetProperties_v10_t* props) {
+  // Populate *props for device `dev` with the example plugin's v10 defaults; always returns ncclSuccess.
+  // Below are default values: if unsure, don't change them.
+  props->name = "Example";
+  // Fill for proper topology detection, e.g. /sys/devices/pci0000:00/0000:00:10.0/0000:0b:00.0
+  props->pciPath = NULL;
+  // Only used to detect NICs with multiple PCI attachments.
+  props->guid = 0;
+  // Add NCCL_PTR_CUDA if GPU Direct RDMA is supported and regMr can take CUDA pointers.
+  props->ptrSupport = NCCL_PTR_HOST;
+  // If your regMr has a fast registration cache, set to 1. If set to 0, user buffer registration may be disabled.
+  props->regIsGlobal = 0;
+  // Force flush after receive. Needed if the control path and data path use a different path to the GPU
+  props->forceFlush = 0;
+  // Speed in *Mbps*. 100000 means 100G
+  props->speed = 100000;
+  // Port number, used in conjunction with guid
+  props->port = 0;
+  // Custom latency (used to help tuning if latency is high). If set to 0, use default NCCL values.
+  props->latency = 0;
+  // Maximum number of comm objects we can create.
+  props->maxComms = 1024*1024;
+  // Maximum number of receive operations taken by irecv().
+  props->maxRecvs = NCCL_PLUGIN_MAX_RECVS;
+  // Coupling with NCCL network device-side code.
+  props->netDeviceType = NCCL_NET_DEVICE_HOST;
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  // Used to tell NCCL core whether this is a virtual device fusing multiple physical devices.
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  // Maximum transfer sizes the plugin can handle.
+  props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES;
+  props->maxCollBytes = NCCL_MAX_NET_SIZE_BYTES;
+  return ncclSuccess;
+}
+
+__hidden ncclResult_t pluginListen_v10(int d, void* handle, void** listenComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginConnect_v10(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginMakeVDevice_v10(int* d, ncclNetVDeviceProps_v10_t* props) { return ncclInternalError; }
+
+const ncclNet_v10_t ncclNetPlugin_v10 = {
+  .name = PLUGIN_NAME,
+  .init = pluginInit_v10,
+  .devices = pluginDevices,
+  .getProperties = pluginGetProperties_v10,
+  .listen = pluginListen_v10,
+  .connect = pluginConnect_v10,
+  .accept = pluginAccept,
+  .regMr = pluginRegMr,
+  .regMrDmaBuf = pluginRegMrDmaBuf,
+  .deregMr = pluginDeregMr,
+  .isend = pluginIsend,
+  .irecv = pluginIrecv,
+  .iflush = pluginIflush,
+  .test = pluginTest,
+  .closeSend = pluginCloseSend,
+  .closeRecv = pluginCloseRecv,
+  .closeListen = pluginCloseListen,
+  .getDeviceMr = pluginGetDeviceMr,
+  .irecvConsumed = pluginIrecvConsumed,
+  .makeVDevice   = pluginMakeVDevice_v10,
+};
+
+
 __hidden ncclResult_t pluginInit_v9(ncclDebugLogger_t logFunction) {
-  return pluginInit(logFunction, NULL);
+  return pluginInit_v10(logFunction, NULL);
 }

 __hidden ncclResult_t pluginGetProperties_v9(int dev, ncclNetProperties_v9_t* props) {
-  return pluginGetProperties(dev, (ncclNetProperties_t*)props);
+  return pluginGetProperties_v10(dev, (ncclNetProperties_v10_t*)props);
 }

 __hidden ncclResult_t pluginConnect_v9(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm){
-  return pluginConnect(dev, NULL, handle, sendComm, sendDevComm);
+  return pluginConnect_v10(dev, NULL, handle, sendComm, sendDevComm);
 }

 __hidden ncclResult_t pluginIsend_v9(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) {
@@ -120,7 +187,7 @@ const ncclNet_v9_t ncclNetPlugin_v9 = {
  .init = pluginInit_v9,
  .devices = pluginDevices,
  .getProperties = pluginGetProperties_v9,
-  .listen = pluginListen,
+  .listen = pluginListen_v10,
  .connect = pluginConnect_v9,
  .accept = pluginAccept,
  .regMr = pluginRegMr,
@@ -172,7 +239,7 @@ const ncclNet_v8_t ncclNetPlugin_v8 = {
  .init = pluginInit_v9,
  .devices = pluginDevices,
  .getProperties = pluginGetProperties_v8,
-  .listen = pluginListen,
+  .listen = pluginListen_v10,
  .connect = pluginConnect_v9,
  .accept = pluginAccept,
  .regMr = pluginRegMr,
@@ -216,7 +283,7 @@ const ncclNet_v7_t ncclNetPlugin_v7 = {
  .init = pluginInit_v9,
  .devices = pluginDevices,
  .getProperties = pluginGetProperties_v7,
-  .listen = pluginListen,
+  .listen = pluginListen_v10,
  .connect = pluginConnect_v9,
  .accept = pluginAccept,
  .regMr = pluginRegMr_v7,
@@ -257,7 +324,7 @@ const ncclNet_v6_t ncclNetPlugin_v6 = {
  .init = pluginInit_v9,
  .devices = pluginDevices,
  .getProperties = pluginGetProperties_v6,
-  .listen = pluginListen,
+  .listen = pluginListen_v10,
  .connect = pluginConnect_v6,
  .accept = pluginAccept_v6,
  .regMr = pluginRegMr_v7,
@@ -278,7 +345,7 @@ const ncclNet_v5_t ncclNetPlugin_v5 = {
  .init = pluginInit_v9,
  .devices = pluginDevices,
  .getProperties = pluginGetProperties_v6,
-  .listen = pluginListen,
+  .listen = pluginListen_v10,
  .connect = pluginConnect_v6,
  .accept = pluginAccept_v6,
  .regMr = pluginRegMr_v7,
@@ -320,7 +387,7 @@ static ncclResult_t pluginConnect_v4(int dev, void* handle, void** sendComm) {
  ncclResult_t ret;
  do {
    ncclNetDeviceHandle_v7_t* handle = NULL;
-    ret = pluginConnect(dev, NULL, handle, sendComm, &handle);
+    ret = pluginConnect_v10(dev, NULL, handle, sendComm, &handle);
  } while (ret == ncclSuccess && *sendComm == NULL);
  return ret;
 }
@@ -337,7 +404,7 @@ const ncclNet_v4_t ncclNetPlugin_v4 = {
  .init = pluginInit_v9,
  .devices = pluginDevices,
  .getProperties = pluginGetProperties_v4,
-  .listen = pluginListen,
+  .listen = pluginListen_v10,
  .connect = pluginConnect_v4,
  .accept = pluginAccept_v4,
  .regMr = pluginRegMr_v7,
@@ -363,12 +430,12 @@ static ncclResult_t pluginFlush(void* recvComm, void* data, int size, void* mhan
 }
 static ncclResult_t pluginInit_v3(ncclDebugLogger_t logFunction) {
  max_requests = NCCL_NET_MAX_REQUESTS_V3;
-  return pluginInit(logFunction, NULL);
+  return pluginInit_v10(logFunction, NULL);
 }
 #include <string.h>
 static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) {
  char pluginHandle[NCCL_NET_HANDLE_MAXSIZE];
-  ncclResult_t ret = pluginListen(dev, &pluginHandle, listenComm);
+  ncclResult_t ret = pluginListen_v10(dev, &pluginHandle, listenComm);
  memcpy(handle, &pluginHandle, NCCL_NET_HANDLE_MAXSIZE_V4);
  return ret;
 }
@@ -403,7 +470,7 @@ const ncclNet_v2_t ncclNetPlugin_v2 = {
  .devices = pluginDevices,
  .pciPath = pluginPciPath,
  .ptrSupport = pluginPtrSupport,
-  .listen = pluginListen,
+  .listen = pluginListen_v3,
  .connect = pluginConnect_v4,
  .accept = pluginAccept_v4,
  .regMr = pluginRegMr_v7,
@@ -49,9 +49,9 @@ of newer ones.
 The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions
 from old API versions. It also provides error codes in `err.h`.

-# API (v4)
+# API (v5)

-Below is the main `ncclProfiler_v4` struct. Each function is explained in later sections.
+Below is the main `ncclProfiler_v5` struct. Each function is explained in later sections.

 ```
 typedef struct {
@@ -60,15 +60,15 @@ typedef struct {
  // init - initialize the profiler plugin
  // Input
  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  //  - commId         : communicator id
  //  - commName       : user assigned communicator name
-  //  - commHash       : communicator id
  //  - nNodes         : number of nodes in communicator
  //  - nranks         : number of ranks in communicator
  //  - rank           : rank identifier in communicator
  //  - logfn          : logger function
  // Output
  //  - eActivationMask: bitmask of active events set by the plugin
-  ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
+  ncclResult_t (*init)(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);

  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
  // Input
@@ -76,7 +76,7 @@ typedef struct {
  //  - eDescr : pointer to ncclProfilerEventDescr_t object
  // Output
  //  - eHandle: return event handle for supplied event descriptor object
-  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr);
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v5_t* eDescr);

  // stopEvent - stop/finalize an event inside and event set
  // Input
@@ -88,13 +88,13 @@ typedef struct {
  //  - eHandle   : handle to event object created through startEvent
  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
  //  - eState    : event state transition
-  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs);
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v5_t eState, ncclProfilerEventStateArgs_v5_t* eStateArgs);

  // finalize - finalize the profiler plugin
  // Input
  //  - context: opaque profiler context object
  ncclResult_t (*finalize)(void* context);
-} ncclProfiler_v4_t;
+} ncclProfiler_v5_t;
 ```

 ## Error codes
@@ -148,10 +148,37 @@ is the `ncclProfilerEventDescr_t` struct.

 ```
 typedef struct {
-  uint8_t type;             // event type (e.g., ncclProfileGroup, ncclProfileColl, ...)
-  void* parentObj;          // pointer to parent event used to expose the event hierarchy to the profiler
-  int rank;                 // rank that generated the event
+  uint64_t type;             // event type descriptor: ncclProfileGroupApi, ncclProfileCollApi, ...
+  void* parentObj;           // pointer to parent event used to expose the event hierarchy to the profiler
+  int rank;                  // rank that generated the event
  union {
+    struct {                 // GroupAPI event metadata
+      bool graphCaptured;    // Set to true if the Group API event is emitted inside a CUDA graph capture
+      int groupDepth;        // Determines the depth of a ncclGroup. A depth of 1 implies that the Group API call is implicit (internal to NCCL)
+                             // and not called by the user. Any depth greater than 1 means that the user made the Group API call.
+    } groupApi;
+
+    struct {                 // Collective API call metadata
+      const char* func;      // string containing name of the collective operation
+      size_t count;          // data count
+      const char* datatype;  // string containing the name of the datatype
+      int root;              // root rank
+      void* stream;          // Opaque handle that points to the CUDA stream that the operation is enqueued in
+      bool graphCaptured;    // Set to true if the Collective API event is emitted inside a CUDA graph capture
+    } collApi;
+
+    struct {                // Point-to-point API call metadata
+      const char* func;     // string containing name of the p2p operation
+      size_t count;         // data count
+      const char* datatype; // string containing the name of the datatype
+      void* stream;         // Opaque handle that points to a CUDA stream object
+      bool graphCaptured;   // Set to true if the Point-to-point API event is emitted inside a CUDA graph capture
+    } p2pApi;
+
+    struct {                // Kernel Launch event metadata
+      void* stream;         // Opaque handle that points to the CUDA stream that the operation is enqueued in
+    } kernelLaunch;
+
    struct {                // collective events metadata
      uint64_t seqNumber;   // sequence number of this collective operation in the communicator
      const char* func;     // string containing name of the collective
@@ -164,6 +191,7 @@ typedef struct {
      uint8_t nWarps;       // number of GPU warps for this collective
      const char* algo;     // string containing name of the algorithm for this collective
      const char* proto;    // string containing name of the protocol for this collective
+      void* parentGroup;    // for backward compatibility with v4 - this points to the legacy v4 group parent
    } coll;

    struct {                // point-to-point events metadata
@@ -173,6 +201,7 @@ typedef struct {
      size_t count;
      int peer;             // peer rank for this point-to-point
      uint8_t nChannels;    // number of channels for this p2p
+      void* parentGroup;    // for backward compatibility with v4 - this points to the legacy v4 group parent
    } p2p;

    struct {                // proxyOp events metadata
@@ -198,12 +227,12 @@ typedef struct {
      void* data;           // pointer to network plugin defined event
    } netPlugin;
  };
-} ncclProfilerEventDescr_v4_t;
+} ncclProfilerEventDescr_v5_t;
 ```

-NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`,
-`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`, `ncclProfileKernelCh` and
-`ncclProfileNetPlugin`.
+NCCL defines the following events: `ncclProfileGroupApi`, `ncclProfileCollApi`, `ncclProfileP2pApi`, `ncclProfileKernelLaunch`,
+`ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`, `ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`,
+`ncclProfileKernelCh` and `ncclProfileNetPlugin`.

 #### stopEvent

@@ -213,10 +242,10 @@ handle after `eventStop` is undefined behavior.

 #### recordEventState

-Some events can only be started and stopped. For example, `ncclProfileGroup`, `ncclProfileColl`,
-`ncclProfileP2p`, cannot be updated through calls to `recordEventState`.
+Some events can only be started and stopped. For example, `ncclProfileP2pApi`, `ncclProfileCollApi`, `ncclProfileGroup`,
+`ncclProfileColl`, `ncclProfileP2p` cannot be updated through calls to `recordEventState`.

-`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileNetPlugin`, `ncclProfileKernelCh`, and
+`ncclProfileGroupApi`, `ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileNetPlugin`, `ncclProfileKernelCh`, and
 `ncclProfileProxyCtrl` can be updated through calls to `recordEventState`.

 The state of these events can be updated, along with event attributes, using `recordEventState`.
@@ -258,9 +287,21 @@ typedef enum {

  // ncclProfileKernelCh event states
  ncclProfilerKernelChStop             = 22,// state marks stop of kernelCh event and timestamp update
-} ncclProfilerEventState_v4_t;
+
+  // Group API States
+  ncclProfilerGroupStartApiStop        = 23,// state marks the end of a ncclGroupStart() API call
+  ncclProfilerEndGroupApiStart         = 24 // state marks the start of a ncclGroupEnd() API call
+} ncclProfilerEventState_v5_t;
 ```

+NCCL profile API events are generated when the API calls are made, right after NCCL checks
+for graph capture information. They are the parents of collective, point-to-point and kernel launch events
+and persist across multiple operations in a group.
+
+`ncclProfileKernelLaunch` events are generated when the CUDA call to a kernel launch is made. In the
+case of graph capture, the event start indicates that the kernel launch operation has been recorded,
+not launched.
+
 `ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing
 network requests for the GPU kernel. ProxyOp events are generated for every active channel and
 provide a summary of the activity of the proxy progress thread for that channel. Most of the
@@ -379,7 +420,7 @@ typedef union {
  struct {                // attribute to update for ncclProfileKernelCh events
    uint64_t pTimer;      // timestamp provided by the NCCL kernel
  } kernelCh;
-} ncclProfilerEventStateArgs_v4_t;
+} ncclProfilerEventStateArgs_v5_t;
 ```

 The example profiler in `ext-profiler/example` contains details on how to capture and use the events above.
@@ -389,27 +430,33 @@ The example profiler in `ext-profiler/example` contains details on how to captur
 NCCL core events (reported above) are organized into a hierarchy as reported below:

 ```
-Group event
+Group API event
   |
-   +- Collective event
+   +- Collective API event
   |  |
-   |  +- ProxyOp event
-   |  |  |
-   |  |  +- ProxyStep event
-   |  |     |
-   |  |     +- NetPlugin event
-   |  |
-   |  +- KernelCh event
+   |  +- Collective event
+   |     |
+   |     +- ProxyOp event
+   |     |  |
+   |     |  +- ProxyStep event
+   |     |     |
+   |     |     +- NetPlugin event
+   |     |
+   |     +- KernelCh event
   |
-   +- Point-to-point event
-      |
-      +- ProxyOp event
-      |  |
-      |  +- ProxyStep event
-      |     |
-      |     +- NetPlugin event
-      |
-      +- KernelCh event
+   +- Point-to-point API event
+   |  |
+   |  +- Point-to-point event
+   |     |
+   |     +- ProxyOp event
+   |     |  |
+   |     |  +- ProxyStep event
+   |     |     |
+   |     |     +- NetPlugin event
+   |     |
+   |     +- KernelCh event
+   |
+   +- Kernel Launch event

 ProxyCtrl event
 ```
@@ -0,0 +1,34 @@
+# Source files of the example profiler plugin (C++, listed explicitly)
+set(SRC_FILES
+    ${CMAKE_CURRENT_SOURCE_DIR}/plugin.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/print_event.cc
+)
+
+# Create shared library
+add_library(nccl-profiler-example SHARED ${SRC_FILES})
+
+# Set include directories
+target_include_directories(nccl-profiler-example PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/nccl
+    ${CUDAToolkit_INCLUDE_DIRS}
+)
+
+# Set output name to match Makefile
+set_target_properties(nccl-profiler-example PROPERTIES
+    OUTPUT_NAME "nccl-profiler-example"
+    PREFIX "lib"
+    POSITION_INDEPENDENT_CODE ON
+    LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
+)
+
+add_custom_command(TARGET nccl-profiler-example POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/test/unit/plugins
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/lib/libnccl-profiler-example.so ${CMAKE_BINARY_DIR}/test/unit/plugins
+)
+
+# Add custom target for clean (equivalent to Makefile clean target)
+add_custom_target(clean-profiler-lib
+    COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_BINARY_DIR}/lib/libnccl-profiler-example.so
+    COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_BINARY_DIR}/test/unit/plugins/libnccl-profiler-example.so
+    COMMENT "Cleaning libnccl-profiler-example.so"
+)
@@ -4,19 +4,26 @@
 # See LICENSE.txt for license information
 #
 .DEFAULT_GOAL: build
-include ../../makefiles/common.mk
-SRCDIR   ?= $(abspath ../..)
+ROCM_PATH ?= $(wildcard /opt/rocm)
+CXX = $(ROCM_PATH)/lib/llvm/bin/amdclang++
 BUILDDIR ?= .
-NCCLDIR  := $(BUILDDIR)
+HIPIFY_DIR := hipify-profiler

-SRC_FILES := $(wildcard *.c)
+SRC_FILES := $(wildcard *.cc)
+HIPIFY_SRC := $(addprefix $(HIPIFY_DIR)/,$(SRC_FILES))

-build: ${BUILDDIR}/librccl-profiler.so
+build: ${BUILDDIR}/librccl-profiler-example.so

-${BUILDDIR}/librccl-profiler.so: ${SRC_FILES}
+${BUILDDIR}/librccl-profiler-example.so: $(HIPIFY_SRC)
 	@printf "Compiling  %-35s > %s\n" $< $@
 	@mkdir -p ${BUILDDIR}
-	$(CC) -Inccl -fPIC -shared -o $@ $^
+	$(CXX) -D__HIP_PLATFORM_AMD__ -I$(HIPIFY_DIR) -I$(HIPIFY_DIR)/nccl -I$(ROCM_PATH)/include -fPIC -shared -o $@ $^
+
+$(HIPIFY_DIR)/%.cc: %.cc
+	@mkdir -p $(HIPIFY_DIR)/nccl
+	@cp *.cc *.h $(HIPIFY_DIR)/
+	@cp nccl/*.h $(HIPIFY_DIR)/nccl/
+	@hipify-perl -inplace -quiet-warnings $(HIPIFY_DIR)/*.cc $(HIPIFY_DIR)/*.h

 clean:
-	rm -f ${BUILDDIR}/librccl-profiler.so
+	rm -rf ${BUILDDIR}/librccl-profiler-example.so $(HIPIFY_DIR)
@@ -13,8 +13,7 @@ change the size of the event window the profiler keeps track of.

 ## Building the profiler plugin

-To use the example plugin, just type `make`. You will need a NCCL build's include directory present.
-You can override `NCCL_HOME` to where the NCCL installation is on your system.
+To build the example plugin shipped as part of NCCL, just type `make`.

 ## Using the profiler plugin

@@ -27,13 +26,13 @@ You can override `NCCL_HOME` to where the NCCL installation is on your system.

   As an example, setting:

-   `NCCL_PROFILE_EVENT_MASK` to 1 (`ncclProfileGroup`) | 2 (`ncclProfileColl`) | 8 (`ncclProfileProxyOp`)
+   `NCCL_PROFILE_EVENT_MASK` to 256 (`ncclProfileGroupApi`) | 2 (`ncclProfileColl`) | 8 (`ncclProfileProxyOp`)

-   enables the profiling of the group, the collective and the proxy op events. The same events can be
+   enables the profiling of the group API, the collective and the proxy op events. The same events can be
   expressed more concisely by setting `NCCL_PROFILE_EVENT_MASK` to 8 (`ncclProfileProxyOp`). Indeed,
   in NCCL all the events above (in the event hierarchy) the one requested are also captured. The advantage
   is that the profiler can easily correlate events that belong to the same NCCL operation and present
-   them accordingly.
+   them accordingly. Setting `NCCL_PROFILE_EVENT_MASK` to 4095 enables all events supported by the v5 profiler.

 3. Set `NCCL_PROFILE_DUMP_FILE` to the name of the dump file for the collected traces. A file named
   ${NCCL_PROFILE_DUMP_FILE}-hostname-tid.txt is created. Profiler traces are saved using the chrome
@@ -57,11 +56,14 @@ The group, collective and p2p pools contain objects for the corresponding events
 contains objects for `ProxyCtrl` events and the `ProxyDetach` pool contains objects for `ProxyOp` events
 generated by remote proxies. A list of pools and their size is reported below:

- `NCCL_PROFILE_GROUP_POOL_SIZE` (16)
- `NCCL_PROFILE_COLL_POOL_SIZE` (16)
- `NCCL_PROFILE_P2P_POOL_SIZE` (1024)
+- `NCCL_PROFILE_GROUP_API_POOL_SIZE` (256)
+- `NCCL_PROFILE_COLL_API_POOL_SIZE` (256)
+- `NCCL_PROFILE_P2P_API_POOL_SIZE` (256)
+- `NCCL_PROFILE_KERNEL_LAUNCH_POOL_SIZE` (256)
+- `NCCL_PROFILE_COLL_POOL_SIZE` (256)
+- `NCCL_PROFILE_P2P_POOL_SIZE` (256)
 - `NCCL_PROFILE_PROXY_CTRL_POOL_SIZE` (16)
- `NCCL_PROFILE_PROXY_DETACH_POOL_SIZE` (128)
+- `NCCL_PROFILE_PROXY_DETACH_POOL_SIZE` (256)

 Remote proxy operations are generated when PXN is in use. Refer to this article for more information
 about PXN and how it works:
@@ -73,76 +75,58 @@ The example profiler generates traces using the json format. An example of trace

 ```
 [
-{"name": "Group", "cat": "GROUP", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764234.611328, "args": {"groupId": 0}},
-{"name": "AllReduce", "cat": "COLL", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764237.294922, "args": {"SeqNum": 0, "CommHash": 673864846479792718, "Rank": 1, "Count": 32768, "Datatype": "ncclFloat32", "Algorithm": "RING", "Protocol": "LL", "nMaxChannels": 2}},
-{"name": "Recv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768464.936523, "args": {"Channel": 0, "Peer": 0, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 772020.300781}, "RECEIVED": {"step": 14, "ts": 772196.049805}, "TRANSMITTED": {"step": 14, "ts": 772197.326172}, "DONE": {"step": 14, "ts": 772201.538086}}},
-{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768465.158203, "args": {"Step": 0}},
-{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805},
-{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805, "args": {"Step": 0}},
-{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266},
-{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266, "args": {"Step": 0}},
-{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805},
-{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805, "args": {"Step": 0}},
-{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768568.276367},
-{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768503.604492, "args": {"Step": 1}},
-{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805},
-{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805, "args": {"Step": 1}},
-{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234},
-{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234, "args": {"Step": 1}},
-{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695},
-{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695, "args": {"Step": 1}},
-{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 770006.914062},
-{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768506.941406, "args": {"Step": 2}},
-{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547},
-{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547, "args": {"Step": 2}},
-{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133},
-{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133, "args": {"Step": 2}},
-{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547},
-{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547, "args": {"Step": 2}},
-{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771468.458008},
-{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768509.484375, "args": {"Step": 3}},
-{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000},
-{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000, "args": {"Step": 3}},
-{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023},
-{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023, "args": {"Step": 3}},
-{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211},
-{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211, "args": {"Step": 3}},
-{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771910.500000},
-{"name": "Send", "cat": "PROXY", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768482.878906, "args": {"Channel": 0, "Peer": 2, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 771995.675781}, "REM_FIFO_WAIT": {"step": 14, "ts": 772190.692383}, "TRANSMITTED": {"step": 14, "ts": 772191.516602}, "DONE": {"step": 14, "ts": 772208.473633}}},
-{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.019531, "args": {"Step": 0}},
-{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781},
-{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781, "args": {"Step": 0}},
-{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234},
-{"name": "SendWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234, "args": {"Step": 0}},
-{"name": "SendWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769618.889648},
-{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.083008, "args": {"Step": 1}},
-{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086},
-{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086, "args": {"Step": 1}},
-{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664},
-{"name": "SendWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664, "args": {"Step": 1}},
-{"name": "SendWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769622.517578},
-{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768507.937500, "args": {"Step": 2}},
-{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578},
-{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578, "args": {"Step": 2}},
-{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883},
-{"name": "SendWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883, "args": {"Step": 2}},
-{"name": "SendWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770013.848633},
-{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.742188, "args": {"Step": 3}},
-{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266},
-{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266, "args": {"Step": 3}},
-{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477},
-{"name": "SendWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477, "args": {"Step": 3}},
-{"name": "SendWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771469.171875},
+{"name": "Group API", "cat": "GROUP_API", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 3433.595001, "args": {"groupApiId": 0, "groupDepth":1}},
+{"name": "KernelLaunch", "cat": "KERNEL_LAUNCH", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 0.000000, "args": {"groupId": 0, "Stream": 0x5020000567d0}},
+{"name": "KernelLaunch", "cat": "KERNEL_LAUNCH", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 111991.558990},
+{"name": "AllReduce", "cat": "COLL_API", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 0.000000, "args": {"count": 262144, "datatype": ncclFloat32, "root": 0, "GraphCaptured":0, "Stream": 0x5020000567d0}},
+{"name": "AllReduce", "cat": "COLL", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 111994.477997, "args": {"SeqNum": 0, "CommHash": 1493613951195738943, "Rank": 0, "Count": 262144, "Datatype": "ncclFloat32", "Algorithm": "RING", "Protocol": "SIMPLE", "nChannels": 2}},
+{"name": "KernelCh", "cat": "GPU", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119711.888000, "args": {"Channel": 0, "StartGpuClk": 1756135989724672000, "StopGpuClk": 1756135989732831232}},
+{"name": "ScheduleRecv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119652.709991, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}},
+{"name": "ScheduleRecv", "cat": "PROXY", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 119686.300995},
+{"name": "ProgressRecv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119686.300995, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}},
+{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119707.677979, "args": {"Step": 0}},
+{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 119807.691986},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119807.691986, "args": {"Step": 0}},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 119867.338989},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119867.338989, "args": {"Step": 0}},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 120120.983002},
+{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119733.647980, "args": {"Step": 1}},
+{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 119844.401001},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119844.401001, "args": {"Step": 1}},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 119890.567993},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119890.567993, "args": {"Step": 1}},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 120121.129974},
+{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 2, "pid": 225798, "tid": 1, "ts": 119753.023987, "args": {"Step": 2}},
+{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 2, "pid": 225798, "tid": 1, "ts": 120038.847992},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 2, "pid": 225798, "tid": 1, "ts": 120038.847992, "args": {"Step": 2}},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 2, "pid": 225798, "tid": 1, "ts": 120085.685974},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 2, "pid": 225798, "tid": 1, "ts": 120085.685974, "args": {"Step": 2}},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 2, "pid": 225798, "tid": 1, "ts": 120121.244995},
+{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 3, "pid": 225798, "tid": 1, "ts": 119772.510986, "args": {"Step": 3}},
+{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 3, "pid": 225798, "tid": 1, "ts": 120062.944977},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 3, "pid": 225798, "tid": 1, "ts": 120062.944977, "args": {"Step": 3}},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 3, "pid": 225798, "tid": 1, "ts": 120101.089996},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 3, "pid": 225798, "tid": 1, "ts": 120101.089996, "args": {"Step": 3}},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 3, "pid": 225798, "tid": 1, "ts": 120165.115997},
+{"name": "ProgressRecv", "cat": "PROXY", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 120165.356995},
+{"name": "ScheduleSend", "cat": "PROXY", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119656.950989, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}},
+{"name": "ScheduleSend", "cat": "PROXY", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 119709.078979},
+{"name": "ProgressSend", "cat": "PROXY", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119709.078979, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}},
+{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 4, "pid": 225798, "tid": 1, "ts": 119710.632996, "args": {"Step": 0}},
+{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 4, "pid": 225798, "tid": 1, "ts": 119808.636993},
+{"name": "SendPeerWait", "cat": "NET", "ph": "b", "id": 4, "pid": 225798, "tid": 1, "ts": 119808.636993, "args": {"Step": 0}},
+{"name": "SendPeerWait", "cat": "NET", "ph": "e", "id": 4, "pid": 225798, "tid": 1, "ts": 119818.972992},
 ... [ trace truncated for brevity ]
-{"name": "AllReduce", "cat": "COLL", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.317383},
-{"name": "Group", "cat": "GROUP", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.418945},
+{"name": "AllReduce", "cat": "COLL", "ph": "e", "id": 17, "pid": 225798, "tid": 1, "ts": 170633.535980},
+{"name": "AllReduce", "cat": "COLL_API", "ph": "e", "id": 17, "pid": 225798, "tid": 1, "ts": 170582.923981},
+{"name": "Group API", "cat": "GROUP_API", "ph": "e", "id": 17, "pid": 225798, "tid": 1, "ts": 170637.582001},
 {}]
 ```

 Details about the fields used in the trace can be found at this link:
 https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview?tab=t.0#heading=h.yr4qxyxotyw

-The trace above is obtained by running a `ncclAllReduce` operation on 8 GPUs, communicating with each other through
+The trace above is obtained by running a `ncclAllReduce` operation on 2 GPUs, communicating with each other through
 the network interface. The `Group` event encloses all traces that are related to the single `ncclAllReduce` call.
 (Note that for single collective invocations, where there are no explicit group calls, NCCL creates a group with only
 one collective and this is what is presented in the traces above).
@@ -161,38 +145,17 @@ The `AllReduce` entry presents information about the `ncclAllReduce` operation.
 - datatype    : NCCL datatype
 - algorithm   : algorithm used to process the ncclAllReduce
 - protocol    : protocol used to process the ncclAllReduce
- nMaxChannels: max number of channels used to process the ncclAllReduce
+- nChannels   : number of channels used to process the ncclAllReduce

 If the proxy events are not active (e.g., the `ncclAllReduce` is intranode) the end timestamp will match the time
 consumed by the CPU to launch the collective. For more details refer to `ext-profiler/README.md`, section `Profiling
 of collective and p2p operations`.

-### Proxy Send
-The `Send` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following
-info in the args field:
-
- Channel      : id of the channel used by this proxy operation to send data to the peer
- Peer         : peer rank
- Steps        : number of network steps required to transfer transSize bytes to the peer
- ChunkSize    : chunk size used by NCCL to pipeline data through the proxy thread
- transSize    : bytes transferred across the channel by this proxy operation
- POSTED       : struct containing the number of buffer posts to the GPU and the time stamp for the last post
- REM_FIFO_WAIT: struct containing the number of remote buffer waits and the time stamp for the last wait
- TRANSMITTED  : struct containing the number of network sends and the time stamp of the last send
- DONE         : struct containing the number of network sends completed and the time stamp of the last send completed
-
-In case of a network problem the POSTED, REM_FIFO_WAIT, TRANSMITTED and DONE might all have partially updated steps,
-which could help identify at which point the network problem occurred.
-
 The Proxy send trace gives a summary of the proxy progress thread activity for the channel. If more details are
 needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`). In which case the trace
 entries below are also reported by the profiler.

-#### Proxy SendBufferWait
-
-Presents, for every network step, the time the CPU proxy spends waiting for the channel staging buffer to become available.
-
-#### Proxy SendGPUWait
+#### Proxy SendGpuWait

 Presents, for every network step, the time the CPU proxy spends waiting for the GPU to provide the data in the staging
 buffer.
@@ -201,31 +164,6 @@ buffer.

 Presents, for every network step, the time the CPU proxy spends waiting for the `isend` to complete

-### Proxy Recv
-
-The `Recv` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following
-info in the args field:
-
- Channel    : id of the channel used by this proxy operation to recv data from the peer
- Peer       : peer rank
- Steps      : number of network steps required to transfer transSize bytes from the peer
- ChunkSize  : chunk size used by NCCL to pipeline data through the proxy thread
- transSize  : bytes transferred across the channel by this proxy operation
- POSTED     : struct containing the number of recvs posted and the time stamp for the last recv posted
- RECEIVED   : struct containing the number of recvs completed and the time stamp for the last recv completed
- TRANSMITTED: struct containing the number of recvs flushed to the GPU memory and the time stamp for the last recv flushed
- DONE       : struct containing the number of flush completed and the time stamp for the last flush completed
-
-The Proxy Recv trace gives a summary of the proxy progress thread activity for the channel. If more details are
-needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`). In which case the trace
-entries below are also reported by the profiler.
-
-
-#### Proxy RecvBufferWait
-
-Presents, for every network step, the time the CPU proxy spends waiting for the staging buffer for the channel to
-become available.
-
 #### Proxy RecvWait

 Presents, for every network step, the time the CPU proxy spends waiting for a posted `irecv` to complete
@@ -234,6 +172,6 @@ Presents, for every network step, the time the CPU proxy spends waiting for a po

 Presents, for every network step, the time the CPU proxy spends waitng for the recv data to be flushed to the GPU

-#### Proxy RecvGPUWait
+#### Proxy RecvGpuWait

 Presents, for every network step, the time the CPU proxy spends waiting for the GPU to consume the recv data
@@ -1,30 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include <stdio.h>
-#include "event.h"
-
-int taskEventQueueEmpty(struct group* g) {
-  return g->eventHead == NULL;
-}
-
-void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event) {
-  event->next = NULL;
-  if (g->eventHead) g->eventTail->next = event;
-  else g->eventHead = event;
-  g->eventTail = event;
-}
-
-struct taskEventBase* taskEventQueueHead(struct group* g) {
-  return g->eventHead;
-}
-
-struct taskEventBase* taskEventQueueDequeue(struct group* g) {
-  struct taskEventBase* tmp = g->eventHead;
-  g->eventHead = g->eventHead->next;
-  if (g->eventHead == NULL) g->eventTail = NULL;
-  return tmp;
-}
@@ -10,10 +10,14 @@
 #include <sys/types.h>
 #include <stdint.h>
 #include <unistd.h>
+#include <cstring>
+#include "err.h"
 #include "profiler.h"
+#include "queue.h"
+#include <cuda_runtime.h>

 #define MAX_CHANNELS                     128 // Match RCCL's MAXCHANNELS
-#define MAX_STEPS                        16
+#define MAX_STEPS                        1024
 #define MAX_OPS                          16 // Up to 64K ranks for PAT
 #define MAX_EVENTS_PER_REQ               (8)

@@ -21,7 +25,7 @@ struct proxyOp;
 struct proxyStep;

 struct netPlugin {
-  uint8_t type;
+  uint64_t type;
  int pluginType;
  int pluginVer;
  uint8_t pluginEvent;
@@ -63,7 +67,7 @@ struct kernelCh {
 #define PROXY_STEP_MAX_STATES 3

 struct proxyStep {
-  uint8_t type;                     // type of event: network transfer
+  uint64_t type;                     // type of event: network transfer
  int state;
  int step;                         // network transfer id in given channel
  int isSend;                       // send/recv channel operation
@@ -76,7 +80,7 @@ struct proxyStep {
 };

 struct proxyOp {
-  uint8_t type;                     // type of event: proxy operation
+  uint64_t type;                     // type of event: proxy operation
  uint8_t channelId;                // channel id for this proxy operation
  pid_t pid;
  int rank;
@@ -97,7 +101,7 @@ struct group;
 struct context;

 struct proxyCtrl {
-  uint8_t type;
+  uint64_t type;
  struct context* ctx;              // profiler context
  double startTs;
  double stopTs;
@@ -107,12 +111,12 @@ struct proxyCtrl {

 // task level event base structure
 struct taskEventBase {
-  uint8_t type;                     // event type: collective/p2p
+  uint64_t type;                     // event type: collective/p2p
  int rank;                         // rank of the operation in NCCL communicator
  const char* func;                 // ncclFunc*
  int refCount;                     // number of references for this operation
-  struct group* parent;             // parent event group
-  struct taskEventBase* next;       // next top level event in group
+  void* parent;                     // parent API event
+  struct taskEventBase* next;       // next top level event
  double startTs;
  double stopTs;
 };
@@ -147,7 +151,7 @@ struct p2p {
 };

 struct group {
-  uint8_t type;
+  uint64_t type;
  struct context* ctx;              // profiler context
  int groupId;
  int refCount;
@@ -158,6 +162,70 @@ struct group {
  struct group* next;               // next group event in queue
 };

+struct collApi {                    // collective API event; owns a queue of child task events
+  uint64_t type;                    // event type tag; set to ncclProfileCollApi when the event starts
+  struct groupApi* parent;          // enclosing group API event (taken from the descriptor's parentObj)
+  struct context* ctx;              // profiler context
+  int collApiId;                    // value fetched from ctx->collApiPoolIndex when the event starts
+  int refCount;                     // bumped per child collective task; updateEvent decrements it and records stopTs at 0
+  cudaStream_t stream;              // stream handle copied (and cast) from the event descriptor
+  const char* func;                 // copied from eDescr->collApi.func
+  size_t count;                     // copied from eDescr->collApi.count
+  const char* datatype;             // copied from eDescr->collApi.datatype
+  int root;                         // copied from eDescr->collApi.root
+  bool graphCaptured;               // copied from eDescr->collApi.graphCaptured
+  struct taskEventBase* eventHead;  // queue head for task events
+  struct taskEventBase* eventTail;  // queue tail for task events
+  double startTs;
+  double stopTs;                    // written by updateEvent once refCount drops to zero
+  struct collApi* next;             // intrusive link used by groupApi::collApiEvents
+};
+
+struct p2pApi {                     // point-to-point API event; owns a queue of child task events
+  uint64_t type;                    // event type tag; set to ncclProfileP2pApi when the event starts
+  struct groupApi* parent;          // enclosing group API event (taken from the descriptor's parentObj)
+  struct context* ctx;              // profiler context
+  int p2pApiId;                     // value fetched from ctx->p2pApiPoolIndex when the event starts
+  int refCount;                     // updateEvent decrements it and records stopTs when it reaches 0
+  const char* func;                 // copied from eDescr->p2pApi.func
+  cudaStream_t stream;              // stream handle copied (and cast) from the event descriptor
+  size_t count;                     // copied from eDescr->p2pApi.count
+  const char* datatype;             // copied from eDescr->p2pApi.datatype
+  bool graphCaptured;               // copied from eDescr->p2pApi.graphCaptured
+  struct taskEventBase* eventHead;  // queue head for task events
+  struct taskEventBase* eventTail;  // queue tail for task events
+  double startTs;
+  double stopTs;                    // written by updateEvent once refCount drops to zero
+  struct p2pApi* next;              // intrusive link used by groupApi::p2pApiEvents
+};
+
+struct kernelLaunch {               // kernel launch event, queued on its parent group API event
+  uint64_t type;                    // event type tag; set to ncclProfileKernelLaunch when the event starts
+  struct groupApi* parent;          // enclosing group API event (taken from the descriptor's parentObj)
+  cudaStream_t stream;              // stream handle copied (and cast) from eDescr->kernelLaunch.stream
+  int kernelLaunchId;
+  double startTs;
+  double stopTs;
+  struct kernelLaunch* next;        // intrusive link used by groupApi::kernelLaunchEvents
+};
+
+struct groupApi {                   // top-level group API event; parent of coll/p2p API and kernel launch events
+  uint64_t type;                    // event type tag; set to ncclProfileGroupApi when the event starts
+  struct context* ctx;              // profiler context
+  int groupApiId;                   // value fetched from ctx->groupApiPoolIndex when the event starts
+  int refCount;                     // bumped per child API/kernel-launch event; updateEvent decrements it
+  bool graphCaptured;               // copied from eDescr->groupApi.graphCaptured
+  int groupDepth;                   // copied from eDescr->groupApi.groupDepth
+  struct profilerQueue<struct p2pApi, &p2pApi::next> p2pApiEvents;                   // drained when this pool slot is reused
+  struct profilerQueue<struct collApi, &collApi::next> collApiEvents;                // drained when this pool slot is reused
+  struct profilerQueue<struct kernelLaunch, &kernelLaunch::next> kernelLaunchEvents; // drained when this pool slot is reused
+  double endOfncclGroupStartTs;     // NOTE(review): presumably stamped on ncclProfilerEndGroupApiStart - confirm
+  double startOfncclGroupEndTs;     // NOTE(review): presumably stamped on ncclProfilerBeginGroupApiEnd - confirm
+  double startTs;                   // gettime() - startTime taken when the event starts
+  double stopTs;                    // written by updateEvent once refCount drops to zero
+  struct groupApi* next;
+};
+
 // arrays for different event objects
 struct context {
  const char* commName;
@@ -165,6 +233,26 @@ struct context {
  int nranks;
  int rank;

+  int groupApiPoolSize;
+  int groupApiPoolBase;
+  int groupApiPoolIndex;
+  struct groupApi* groupApiPool;
+
+  int collApiPoolSize;
+  int collApiPoolBase;
+  int collApiPoolIndex;
+  struct collApi* collApiPool;
+
+  int p2pApiPoolSize;
+  int p2pApiPoolBase;
+  int p2pApiPoolIndex;
+  struct p2pApi* p2pApiPool;
+
+  int kernelLaunchPoolSize;
+  int kernelLaunchPoolBase;
+  int kernelLaunchPoolIndex;
+  struct kernelLaunch* kernelLaunchPool;
+
  int groupPoolSize;
  int groupPoolBase;
  int groupPoolIndex;
@@ -186,9 +274,50 @@ struct context {
  struct proxyCtrl* proxyCtrlPool;
 };

-int taskEventQueueEmpty(struct group* g);
-void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event);
-struct taskEventBase* taskEventQueueHead(struct group* g);
-struct taskEventBase* taskEventQueueDequeue(struct group* g);
+template <typename T>                   // T: any event struct exposing an eventHead member
+inline int taskEventQueueEmpty(T *obj) { // returns non-zero when obj has no queued task events
+  return obj->eventHead == NULL;
+}
+
+template <typename T>                   // T: any event struct exposing eventHead/eventTail members
+inline void taskEventQueueEnqueue(T* obj, struct taskEventBase* event) { // appends event at the tail of obj's queue
+  event->next = NULL;                   // the new element always becomes the last one
+  if (obj->eventHead) obj->eventTail->next = event; // non-empty queue: link after the current tail
+  else obj->eventHead = event;          // empty queue: event is also the head
+  obj->eventTail = event;
+}
+
+template <typename T>                   // T: any event struct exposing an eventHead member
+inline struct taskEventBase* taskEventQueueHead(T *obj) { // peeks at the first queued event without removing it
+    return obj->eventHead;              // NULL when the queue is empty
+}
+
+template <typename T>                   // T: any event struct exposing eventHead/eventTail members
+inline struct taskEventBase* taskEventQueueDequeue(T* obj) { // removes and returns the first queued event
+  struct taskEventBase* tmp = obj->eventHead;
+  obj->eventHead = obj->eventHead->next; // dereferences the head: callers must check the queue is non-empty first
+  if (obj->eventHead == NULL) obj->eventTail = NULL; // queue became empty, so clear the tail as well
+  return tmp;
+}
+
+template <typename T>                   // T: any event struct usable with the taskEventQueue* helpers
+inline void resetTaskEvents(T *obj, struct context* ctx) { // drains obj's task-event queue and releases each entry to its pool
+  while (!taskEventQueueEmpty(obj)) {
+    struct taskEventBase* base = taskEventQueueDequeue(obj);
+    if (base->type == ncclProfileColl) {
+      struct collective* c = (struct collective *)base;
+      // reset event proxyOps & proxySteps
+      memset(c->nProxyOps, 0, sizeof(int)*MAX_CHANNELS);
+      // release collective events in the group and return them to the collective pool
+      __atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED);
+    } else if (base->type == ncclProfileP2p) {
+      struct p2p* p = (struct p2p *)base;
+      // reset event proxyOp and proxySteps
+      memset(&p->op, 0, sizeof(struct proxyOp)*MAX_CHANNELS);
+      // release p2p events in the group and return them to the p2p pool
+      __atomic_fetch_add(&ctx->p2pPoolBase, 1, __ATOMIC_RELAXED);
+    } // entries of any other type are dequeued without further action
+  }
+}

 #endif
@@ -11,17 +11,20 @@
 #include <stdlib.h>

 #include "common.h"
-#include "err.h"

 enum {
-  ncclProfileGroup     = (1 << 0),  // group event type
-  ncclProfileColl      = (1 << 1),  // host collective call event type
-  ncclProfileP2p       = (1 << 2),  // host point-to-point call event type
-  ncclProfileProxyOp   = (1 << 3),  // proxy operation event type
-  ncclProfileProxyStep = (1 << 4),  // proxy step event type
-  ncclProfileProxyCtrl = (1 << 5),  // proxy control event type
-  ncclProfileKernelCh  = (1 << 6),  // kernel channel event type
-  ncclProfileNetPlugin = (1 << 7),  // network plugin-defined, events
+  ncclProfileGroup          = (1 << 0),  // group event type
+  ncclProfileColl           = (1 << 1),  // host collective call event type
+  ncclProfileP2p            = (1 << 2),  // host point-to-point call event type
+  ncclProfileProxyOp        = (1 << 3),  // proxy operation event type
+  ncclProfileProxyStep      = (1 << 4),  // proxy step event type
+  ncclProfileProxyCtrl      = (1 << 5),  // proxy control event type
+  ncclProfileKernelCh       = (1 << 6),  // kernel channel event type
+  ncclProfileNetPlugin      = (1 << 7),  // network plugin-defined, events
+  ncclProfileGroupApi       = (1 << 8),  // Group API events
+  ncclProfileCollApi        = (1 << 9),  // Collective API events
+  ncclProfileP2pApi         = (1 << 10), // Point-to-Point API events
+  ncclProfileKernelLaunch   = (1 << 11), // Kernel launch events
 };

 typedef enum {
@@ -56,21 +59,27 @@ typedef enum {

  /* Kernel event states */
  ncclProfilerKernelChStop             = 22,
+
+  /* Group API States */
+  ncclProfilerEndGroupApiStart         = 23,
+  ncclProfilerBeginGroupApiEnd         = 24
 } ncclProfilerEventState_t;

 typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
 typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
 typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
 typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t;
+typedef ncclProfilerEventState_t ncclProfilerEventState_v5_t;

+#include "profiler_v5.h"
 #include "profiler_v4.h"
 #include "profiler_v3.h"
 #include "profiler_v2.h"
 #include "profiler_v1.h"
 #include "profiler_net.h"

-typedef ncclProfiler_v4_t ncclProfiler_t;
-typedef ncclProfilerEventDescr_v4_t ncclProfilerEventDescr_t;
-typedef ncclProfilerEventStateArgs_v4_t ncclProfilerEventStateArgs_t;
+typedef ncclProfiler_v5_t ncclProfiler_t;
+typedef ncclProfilerEventDescr_v5_t ncclProfilerEventDescr_t;
+typedef ncclProfilerEventStateArgs_v5_t ncclProfilerEventStateArgs_t;

 #endif // end include guard
@@ -0,0 +1,152 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V5_H_
+#define PROFILER_V5_H_
+#include <stdbool.h>
+
+typedef struct {
+  uint64_t type;                // event type descriptor: ncclProfileGroupApi, ...
+  void* parentObj;              // pointer to the profiler parent object
+  int rank;                     // originating rank
+  union {                       // payload; the active member is selected by `type`
+    struct {                    // read when type == ncclProfileGroupApi
+      int graphCaptured;
+      int groupDepth;
+    } groupApi;
+
+    struct {                    // read when type == ncclProfileCollApi
+      const char* func;
+      size_t count;
+      const char* datatype;
+      int root;
+      void* stream;             // opaque stream handle; the example plugin casts it to cudaStream_t
+      bool graphCaptured;
+    } collApi;
+
+    struct {                    // read when type == ncclProfileP2pApi
+      const char* func;
+      size_t count;
+      const char* datatype;
+      void* stream;             // opaque stream handle; the example plugin casts it to cudaStream_t
+      bool graphCaptured;
+    } p2pApi;
+
+    struct {                    // read when type == ncclProfileKernelLaunch
+      void* stream;             // opaque stream handle; the example plugin casts it to cudaStream_t
+    } kernelLaunch;
+
+    struct {                    // read when type == ncclProfileColl
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      uint8_t nChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+      void* parentGroup; // for backward compatibility with v4
+    } coll;
+
+    struct {
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+      uint8_t nChannels;
+      void* parentGroup; // for backward compatibility with v4
+    } p2p;
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+
+    struct {
+      uint8_t channelId;
+      uint64_t pTimer;          // start timestamp from GPU globaltimer
+    } kernelCh;
+
+    struct {
+      int64_t id;
+      void* data;
+    } netPlugin;
+  };
+} ncclProfilerEventDescr_v5_t;
+
+typedef union {                 // optional payload passed to recordEventState alongside the state transition
+  struct {
+    size_t transSize;
+  } proxyStep;
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+
+  struct {
+    void* data;
+  } netPlugin;
+
+  struct {
+    uint64_t pTimer;
+  } kernelCh;
+} ncclProfilerEventStateArgs_v5_t;
+
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  //  - commId         : communicator id
+  //  - commName       : user assigned communicator name
+  //  - nNodes         : number of nodes in communicator
+  //  - nranks         : number of ranks in communicator
+  //  - rank           : rank identifier in communicator
+  //  - logfn          : logger function
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v5_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v5_t eState, ncclProfilerEventStateArgs_v5_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v5_t;
+
+#endif
@@ -6,7 +6,7 @@

 #include <stdio.h>
 #include <pthread.h>
-#include <string.h>
+#include <cstring>
 #include <linux/limits.h>
 #include <sys/time.h>
 #include <sys/types.h>
@@ -22,12 +22,20 @@ static int initialized;             // initialization counter for profiler
 static double startTime;            // profiler start time

 static const int defaultEActivationMask = ncclProfileColl | ncclProfileP2p;
-static const int defaultGroupPoolSize = 16;
-static const int defaultCollPoolSize = 16;
-static const int defaultP2pPoolSize = 1024;
+static const int defaultGroupApiPoolSize = 256;
+static const int defaultCollApiPoolSize = 256;
+static const int defaultP2pApiPoolSize = 256;
+static const int defaultKernelLaunchPoolSize = 256;
+static const int defaultGroupPoolSize = 256;
+static const int defaultCollPoolSize = 256;
+static const int defaultP2pPoolSize = 256;
 static const int defaultProxyCtrlPoolSize = 16;
-static const int defaultDetachPoolSize = 128;
+static const int defaultDetachPoolSize = 256;

+static int groupApiPoolSize;
+static int collApiPoolSize;
+static int p2pApiPoolSize;
+static int kernelLaunchPoolSize;
 static int groupPoolSize;
 static int collPoolSize;
 static int p2pPoolSize;
@@ -51,7 +59,7 @@ static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
 static pid_t pid;
 static int* eActivationMaskPtr;

-__hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) {
+__hidden ncclResult_t exampleProfilerInit(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) {
  pthread_mutex_lock(&lock);
  if (__atomic_fetch_add(&initialized, 1, __ATOMIC_RELAXED) == 0) {
    // first thread initializes event mask, environment and detach pool
@@ -59,6 +67,18 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask,
    str = getenv("NCCL_PROFILE_EVENT_MASK");
    __atomic_store_n(eActivationMask, str ? atoi(str) : 0, __ATOMIC_RELAXED);

+    str = getenv("NCCL_PROFILE_GROUP_API_POOL_SIZE");
+    groupApiPoolSize = str ? atoi(str) : defaultGroupApiPoolSize;
+
+    str = getenv("NCCL_PROFILE_COLL_API_POOL_SIZE");
+    collApiPoolSize = str ? atoi(str) : defaultCollApiPoolSize;
+
+    str = getenv("NCCL_PROFILE_P2P_API_POOL_SIZE");
+    p2pApiPoolSize = str ? atoi(str) : defaultP2pApiPoolSize;
+
+    str = getenv("NCCL_PROFILE_KERNEL_LAUNCH_POOL_SIZE");
+    kernelLaunchPoolSize = str ? atoi(str) : defaultKernelLaunchPoolSize;
+
    str = getenv("NCCL_PROFILE_GROUP_POOL_SIZE");
    groupPoolSize = str ? atoi(str) : defaultGroupPoolSize;

@@ -96,11 +116,23 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask,
  // pre-allocate memory for event object pools in dedicated profiler context
  struct context* ctx = (struct context *)calloc(1, sizeof(*ctx));
  ctx->commName = commName;
-  ctx->commHash = commHash;
+  ctx->commHash = commId;
  ctx->nranks = nranks;
  ctx->rank = rank;
  logFn = logfn;
-  INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d", commName ? commName : "", commHash, nranks, rank);
+  INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d", commName ? commName : "", commId, nranks, rank);
+
+  ctx->groupApiPool = (struct groupApi *)calloc(groupApiPoolSize, sizeof(*ctx->groupApiPool));
+  if (ctx->groupApiPool == NULL) goto fail;
+
+  ctx->collApiPool = (struct collApi *)calloc(collApiPoolSize, sizeof(*ctx->collApiPool));
+  if (ctx->collApiPool == NULL) goto fail;
+
+  ctx->p2pApiPool = (struct p2pApi *)calloc(p2pApiPoolSize, sizeof(*ctx->p2pApiPool));
+  if (ctx->p2pApiPool == NULL) goto fail;
+
+  ctx->kernelLaunchPool = (struct kernelLaunch *)calloc(kernelLaunchPoolSize, sizeof(*ctx->kernelLaunchPool));
+  if (ctx->kernelLaunchPool == NULL) goto fail;

  ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool));
  if (ctx->groupPool == NULL) goto fail;
@@ -130,16 +162,22 @@ fail:
  if (ctx->p2pPool) free(ctx->p2pPool);
  if (ctx->collPool) free(ctx->collPool);
  if (ctx->groupPool) free(ctx->groupPool);
+  if (ctx->collApiPool) free(ctx->collApiPool);
+  if (ctx->p2pApiPool) free(ctx->p2pApiPool);
+  if (ctx->kernelLaunchPool) free(ctx->kernelLaunchPool);
+  if (ctx->groupApiPool) free(ctx->groupApiPool);
  free(ctx);
  if (detachPool) free(detachPool);
  return ncclSystemError;
 }

+static const char* profilerDumpFile; // dump-file prefix; when non-NULL, exampleProfilerFinalize uses it instead of NCCL_PROFILE_DUMP_FILE
+
 __hidden ncclResult_t exampleProfilerFinalize(void* context) {
  FILE* fh = NULL;
  char filename[PATH_MAX] = { 0 };
  struct context* ctx = (struct context *)context;
-  const char* dump = getenv("NCCL_PROFILE_DUMP_FILE");
+  const char* dump = profilerDumpFile ? profilerDumpFile : getenv("NCCL_PROFILE_DUMP_FILE");
  if (dump) {
    sprintf(filename, "%s_%lu_%d.json", dump, ctx->commHash, ctx->rank);
    fh = fopen(filename, "w");
@@ -148,10 +186,12 @@ __hidden ncclResult_t exampleProfilerFinalize(void* context) {
  INFO(NCCL_INIT, "PROFILER/Plugin: finalize commName: %s commHash: %lu nranks: %d rank: %d", ctx->commName ? ctx->commName : "", ctx->commHash, ctx->nranks, ctx->rank);

  // print last N groups/collectives/p2ps
-  int start = (ctx->groupPoolIndex - groupPoolSize >= 0) ? ctx->groupPoolIndex - groupPoolSize : 0;
-  int end = ctx->groupPoolIndex;
+  // Note that since the v5 version of the profiler, group API events are now at the top of the hierarchy.
+  // Legacy Group events from v4 are still emitted for compatibility purposes when using the v4 profiler but excluded from this example.
+  int start = (ctx->groupApiPoolIndex - groupApiPoolSize >= 0) ? ctx->groupApiPoolIndex - groupApiPoolSize : 0;
+  int end = ctx->groupApiPoolIndex;
  for (int i = start; i < end; i++) {
-    printEvent(fh, &ctx->groupPool[i%groupPoolSize]);
+    printEvent(fh, &ctx->groupApiPool[i%groupApiPoolSize]);
  }

  start = (ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize >= 0) ? ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize : 0;
@@ -161,6 +201,10 @@ __hidden ncclResult_t exampleProfilerFinalize(void* context) {
  }

  free(ctx->groupPool);
+  free(ctx->collApiPool);
+  free(ctx->p2pApiPool);
+  free(ctx->kernelLaunchPool);
+  free(ctx->groupApiPool);
  free(ctx->collPool);
  free(ctx->p2pPool);
  free(ctx->proxyCtrlPool);
@@ -187,7 +231,113 @@ __hidden void updateEvent(void* handle);
 __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) {
  *eHandle = NULL;
  struct context* ctx = (struct context *)context;
-  if (eDescr->type == ncclProfileGroup) {
+  if (eDescr->type == ncclProfileGroupApi) {
+    struct groupApi* event;
+    int groupApiId = __atomic_fetch_add(&ctx->groupApiPoolIndex, 1, __ATOMIC_RELAXED);
+    if ((groupApiId - __atomic_load_n(&ctx->groupApiPoolBase, __ATOMIC_RELAXED)) < groupApiPoolSize) {
+      // if there are available group API events grab one
+      event = &ctx->groupApiPool[groupApiId%groupApiPoolSize];
+      // Make sure all child events of the picked group API event are cleared
+      while (!profilerQueueEmpty(&event->collApiEvents)) {
+        struct collApi *collApiEvent = profilerQueueDequeue(&event->collApiEvents);
+        resetTaskEvents(collApiEvent, ctx);
+        __atomic_fetch_add(&ctx->collApiPoolBase, 1, __ATOMIC_RELAXED);
+      }
+      while (!profilerQueueEmpty(&event->p2pApiEvents)) {
+        struct p2pApi *p2pApiEvent = profilerQueueDequeue(&event->p2pApiEvents);
+        resetTaskEvents(p2pApiEvent, ctx);
+        __atomic_fetch_add(&ctx->p2pApiPoolBase, 1, __ATOMIC_RELAXED);
+      }
+      while (!profilerQueueEmpty(&event->kernelLaunchEvents)) {
+        profilerQueueDequeue(&event->kernelLaunchEvents);
+        __atomic_fetch_add(&ctx->kernelLaunchPoolBase, 1, __ATOMIC_RELAXED);
+      }
+    } else {
+      // else drop this event
+      __atomic_fetch_sub(&ctx->groupApiPoolIndex, 1, __ATOMIC_RELAXED);
+      return ncclSuccess;
+    }
+    event->type = ncclProfileGroupApi;
+    event->ctx = ctx;
+    event->groupApiId = groupApiId;
+    event->graphCaptured = eDescr->groupApi.graphCaptured;
+    event->groupDepth = eDescr->groupApi.groupDepth;
+    event->startTs = gettime() - startTime;
+    *eHandle = event;
+  } else if (eDescr->type == ncclProfileCollApi) {
+    if (eDescr->parentObj == NULL) return ncclSuccess;
+    struct collApi* event;
+    int collApiId = __atomic_fetch_add(&ctx->collApiPoolIndex, 1, __ATOMIC_RELAXED);
+    if ((collApiId - __atomic_load_n(&ctx->collApiPoolBase, __ATOMIC_RELAXED)) < collApiPoolSize) {
+      // if there are available Coll API events grab one
+      event = &ctx->collApiPool[collApiId%collApiPoolSize];
+      resetTaskEvents(event, ctx);
+    } else {
+      // else drop this event
+      __atomic_fetch_sub(&ctx->collApiPoolIndex, 1, __ATOMIC_RELAXED);
+      return ncclSuccess;
+    }
+    event->type = ncclProfileCollApi;
+    event->collApiId = collApiId;
+    event->ctx = ctx;
+    event->func = eDescr->collApi.func;
+    event->stream = (cudaStream_t) eDescr->collApi.stream;
+    event->count = eDescr->collApi.count;
+    event->datatype = eDescr->collApi.datatype;
+    event->root = eDescr->collApi.root;
+    event->graphCaptured = eDescr->collApi.graphCaptured;
+    struct groupApi* parent = (struct groupApi *) eDescr->parentObj;
+    event->parent = parent;
+    profilerQueueEnqueue(&parent->collApiEvents, event);
+    __atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
+    *eHandle = event;
+  } else if (eDescr->type == ncclProfileP2pApi) {
+    if (eDescr->parentObj == NULL) return ncclSuccess;
+    struct p2pApi* event;
+    int p2pApiId = __atomic_fetch_add(&ctx->p2pApiPoolIndex, 1, __ATOMIC_RELAXED);
+    if ((p2pApiId - __atomic_load_n(&ctx->p2pApiPoolBase, __ATOMIC_RELAXED)) < p2pApiPoolSize) {
+      // if there are available p2p API events grab one
+      event = &ctx->p2pApiPool[p2pApiId%p2pApiPoolSize];
+      resetTaskEvents(event, ctx);
+    } else {
+      // else drop this event
+      __atomic_fetch_sub(&ctx->p2pApiPoolIndex, 1, __ATOMIC_RELAXED);
+      return ncclSuccess;
+    }
+    event->type = ncclProfileP2pApi;
+    event->p2pApiId = p2pApiId;
+    event->ctx = ctx;
+    event->func = eDescr->p2pApi.func;
+    event->stream = (cudaStream_t) eDescr->p2pApi.stream;
+    event->count = eDescr->p2pApi.count;
+    event->datatype = eDescr->p2pApi.datatype;
+    event->graphCaptured = eDescr->p2pApi.graphCaptured;
+    struct groupApi* parent = (struct groupApi *) eDescr->parentObj;
+    event->parent = parent;
+    profilerQueueEnqueue(&parent->p2pApiEvents, event);
+    __atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
+    *eHandle = event;
+  } else if (eDescr->type == ncclProfileKernelLaunch) {
+    if (eDescr->parentObj == NULL) return ncclSuccess;
+    struct kernelLaunch* event;
+    int kernelLaunchId = __atomic_fetch_add(&ctx->kernelLaunchPoolIndex, 1, __ATOMIC_RELAXED);
+    if ((kernelLaunchId - __atomic_load_n(&ctx->kernelLaunchPoolBase, __ATOMIC_RELAXED)) < kernelLaunchPoolSize) {
+      // if there are available kernel API events grab one
+      event = &ctx->kernelLaunchPool[kernelLaunchId%kernelLaunchPoolSize];
+    } else {
+      // else drop this event
+      __atomic_fetch_sub(&ctx->kernelLaunchPoolIndex, 1, __ATOMIC_RELAXED);
+      return ncclSuccess;
+    }
+    event->type = ncclProfileKernelLaunch;
+    event->stream = (cudaStream_t) eDescr->kernelLaunch.stream;
+    struct groupApi* parent = (struct groupApi *) eDescr->parentObj;
+    event->parent = parent;
+    profilerQueueEnqueue(&parent->kernelLaunchEvents, event);
+    __atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
+    *eHandle = event;
+  } else if (eDescr->type == ncclProfileGroup) {
+    if (eDescr->parentObj == NULL) return ncclSuccess;
    struct group* event;
    int groupId = __atomic_fetch_add(&ctx->groupPoolIndex, 1, __ATOMIC_RELAXED);
    if ((groupId - __atomic_load_n(&ctx->groupPoolBase, __ATOMIC_RELAXED)) < groupPoolSize) {
@@ -222,7 +372,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
    debugEvent(event, "GroupStart");
  } else if (eDescr->type == ncclProfileColl) {
    // the parent might be null if we run out of events
-    struct group* parent = (struct group *)eDescr->parentObj;
+    struct collApi* parent = (struct collApi *)eDescr->parentObj;
    if (parent == NULL) return ncclSuccess;

    struct collective* event;
@@ -253,12 +403,12 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
    event->proto = eDescr->coll.proto;
    *eHandle = event;
    taskEventQueueEnqueue(parent, (struct taskEventBase *)event);
-    // increment the group ref counter so the event will staty open
+    // increment the group ref counter so the event will stay open
    __atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
    debugEvent(event, "CollStart");
  } else if (eDescr->type == ncclProfileP2p) {
    // the parent might be null if we run out of events
-    struct group* parent = (struct group *)eDescr->parentObj;
+    struct p2pApi* parent = (struct p2pApi*) eDescr->parentObj;
    if (parent == NULL) return ncclSuccess;

    struct p2p* event;
@@ -458,8 +608,34 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
 }

 void updateEvent(void* handle) {
-  uint8_t type = *(uint8_t *)handle;
-  if (type == ncclProfileGroup) {
+  uint64_t type = *(uint64_t *)handle;
+  if (type == ncclProfileGroupApi) {
+    struct groupApi* event = (struct groupApi*) handle;
+    if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) {
+      event->stopTs = gettime() - startTime;
+      __atomic_fetch_add(&event->ctx->groupApiPoolBase, 1, __ATOMIC_RELAXED);
+    }
+  } else if (type == ncclProfileCollApi) {
+    struct collApi* event = (struct collApi*) handle;
+    if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) {
+      event->stopTs = gettime() - startTime;
+      __atomic_fetch_add(&event->ctx->collApiPoolBase, 1, __ATOMIC_RELAXED);
+    }
+    updateEvent(event->parent);
+    return;
+  } else if (type == ncclProfileP2pApi) {
+    struct p2pApi* event = (struct p2pApi*) handle;
+    if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) {
+      event->stopTs = gettime() - startTime;
+      __atomic_fetch_add(&event->ctx->p2pApiPoolBase, 1, __ATOMIC_RELAXED);
+    }
+    updateEvent(event->parent);
+    event->stopTs = gettime() - startTime;
+  } else if (type == ncclProfileKernelLaunch) {
+    struct kernelLaunch* event = (struct kernelLaunch*) handle;
+    event->stopTs = gettime() - startTime;
+    updateEvent(event->parent);
+  } else if (type == ncclProfileGroup) {
    struct group* event = (struct group *)handle;
    if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) {
      event->stopTs = gettime() - startTime;
@@ -527,25 +703,35 @@ __hidden ncclResult_t exampleProfilerStopEvent(void* eHandle) {
  // the event handle might be null if we run out of events
  if (eHandle == NULL) return ncclSuccess;

-  uint8_t type = *(uint8_t *)eHandle;
-  if (type == ncclProfileGroup) {
-    // stopping the group event in NCCL core does not
-    // mean the group has completed. It means the group
-    // was submitted/enqueued so we need to keep the event open
+  uint64_t type = *(uint64_t *)eHandle;
+  // Stopping API events, Kernel Launch events, collective/p2p task events
+  // in NCCL core do not mean that they are complete. It means that the
+  // operation was enqueued so we need to keep the events open
+  if (type == ncclProfileGroupApi) {
+    struct groupApi* event = (struct groupApi*) eHandle;
+    event->stopTs = gettime() - startTime;
+    return ncclSuccess;
+  } else if (type == ncclProfileCollApi) {
+    struct collApi* event = (struct collApi*) eHandle;
+    event->stopTs = gettime() - startTime;
+    return ncclSuccess;
+  } else if (type == ncclProfileP2pApi) {
+    struct p2pApi* event = (struct p2pApi*) eHandle;
+    event->stopTs = gettime() - startTime;
+    return ncclSuccess;
+  } else if (type == ncclProfileKernelLaunch) {
+    struct kernelLaunch* event = (struct kernelLaunch*) eHandle;
+    event->stopTs = gettime() - startTime;
+    return ncclSuccess;
+  } else if (type == ncclProfileGroup) {
    struct group* event = (struct group *)eHandle;
    event->stopTs = gettime() - startTime;
    return ncclSuccess;
  } else if (type == ncclProfileColl) {
-    // stopping the collective event in NCCL core does not
-    // mean the collective has completed. It means the collective
-    // was submitted/enqueued so we need to keep the event open
    struct collective* event = (struct collective *)eHandle;
    event->base.stopTs = gettime() - startTime;
    return ncclSuccess;
  } else if (type == ncclProfileP2p) {
-    // stopping the p2p event in NCCL core does not
-    // mean the p2p has completed. It means the p2p
-    // was submitted/enqueued so we need to keep the event open
    struct p2p* event = (struct p2p *)eHandle;
    event->base.stopTs = gettime() - startTime;
    return ncclSuccess;
@@ -559,8 +745,15 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile
  // the event handle might be null if we run out of events
  if (eHandle == NULL) return ncclSuccess;

-  uint8_t type = *(uint8_t *)eHandle;
-  if (type == ncclProfileProxyOp) {
+  uint64_t type = *(uint64_t *)eHandle;
+  if (type == ncclProfileGroupApi) {
+    struct groupApi* event = (struct groupApi*) eHandle;
+    if (eState == ncclProfilerEndGroupApiStart) {
+      event->endOfncclGroupStartTs = gettime() - startTime;
+    } else if (eState == ncclProfilerBeginGroupApiEnd) {
+      event->startOfncclGroupEndTs = gettime() - startTime;
+    }
+  } else if (type == ncclProfileProxyOp) {
    struct proxyOp* event = (struct proxyOp *)eHandle;
    if (eState == ncclProfilerProxyOpInProgress_v4) {
      event->progrTs = gettime() - startTime;
@@ -592,6 +785,8 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile
      case ncclProfilerProxyStepRecvGPUWait:
        event->timestamp[PROXY_STEP_RECV_GPU_WAIT] = gettime() - startTime;
        break;
+      default:
+        break;
    }
  } else if (type == ncclProfileProxyCtrl) {
    struct proxyCtrl* event = (struct proxyCtrl *)eHandle;
@@ -609,7 +804,7 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile
  return ncclSuccess;
 }

-ncclProfiler_t ncclProfiler_v4 = {
+ncclProfiler_t ncclProfiler_v5 = {
  "Example-profiler",
  exampleProfilerInit,
  exampleProfilerStartEvent,
@@ -618,14 +813,15 @@ ncclProfiler_t ncclProfiler_v4 = {
  exampleProfilerFinalize,
 };

-int exampleProfilerStart(int eActivationMask) {
+// Exported (default visibility) entry point that turns profiling on.
+// Records `name` in profilerDumpFile and, if the plugin has been initialized,
+// atomically publishes eActivationMask through eActivationMaskPtr.
+// Always returns ncclSuccess.
+// NOTE(review): `name` is assigned, not copied here — if profilerDumpFile is a
+// plain pointer the caller's string must outlive its use; confirm.
+__attribute__((visibility("default"))) int exampleProfilerStart(int eActivationMask, const char* name) {
+  profilerDumpFile = name;
  if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) {
    __atomic_store_n(eActivationMaskPtr, eActivationMask, __ATOMIC_RELAXED);
  }
  return ncclSuccess;
 }

-int exampleProfilerStop(void) {
+__attribute__((visibility("default"))) int exampleProfilerStop(void) {
  if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) {
    __atomic_store_n(eActivationMaskPtr, 0, __ATOMIC_RELAXED);
  }
@@ -7,7 +7,8 @@
 #ifndef PLUGIN_H_
 #define PLUGIN_H_

-int exampleProfilerStart(int eActivationMask);
-int exampleProfilerStop(void);
+__attribute__((visibility("default"))) int exampleProfilerStart(int eActivationMask, const char* name);
+__attribute__((visibility("default"))) int exampleProfilerStop(void);
+

 #endif
@@ -5,15 +5,59 @@
 ************************************************************************/

 #include <stdio.h>
+#include "err.h"
 #include "profiler.h"
 #include "event.h"
 #include "print_event.h"
+#include <cuda_runtime.h>

 #define __hidden __attribute__ ((visibility("hidden")))

 // FIXME: chrome tracing asynchronous events (following used) allow event nesting for events that have same id and category
 // It appears that nesting more than three events causes issues. Therefore, every event is given an increasing id and a
-// category that matches the type of event (GROUP, COLL, P2P, PROXY, NET)
+// category that matches the type of event (GROUP API, COLL API, P2P API, GROUP, COLL, P2P, PROXY, NET)
+static __thread int groupApiId;
+// Writes the chrome-tracing begin ("b") record for a Group API event.
+// The record id is the current thread-local groupApiId; it is not advanced here.
+__hidden void printGroupApiEventHeader(FILE* fh, struct groupApi* event) {
+  const int traceId = groupApiId;
+  const int pid = getpid();
+  fprintf(fh,
+          "{\"name\": \"%s\", \"cat\": \"GROUP_API\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"groupApiId\": %d, \"groupDepth\":%d}},\n",
+          "Group API", traceId, pid, 1, event->startTs, event->groupApiId, event->groupDepth);
+}
+
+// Writes the chrome-tracing end ("e") record for a Group API event and then
+// advances the thread-local groupApiId so the next event gets a fresh id.
+__hidden void printGroupApiEventTrailer(FILE* fh, struct groupApi* event) {
+  const int traceId = groupApiId++;
+  const int pid = getpid();
+  fprintf(fh,
+          "{\"name\": \"%s\", \"cat\": \"GROUP_API\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+          "Group API", traceId, pid, 1, event->stopTs);
+}
+
+static __thread int p2pApiId;
+// Writes the chrome-tracing begin ("b") record for a P2P API event.
+// The datatype string and the stream pointer are emitted as quoted JSON strings:
+// a bare %s or %p (e.g. 0x7f...) is not a valid JSON value and makes the trace unparsable.
+// The record id is the current thread-local p2pApiId; the trailer advances it.
+__hidden void printP2pApiEventHeader(FILE* fh, struct p2pApi* event) {
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P_API\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"count\": %lu, \"datatype\": \"%s\", \"GraphCaptured\":%d, \"Stream\": \"%p\"}},\n",
+      event->func, p2pApiId, getpid(), 1, event->startTs, event->count, event->datatype, event->graphCaptured, event->stream);
+}
+
+// Writes the chrome-tracing end ("e") record for a P2P API event and then
+// advances the thread-local p2pApiId.
+__hidden void printP2pApiEventTrailer(FILE* fh, struct p2pApi* event) {
+  const int traceId = p2pApiId++;
+  const int pid = getpid();
+  fprintf(fh,
+          "{\"name\": \"%s\", \"cat\": \"P2P_API\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+          event->func, traceId, pid, 1, event->stopTs);
+}
+
+static __thread int collApiId;
+// Writes the chrome-tracing begin ("b") record for a collective API event.
+// The datatype string and the stream pointer are emitted as quoted JSON strings:
+// a bare %s or %p (e.g. 0x7f...) is not a valid JSON value and makes the trace unparsable.
+// The record id is the current thread-local collApiId; the trailer advances it.
+__hidden void printCollApiEventHeader(FILE* fh, struct collApi* event) {
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL_API\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"count\": %lu, \"datatype\": \"%s\", \"root\": %d, \"GraphCaptured\":%d, \"Stream\": \"%p\"}},\n",
+      event->func, collApiId, getpid(), 1, event->startTs, event->count, event->datatype, event->root, event->graphCaptured, event->stream);
+}
+
+// Writes the chrome-tracing end ("e") record for a collective API event and
+// then advances the thread-local collApiId.
+__hidden void printCollApiEventTrailer(FILE* fh, struct collApi* event) {
+  const int traceId = collApiId++;
+  const int pid = getpid();
+  fprintf(fh,
+          "{\"name\": \"%s\", \"cat\": \"COLL_API\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+          event->func, traceId, pid, 1, event->stopTs);
+}
+
+static __thread int kernelLaunchId;
+// Writes the chrome-tracing begin ("b") record for a kernel launch event.
+// The stream pointer is emitted as a quoted JSON string because a bare %p
+// (e.g. 0x7f...) is not a valid JSON value.
+// NOTE(review): the "groupId" argument is filled from event->kernelLaunchId — confirm the key name is intended.
+__hidden void printKernelLaunchEventHeader(FILE* fh, struct kernelLaunch* event) {
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"KERNEL_LAUNCH\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"groupId\": %d, \"Stream\": \"%p\"}},\n", "KernelLaunch", kernelLaunchId, getpid(), 1, event->startTs, event->kernelLaunchId, event->stream);
+}
+
+// Writes the chrome-tracing end ("e") record for a kernel launch event and
+// then advances the thread-local kernelLaunchId.
+__hidden void printKernelLaunchEventTrailer(FILE* fh, struct kernelLaunch* event) {
+  const int traceId = kernelLaunchId++;
+  const int pid = getpid();
+  fprintf(fh,
+          "{\"name\": \"%s\", \"cat\": \"KERNEL_LAUNCH\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+          "KernelLaunch", traceId, pid, 1, event->stopTs);
+}
+
 static __thread int groupId;
 __hidden void printGroupEventHeader(FILE* fh, struct group* event) {
  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"groupId\": %d}},\n",
@@ -28,7 +72,7 @@ __hidden void printGroupEventTrailer(FILE* fh, struct group* event) {
 static __thread int collId;
+// Writes the chrome-tracing begin ("b") record for a collective task event (category COLL).
+// The communicator hash is read through the event's parent, cast to struct collApi*.
 __hidden void printCollEventHeader(FILE* fh, struct collective* event) {
  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nChannels\": %d}},\n",
-          event->base.func, collId, getpid(), 1, event->base.startTs, event->seqNumber, event->base.parent->ctx->commHash, event->base.rank, event->count, event->datatype, event->algo, event->proto, event->nChannels);
+          event->base.func, collId, getpid(), 1, event->base.startTs, event->seqNumber, ((struct collApi*)event->base.parent)->ctx->commHash, event->base.rank, event->count, event->datatype, event->algo, event->proto, event->nChannels);
 }

 __hidden void printCollEventTrailer(FILE* fh, struct collective* event) {
@@ -39,7 +83,7 @@ __hidden void printCollEventTrailer(FILE* fh, struct collective* event) {
 static __thread int p2pId;
+// Writes the chrome-tracing begin ("b") record for a point-to-point task event (category P2P).
+// The communicator hash is read through the event's parent, cast to struct p2pApi*.
 __hidden void printP2pEventHeader(FILE* fh, struct p2p* event) {
  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"nChannels\": %d}},\n",
-          event->base.func, p2pId, getpid(), 1, event->base.startTs, event->base.parent->ctx->commHash, event->base.rank, event->peer, event->count, event->datatype, event->nChannels);
+          event->base.func, p2pId, getpid(), 1, event->base.startTs, ((struct p2pApi*)event->base.parent)->ctx->commHash, event->base.rank, event->peer, event->count, event->datatype, event->nChannels);
 }

 __hidden void printP2pEventTrailer(FILE* fh, struct p2p* event) {
@@ -173,7 +217,7 @@ void debugEvent(void* eHandle, const char* tag) {
  char filename[64] = { 0 };
  sprintf(filename, "EventDebug-%d", getpid());
  FILE* fh = fopen(filename, "a+");
-  uint8_t type = *(uint8_t *)eHandle;
+  uint64_t type = *(uint64_t *)eHandle;
  if (type == ncclProfileGroup) {
    struct group* event = (struct group *)eHandle;
    fprintf(fh, "Group event %p tag = %s {\n", event, tag);
@@ -241,8 +285,51 @@ void debugEvent(void* eHandle, const char* tag) {

 void printEvent(FILE* fh, void* handle) {
  if (handle == NULL || fh == NULL) return;
-  uint8_t type = *(uint8_t *)handle;
-  if (type == ncclProfileGroup) {
+  uint64_t type = *(uint64_t *)handle;
+  if (type == ncclProfileGroupApi) {
+    struct groupApi* g = (struct groupApi*) handle;
+    printGroupApiEventHeader(fh, g);
+    struct kernelLaunch* kernelLaunchHead = profilerQueueHead(&g->kernelLaunchEvents);
+    while (kernelLaunchHead != NULL) {
+      printEvent(fh, kernelLaunchHead);
+      kernelLaunchHead = kernelLaunchHead->next;
+    }
+    struct collApi* collApiHead = profilerQueueHead(&g->collApiEvents);
+    while (collApiHead != NULL) {
+      printEvent(fh, collApiHead);
+      collApiHead = collApiHead->next;
+    }
+    struct p2pApi* p2pApiHead = profilerQueueHead(&g->p2pApiEvents);
+    while (p2pApiHead != NULL) {
+      printEvent(fh, p2pApiHead);
+      p2pApiHead = p2pApiHead->next;
+    }
+    printGroupApiEventTrailer(fh, g);
+  } else if (type == ncclProfileCollApi) {
+    struct collApi* collApiEvent = (struct collApi *) handle;
+    printCollApiEventHeader(fh, collApiEvent);
+    struct taskEventBase* base = taskEventQueueHead(collApiEvent);
+    while (base) {
+      struct taskEventBase* next = base->next;
+      printEvent(fh, base);
+      base = next;
+    }
+    printCollApiEventTrailer(fh, collApiEvent);
+  } else if (type == ncclProfileP2pApi) {
+    struct p2pApi* p2pApiEvent = (struct p2pApi *) handle;
+    printP2pApiEventHeader(fh, p2pApiEvent);
+    struct taskEventBase* base = taskEventQueueHead(p2pApiEvent);
+    while (base) {
+      struct taskEventBase* next = base->next;
+      printEvent(fh, base);
+      base = next;
+    }
+    printP2pApiEventTrailer(fh, p2pApiEvent);
+  } else if (type == ncclProfileKernelLaunch) {
+    struct kernelLaunch* kernelLaunchEvent = (struct kernelLaunch *) handle;
+    printKernelLaunchEventHeader(fh, kernelLaunchEvent);
+    printKernelLaunchEventTrailer(fh, kernelLaunchEvent);
+  } else if (type == ncclProfileGroup) {
    struct group* g = (struct group *)handle;
    printGroupEventHeader(fh, g);
    struct taskEventBase* base = taskEventQueueHead(g);
@@ -0,0 +1,50 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef QUEUE_H
+#define QUEUE_H
+
+// Intrusive singly linked queue. `next` is the pointer-to-member of T that
+// links one element to the following one.
+template<typename T, T *T::*next>
+struct profilerQueue {
+  T *head;  // first element
+  T *tail;  // last element
+};
+
+// Resets the queue to the empty state (both ends null).
+template<typename T, T *T::*next>
+inline void profilerQueueConstruct(profilerQueue<T,next> *me) {
+  me->tail = nullptr;
+  me->head = nullptr;
+}
+
+// Returns true when the queue holds no elements, i.e. its head is null.
+template<typename T, T *T::*next>
+inline bool profilerQueueEmpty(profilerQueue<T,next> *me) {
+  const bool hasHead = (me->head != nullptr);
+  return !hasHead;
+}
+
+// Returns the first element without removing it (null when there is none).
+template<typename T, T *T::*next>
+inline T* profilerQueueHead(profilerQueue<T,next> *me) {
+  T *first = me->head;
+  return first;
+}
+
+// Returns the last element without removing it.
+template<typename T, T *T::*next>
+inline T* profilerQueueTail(profilerQueue<T,next> *me) {
+  T *last = me->tail;
+  return last;
+}
+
+// Appends x at the tail. x's link is cleared first; x becomes the head as
+// well when the queue was empty.
+template<typename T, T *T::*next>
+inline void profilerQueueEnqueue(profilerQueue<T,next> *me, T *x) {
+  x->*next = nullptr;
+  if (me->head == nullptr) {
+    me->head = x;
+  } else {
+    me->tail->*next = x;
+  }
+  me->tail = x;
+}
+
+// Removes and returns the element at the head of the queue.
+// Returns nullptr when the queue is empty rather than dereferencing a null head.
+// The tail is cleared when the last element is removed.
+template<typename T, T *T::*next>
+inline T* profilerQueueDequeue(profilerQueue<T,next> *me) {
+  T *ans = me->head;
+  if (ans == nullptr) return nullptr;
+  me->head = ans->*next;
+  if (me->head == nullptr) me->tail = nullptr;
+  return ans;
+}
+
+#endif
@@ -0,0 +1,22 @@
+# None of these targets produce a file of the same name, so all are phony.
+.PHONY: all build-CoMMA clone-CoMMA clean delete
+
+all: build-CoMMA
+
+# Build CoMMA with cargo, cloning it first if it is not present.
+build-CoMMA: clone-CoMMA
+	cd CoMMA && cargo build
+
+# Clone CoMMA once and symlink the parent directory into its third_party tree.
+# $(CURDIR) is set by make itself, so the link target stays correct under
+# `make -C <dir>`, where the inherited $(PWD) still names the caller's directory.
+clone-CoMMA:
+	@if [ ! -d CoMMA ] ; then \
+		git clone https://github.com/google/CoMMA.git; \
+		ln -s $(CURDIR)/.. CoMMA/third_party/nccl/ext-profiler; \
+	fi
+
+# Remove cargo build artifacts but keep the checkout.
+clean:
+	@if [ -d CoMMA ] ; then \
+		cd CoMMA && cargo clean; \
+	fi
+
+# Remove the CoMMA checkout entirely.
+delete:
+	@if [ -d CoMMA ] ; then \
+		rm -rf CoMMA; \
+	fi
@@ -0,0 +1,62 @@
+#
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+# Variables
+# NCCL_HOME and CUDA_HOME are assigned with ?= so that a value from the
+# environment or the make command line takes precedence over the default.
+NCCL_HOME ?= ../../build
+CUDA_HOME ?= /usr/local/cuda
+INC := -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
+PLUGIN_SO := libnccl-profiler-inspector.so
+VERSION_FILE := version.cc
+
+# Compiler and flags
+# CXXFLAGS carries link options (-shared) as well as compile options because
+# the $(PLUGIN_SO) rule compiles and links in a single $(CXX) invocation.
+CXX := g++
+CXXFLAGS := -g -O3 -fPIC -shared -march=native -DNDEBUG -Wall -Wextra
+
+# DEBUG=1: add extra debug information and keep frame pointers.
+ifeq ($(DEBUG), 1)
+CXXFLAGS += -g2 -ggdb3 -rdynamic -funwind-tables -fno-omit-frame-pointer
+endif
+
+# ASAN=1: build with AddressSanitizer.
+# NOTE(review): LDFLAGS and NVLDFLAGS are appended to here and below but are
+# not referenced by the $(PLUGIN_SO) rule in this Makefile — confirm whether they are needed.
+ifeq ($(ASAN), 1)
+CXXFLAGS += -fsanitize=address
+LDFLAGS += -fsanitize=address -static-libasan
+NVLDFLAGS += -Xcompiler -fsanitize=address,-static-libasan
+endif
+
+# UBSAN=1: build with UndefinedBehaviorSanitizer.
+ifeq ($(UBSAN), 1)
+CXXFLAGS += -fsanitize=undefined
+LDFLAGS += -fsanitize=undefined -static-libubsan
+NVLDFLAGS += -Xcompiler -fsanitize=undefined,-static-libubsan
+endif
+
+# Source files
+SOURCES := inspector_plugin.cc inspector.cc json.cc
+
+# Default target
+all: $(PLUGIN_SO)
+
+# Rule to build the plugin
+# $^ expands to $(VERSION_FILE) followed by every file in SOURCES; they are
+# compiled and linked into the shared object in one step, and the soname is
+# set to the library file name.
+$(PLUGIN_SO): $(VERSION_FILE) $(SOURCES)
+	@echo "Compiling to create $@ from $^"
+	$(CXX) $(INC) $(CXXFLAGS) -o $@ -Wl,-soname,$(PLUGIN_SO) $^
+
+# Rule to generate version.cc
+# The content is written to a temporary file and only moved into place when it
+# differs from the existing file, so an unchanged version keeps its timestamp.
+# `cmp -s` compares silently: no "differ" chatter and no error message when
+# $(VERSION_FILE) does not exist yet (the first build).
+# Note: the rule has no prerequisites, so make runs it only when
+# $(VERSION_FILE) is missing.
+$(VERSION_FILE):
+	@GIT_INFO=$$(./utils/extract_git_version.sh); \
+	echo '#include "version.h"' > $(VERSION_FILE).tmp; \
+	echo 'const char* get_git_version_info() { return "'$$GIT_INFO'"; }' >> $(VERSION_FILE).tmp; \
+	if ! cmp -s $(VERSION_FILE).tmp $(VERSION_FILE); then \
+		echo "updating ${VERSION_FILE} file -> $$GIT_INFO"; \
+		mv $(VERSION_FILE).tmp $(VERSION_FILE); \
+	else \
+		echo "${VERSION_FILE} up to date -> $$GIT_INFO"; \
+		rm $(VERSION_FILE).tmp; \
+	fi
+
+# Clean target
+clean:
+	rm -f $(VERSION_FILE) $(PLUGIN_SO)
+
+# Phony targets
+.PHONY: all clean
@@ -0,0 +1,216 @@
+# NCCL Inspector Plugin
+
+The NCCL Inspector is a plugin for the NVIDIA Collective Communications Library (NCCL) that provides detailed, per-communicator, per-collective performance and metadata logging. It is designed to help users analyze and debug NCCL collective operations by generating structured JSON output for each operation.
+
+## Related Documentation
+
+- **[Performance Exporter](exporter/example/README.md)** - Tool for analyzing and visualizing NCCL performance data from inspector logs
+
+## Folder Location
+
+The Inspector plugin source is located in:
+
+```
+ext-profiler/inspector/
+```
+
+## Building the Inspector Plugin
+
+To build the Inspector plugin, run:
+
+```bash
+make
+```
+
+The Makefile looks for NCCL headers under `NCCL_HOME` (default: `../../build`) and CUDA headers under `CUDA_HOME`. To use different locations, pass them as make arguments, for example `make NCCL_HOME=/path/to/nccl/build CUDA_HOME=/usr/local/cuda` (`CUDA_HOME` may also be set in the environment).
+
+### Build Options
+
+The Makefile supports several build options:
+
+- **DEBUG=1**: Enable debug build with additional debugging information
+- **ASAN=1**: Enable Address Sanitizer for memory error detection
+- **UBSAN=1**: Enable Undefined Behavior Sanitizer
+
+Example debug build:
+```bash
+make DEBUG=1
+```
+
+### Build Output
+
+The build process creates:
+- `libnccl-profiler-inspector.so`: The main inspector plugin library
+- `version.cc`: Auto-generated version information from git
+
+## Using NCCL Inspector
+
+### Key Differences from Normal NCCL Usage
+
+The main difference between running NCCL with the Inspector plugin versus running NCCL normally is the addition of environment variables that enable detailed performance logging:
+
+**Normal NCCL Run:**
+```bash
+# Standard NCCL execution
+./your_nccl_application
+```
+
+**NCCL Inspector Run:**
+```bash
+# NCCL Inspector enabled execution
+export NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so
+export NCCL_INSPECTOR_ENABLE=1
+export NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=500
+./your_nccl_application
+```
+
+### Required Environment Variables
+
+- `NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so`
+  Loads the Inspector plugin into NCCL.
+- `NCCL_INSPECTOR_ENABLE=1`
+  Enables the Inspector plugin.
+- `NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=<interval>`
+  Sets the interval (in microseconds) for the internal dump thread to write output. Example: `500`.
+- `NCCL_INSPECTOR_DUMP_DIR=<output_dir>` (optional)
+  Sets the output directory for logs. If not set, defaults to `nccl-inspector-unknown-jobid` or `nccl-inspector-<slurm_job_id>` if running under SLURM.
+- `NCCL_INSPECTOR_DUMP_VERBOSE=<0|1>` (optional)
+  Enables verbose output including event trace information. Set to `1` to enable, `0` to disable (default).
+
+### Example Usage
+
+**Single Node:**
+```bash
+export NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so
+export NCCL_INSPECTOR_ENABLE=1
+export NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=500
+./build/test/perf/all_reduce_perf -b 8 -e 16G -f 2 -g 8
+```
+
+**Multi-Node (SLURM):**
+```bash
+# Add these environment variables to your SLURM script
+export NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so
+export NCCL_INSPECTOR_ENABLE=1
+export NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=500
+export NCCL_INSPECTOR_DUMP_DIR=/path/to/logs/${SLURM_JOB_ID}/
+
+# Then run your normal NCCL application
+srun your_nccl_application
+```
+
+## Example Scripts
+
+For detailed example scripts showing how to integrate NCCL Inspector with different workloads, see the **[test/examples/](test/examples/)** directory:
+
+- **Single Node Example**: Basic NCCL performance testing with inspector
+- **Multi-Node SLURM Example**: Comprehensive multi-node testing with various collective operations
+- **Training Workload Example**: Integration with distributed training workloads
+
+## Output Example
+
+Each output file contains JSON objects with the following structure:
+
+```json
+{
+  "header": {
+    "id": "0x7f8c496ae9f661",
+    "rank": 2,
+    "n_ranks": 8,
+    "nnodes": 1
+  },
+  "metadata": {
+    "inspector_output_format_version": "v4.0",
+    "git_rev": "",
+    "rec_mechanism": "profiler_plugin",
+    "dump_timestamp_us": 1748030377748202,
+    "hostname": "example-hostname",
+    "pid": 1639453
+  },
+  "coll_perf": {
+    "coll": "AllReduce",
+    "coll_sn": 1407,
+    "coll_msg_size_bytes": 17179869184,
+    "coll_exec_time_us": 61974,
+    "coll_algobw_gbs": 277.210914,
+    "coll_busbw_gbs": 485.119099
+  }
+}
+```
+
+## Output Example Verbose
+
+To enable verbose output with event trace information, set the `NCCL_INSPECTOR_DUMP_VERBOSE=1` environment variable:
+
+```bash
+export NCCL_INSPECTOR_DUMP_VERBOSE=1
+```
+
+This will include additional event trace information in the JSON output, showing the sequence of callbacks and timestamps for each individual event.
+
+```json
+{
+  "header": {
+    "id": "0xe62dedaa97644a",
+    "rank": 4,
+    "n_ranks": 8,
+    "nnodes": 1
+  },
+  "metadata": {
+    "inspector_output_format_version": "v4.0",
+    "git_rev": "9019a1912-dirty",
+    "rec_mechanism": "nccl_profiler_interface",
+    "dump_timestamp_us": 1752867229276385,
+    "hostname": "example-hostname",
+    "pid": 438776
+  },
+  "coll_perf": {
+    "coll": "ReduceScatter",
+    "coll_sn": 1231,
+    "coll_msg_size_bytes": 2147483648,
+    "coll_exec_time_us": 41057,
+    "coll_timing_source": "kernel_gpu",
+    "coll_algobw_gbs": 418.439467,
+    "coll_busbw_gbs": 366.134533,
+    "event_trace_sn": {
+      "coll_start_sn": 1,
+      "coll_stop_sn": 2,
+      "kernel_events": [
+        {
+          "channel_id": 0,
+          "kernel_start_sn": 3,
+          "kernel_stop_sn": 48,
+          "kernel_record_sn": 47
+        }
+      ]
+    },
+    "event_trace_ts": {
+      "coll_start_ts": 1752867229235059,
+      "coll_stop_ts": 1752867229235064,
+      "kernel_events": [
+        {
+          "channel_id": 0,
+          "kernel_start_ts": 1752867229235181,
+          "kernel_stop_ts": 1752867229275811,
+          "kernel_record_ts": 1752867229275811
+        }
+      ]
+    }
+  }
+}
+```
+
+Multiple such JSON objects are written, one per collective operation per communicator.
+
+## Output Directory
+
+- By default, output files are written to:
+  - `nccl-inspector-unknown-jobid` (if no SLURM job ID is present)
+  - `nccl-inspector-<slurm_job_id>` (if running under SLURM)
+- You can override this with the `NCCL_INSPECTOR_DUMP_DIR` environment variable.
+
+## Additional Notes
+
+- The plugin is compatible with standard NCCL workflows and can be used in both single-node and multi-node (SLURM) environments.
+- For more details, see the source code and comments in `ext-profiler/inspector/`.
+
@@ -0,0 +1,151 @@
+# NCCL Inspector Performance Summary Exporter
+
+This tool processes NCCL Inspector log files and generates comprehensive performance analysis reports including visualizations and statistical summaries.
+Similar exporters can be built to integrate with observability systems such as Elastic, Prometheus, or other custom metrics systems.
+
+## Features
+
+- **Performance Analysis**: Generates statistical summaries for collective operations
+- **Communication Type Classification**: Automatically categorizes communication patterns
+- **Visualizations**: Creates scatter plots, histograms, and box plots for performance metrics
+- **Data Export**: Converts logs to Parquet format for efficient processing
+- **Multi-format Log Support**: Processes `.log`, `.log.gz`, `.jsonl`, and `.jsonl.gz` files
+- **Parallel Processing**: Utilizes multi-core processing for faster analysis
+
+## Requirements
+
+- Python 3.7+
+- Access to NCCL Inspector log files
+
+## Installation
+
+### Clone the Repository
+
+```bash
+git clone https://github.com/NVIDIA/nccl.git
+cd nccl/ext-profiler/inspector/exporter/example
+```
+
+Install the required dependencies using the provided `requirements.txt` file:
+
+```bash
+pip install -r requirements.txt
+```
+
+## Usage
+
+The script processes NCCL Inspector log files from a specified directory.
+
+**Note:** To generate NCCL Inspector log files, you need to run your NCCL application with the inspector plugin enabled. The log files will be output to a directory specified by the `NCCL_INSPECTOR_DUMP_DIR` environment variable. For detailed setup instructions and environment variable configuration, see the [Inspector README](../../README.md).
+
+### Basic Usage
+
+```bash
+python perf_summary_exporter.py --input_dir /path/to/nccl/inspector/logs
+```
+
+This mode processes all log files in the specified directory and its subdirectories recursively.
+
+### Command Line Arguments
+
+- `--input_dir <path>`: **Required**. Directory containing NCCL Inspector log files (searches recursively in subdirectories)
+- `--output_dir <name>`: **Optional**. Custom output directory name (default: `<input_directory_name>-analysis`)
+
+## Output
+
+The tool generates:
+
+1. **Parquet Files**: One per log file containing processed log data (stored in `parquet_files/` subdirectory)
+2. **Summary Directory**: Contains comprehensive analysis results
+3. **Visualizations**: Scatter plots, histograms, and box plots for each message size
+4. **CSV Files**: Detailed summaries for each message size and collective type
+5. **Log File**: Processing log with detailed information
+
+## Example Output Structure
+
+```
+<output_dir_name>/
+├── output.log
+├── parquet_files/
+│   ├── <filename1>.parquet
+│   ├── <filename2>.parquet
+│   └── ...
+└── summary/
+    ├── scatter_plot_<comm_type>_<coll_type>.png
+    ├── combined_scatter_plot_<comm_type>_<coll_type>.png
+    └── msg_size_<human_readable_size>/
+        ├── histograms/
+        │   └── histogram_<comm_type>_<coll_type>_<size>.png
+        ├── boxplots/
+        │   └── boxplot_<comm_type>_<coll_type>_<size>.png
+        └── summary_<comm_type>_<coll_type>_<size>.csv
+```
+
+## Supported Communicator Types
+
+- `single-rank`
+- `nvlink-only`
+- `hca-only`
+- `mixed`
+
+## Supported Collective Types
+
+- `AllReduce`
+- `AllGather`
+- `ReduceScatter`
+- `Broadcast`
+
+## Log File Formats
+
+### Supported Formats
+
+- `.log` - Plain text JSON lines
+- `.log.gz` - Compressed JSON lines
+- `.jsonl` - JSON lines format
+- `.jsonl.gz` - Compressed JSON lines
+
+### Expected JSON Structure
+
+```json
+{
+  "header": {
+    "id": "0x9e7a479f95a66c",
+    "rank": 31,
+    "n_ranks": 32,
+    "nnodes": 4
+  },
+  "metadata": {
+    "inspector_output_format_version": "v4.0",
+    "git_rev": "75e61acda-dirty",
+    "rec_mechanism": "nccl_profiler_interface",
+    "dump_timestamp_us": 1749490229087081,
+    "hostname": "example-hostname",
+    "pid": 468528
+  },
+  "coll_perf": {
+    "coll": "ReduceScatter",
+    "coll_sn": 129,
+    "coll_msg_size_bytes": 65536,
+    "coll_exec_time_us": 110,
+    "coll_timing_source": "kernel_gpu",
+    "coll_algobw_gbs": 19.065018,
+    "coll_busbw_gbs": 18.469236
+  }
+}
+```
+
+## Troubleshooting
+
+### Common Issues
+
+1. **No log files found**: Ensure the log directory path is correct and contains valid log files
+2. **Missing dependencies**: Ensure all requirements are installed in your virtual environment
+3. **Mixed file formats**: The tool will exit if it detects mixed `.log`, `.log.gz`, `.jsonl`, and `.jsonl.gz` files in the same directory. This is typically indicative of corrupt input directories caused by multiple overlapping NCCL Inspector runs with different output format options. Clean the directory and re-run with consistent settings.
+
+### Log Files
+
+The tool creates detailed logs in the output directory. Check `output.log` for processing information and any error messages.
+
+## Support
+
+Please refer to the GitHub Issues page at https://github.com/NVIDIA/nccl/issues. Your question may already have been asked by another user. If not, feel free to create a new issue and mention the "inspector plugin" in the title.
@@ -0,0 +1,548 @@
+from pathlib import Path
+import argparse
+import glob
+import gzip
+import sys
+import pandas as pd
+from concurrent.futures import ProcessPoolExecutor
+import json
+from tqdm.auto import tqdm
+import duckdb
+import math
+import matplotlib.pyplot as plt
+import matplotlib.dates
+from matplotlib.gridspec import GridSpec
+import os
+import logging
+import contextlib
+from datetime import datetime
+import numpy as np
+
+def setup_logging(output_dir):
+    log_file = output_dir / "output.log"
+    logging.basicConfig(
+        filename=log_file,
+        level=logging.INFO,
+        format="%(asctime)s - %(levelname)s - %(message)s",
+    )
+
+
+@contextlib.contextmanager
+def smart_open(filename, mode="r"):
+    if filename.endswith(".gz"):
+        opener = gzip.open
+    else:
+        opener = open
+
+    with opener(filename, mode) as f:
+        yield f
+
+
+def get_log_files_and_output_dir():
+    parser = argparse.ArgumentParser(description="Process log files in a directory.")
+    parser.add_argument(
+        "--input_dir",
+        type=str,
+        help="The directory containing NCCL Inspector log files to process.",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        help="Custom output directory name (default: auto-generated from input directory)."
+    )
+    args = parser.parse_args()
+
+    if args.input_dir:
+        # Use the provided input directory
+        root_dir = Path(args.input_dir)
+        if not root_dir.exists():
+            raise FileNotFoundError(f"Input directory not found: {root_dir}")
+
+    logfiles = list(glob.iglob(str(Path(root_dir) / "**" / "*.log"), recursive=True))
+    gzlogfiles = list(
+        glob.iglob(str(Path(root_dir) / "**" / "*.log.gz"), recursive=True)
+    )
+    jsonlfiles = list(
+        glob.iglob(str(Path(root_dir) / "**" / "*.jsonl"), recursive=True)
+    )
+    gzjsonlfiles = list(
+        glob.iglob(str(Path(root_dir) / "**" / "*.jsonl.gz"), recursive=True)
+    )
+    if (
+            sum((1 for x in [logfiles, gzlogfiles, jsonlfiles, gzjsonlfiles] if len(x) > 0))
+            > 1
+    ):
+        ### TODO: we could probably generate some logic to pick the "right" file to load, but for now, bail
+        logging.critical("Appear to have mixed .log/.log.gz/.jsonl/.jsonl.gz; bailing!")
+        sys.exit(1)
+
+    files = logfiles + gzlogfiles + jsonlfiles + gzjsonlfiles
+
+    if not files:
+        print("No inspector logs found")
+        sys.exit(1)
+
+    # Generate output directory name from input directory
+    if args.output_dir:
+        output_dir_name = args.output_dir
+    else:
+        output_dir_name = f"{root_dir.name}-analysis"
+
+    return files, output_dir_name
+
+def bytes_to_human_readable(size_bytes):
+    """
+    Convert bytes to human-readable format using decimal (SI) units.
+
+    Uses powers of 1000 (decimal/SI standard):
+    - 1 KB = 1,000 bytes
+    - 1 MB = 1,000,000 bytes
+    - 1 GB = 1,000,000,000 bytes
+
+    Not binary units (powers of 1024):
+    - Does NOT use KiB, MiB, GiB (1024-based)
+
+    Args:
+        size_bytes: Number of bytes to convert
+
+    Returns:
+        Human-readable string (e.g., "1.50MB", "2.34GB")
+    """
+    if size_bytes == 0:
+        return "0B"
+    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
+    i = int(math.log10(int(size_bytes)) / 3)
+    s = round(size_bytes * math.pow(10, -3 * i), 2)
+    return f"{s:.2f}{size_name[i]}"
+
+def timestamp_to_datetime(timestamp_us):
+    """Convert microsecond timestamp to datetime string"""
+    return datetime.fromtimestamp(timestamp_us / 1000000).strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
+
+def microseconds_to_human_readable(microseconds):
+    """Convert microseconds to human readable format"""
+    if microseconds < 1000:
+        return f"{microseconds:.1f}μs"
+    elif microseconds < 1000000:
+        return f"{microseconds/1000:.1f}ms"
+    else:
+        return f"{microseconds/1000000:.1f}s"
+
+def get_comm_type(row) -> str:
+    if row["n_ranks"] == 1:
+        return "single-rank"
+    elif row["nnodes"] == 1:
+        return "nvlink-only"
+    elif row["n_ranks"] == row["nnodes"]:
+        return "hca-only"
+    else:
+        return "mixed"
+
+def parse_file(filepath: Path, output_dir):
+    filename = Path(filepath).stem
+    parquet_file = output_dir / f"{filename}.parquet"
+
+    # Check if parquet file exists and is newer than source file
+    if parquet_file.exists():
+        source_mtime = Path(filepath).stat().st_mtime
+        parquet_mtime = parquet_file.stat().st_mtime
+        if parquet_mtime >= source_mtime:
+            logging.info(f"Parquet file {parquet_file} is up to date. Skipping...")
+            return
+        else:
+            logging.info(f"Source file {filepath} is newer than parquet. Regenerating...")
+
+    # Check if file is empty or too small
+    file_size = Path(filepath).stat().st_size
+    if file_size == 0:
+        logging.warning(f"Skipping empty file: {filepath}")
+        return
+
+    recs = []
+    try:
+        with smart_open(filepath, "r") as infile:
+            for lineno, line in enumerate(infile):
+                try:
+                    json_recs = json.loads(line)
+                except json.JSONDecodeError:
+                    logging.error(f"Failed to parse line {filepath}:{lineno}")
+                    continue
+
+                # Validate that required fields exist
+                if not all(key in json_recs for key in ["header", "metadata", "coll_perf"]):
+                    logging.error(f"Missing required fields in {filepath}:{lineno}")
+                    continue
+
+                header = json_recs["header"]
+                metadata = json_recs["metadata"]
+                comm_type = get_comm_type(header)
+                coll_perf = json_recs["coll_perf"]
+                recs.append(
+                    dict(
+                        **header,
+                        comm_type=comm_type,
+                        **coll_perf,
+                        **metadata,
+                    )
+                )
+    except Exception as e:
+        logging.error(f"Error reading file {filepath}: {e}")
+        return
+
+    # Skip files with no valid records
+    if not recs:
+        logging.warning(f"No valid records found in file: {filepath}. Skipping...")
+        return
+
+    df = pd.DataFrame(recs)
+    df.to_parquet(parquet_file)
+    logging.info(f"Created parquet file {parquet_file} with {len(recs)} records")
+
+def create_per_node_parquet_files(files, output_dir):
+    output_dir = Path(output_dir) / "parquet_files"
+    output_dir.mkdir(parents=True, exist_ok=True)
+    max_workers = min(64, len(files), os.cpu_count() or 1)
+    with ProcessPoolExecutor(max_workers=max_workers) as executor:
+        list(
+            tqdm(
+                executor.map(parse_file, files, [output_dir] * len(files)),
+                total=len(files),
+                desc="Processing files",
+                unit="file",
+            )
+        )
+    return output_dir
+
+def generate_scatter_plot(df, comm_type, coll_type, output_file):
+    plt.figure(figsize=(10, 6), dpi=100)
+    distinct_msg_sizes = df["coll_msg_size_bytes"].unique()
+
+    for msg_size in distinct_msg_sizes:
+        df_msg_size = df[df["coll_msg_size_bytes"] == msg_size]
+        mean_busbw = df_msg_size["mean_coll_busbw_gbs"].mean()
+        plt.scatter(
+            df_msg_size["coll_sn"],
+            df_msg_size["mean_coll_busbw_gbs"],
+            label=f"MsgSize: {bytes_to_human_readable(msg_size)} (Mean: {mean_busbw:.2f} GB/s)",
+            alpha=0.5,
+        )
+
+    plt.xlabel("Operation Sequence Number")
+    plt.ylabel("Mean Collective Bus BW (GB/s)")
+    plt.title(f"Comm Type: {comm_type}, Coll Type: {coll_type}")
+    plt.legend(title="Message Size", loc="upper right")
+    plt.tight_layout()
+    plt.savefig(output_file)
+    plt.close()
+    logging.info(f"Scatter plot saved to {output_file}")
+
+def generate_combined_scatter_plot(df, comm_type, coll_type, output_file, max_cols=3):
+    distinct_msg_sizes = df["coll_msg_size_bytes"].unique()
+    num_plots = len(distinct_msg_sizes)
+
+    # Compute number of rows and columns
+    num_cols = min(max_cols, num_plots)  # Limit max columns
+    num_rows = (num_plots + num_cols - 1) // num_cols  # Calculate rows dynamically
+
+    # Create figure with GridSpec
+    fig = plt.figure(figsize=(5 * num_cols, 5 * num_rows), dpi=100)
+    gs = GridSpec(num_rows, num_cols, figure=fig)
+
+    for i, msg_size in enumerate(distinct_msg_sizes):
+        row, col = divmod(i, num_cols)  # Determine row & column index
+        ax = fig.add_subplot(gs[row, col])  # Create subplot at position
+
+        df_msg_size = df[df["coll_msg_size_bytes"] == msg_size]
+        mean_busbw = df_msg_size["mean_coll_busbw_gbs"].mean()
+        ax.scatter(
+            df_msg_size["coll_sn"],
+            df_msg_size["mean_coll_busbw_gbs"],
+            label=f"MsgSize: {bytes_to_human_readable(msg_size)} (Mean: {mean_busbw:.2f} GB/s)",
+            alpha=0.5,
+        )
+        ax.set_xlabel("Op Seq No")
+        ax.set_ylabel("Mean Collective Bus BW (GB/s)")
+        ax.set_title(f"Message Size: {bytes_to_human_readable(msg_size)}({msg_size})")
+        ax.legend(loc="upper right")
+
+    fig.suptitle(f"Comm Type: {comm_type}, Coll Type: {coll_type}", ha="center", y=0.98)
+
+    plt.tight_layout()
+    plt.savefig(output_file)
+    plt.close()
+    logging.info(f"Combined scatter plot saved to {output_file}")
+
+def generate_histogram(df, comm_type, coll_type, output_file, message_size):
+    plt.figure(figsize=(10, 6), dpi=100)
+    data_range = df["mean_coll_busbw_gbs"].max() - df["mean_coll_busbw_gbs"].min()
+    num_bins = min(50, int(data_range) + 1)
+    plt.hist(
+        df["mean_coll_busbw_gbs"],
+        bins=num_bins,
+        alpha=0.7,
+        color="b",
+        edgecolor="black",
+        linewidth=1.2,
+    )
+    plt.xlabel("Mean Collective Bus BW (GB/s)")
+    plt.ylabel("Frequency")
+    plt.title(
+        f"Comm Type: {comm_type}, Coll Type: {coll_type} Mean Collective Bus BW Histogram\nMsg Size: {message_size}"
+    )
+    plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f"{y:.0f}"))
+    plt.gca().xaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f"{x:.2f} GB/s"))
+    plt.gca().xaxis.get_offset_text().set_visible(False)
+    plt.tight_layout()
+    plt.savefig(output_file)
+    plt.close()
+    logging.info(f"Histogram saved to {output_file}")
+
+def generate_boxplot(df, comm_type, coll_type, output_file, message_size):
+    plt.figure(figsize=(10, 6))
+    boxprops = dict(linestyle="-", linewidth=2, color="blue")
+    flierprops = dict(marker="o", color="red", alpha=0.5)
+    medianprops = dict(linestyle="-", linewidth=2.5, color="orange")
+    whiskerprops = dict(linestyle="--", linewidth=2, color="green")
+    capprops = dict(linestyle="-", linewidth=2, color="black")
+
+    plt.boxplot(
+        df["mean_coll_busbw_gbs"],
+        vert=False,
+        patch_artist=True,
+        boxprops=boxprops,
+        flierprops=flierprops,
+        medianprops=medianprops,
+        whiskerprops=whiskerprops,
+        capprops=capprops,
+    )
+
+    plt.xlabel("Mean Coll Bus BW (GB/s)")
+    plt.title(
+        f"Box Plot of Coll Bus BW (CommType: {comm_type} - Coll Type: {coll_type} - Msg Size: {message_size})"
+    )
+
+    # Adding labels for min, max, and median
+    stats = df["mean_coll_busbw_gbs"].describe(percentiles=[0.5])
+    plt.annotate(
+        f"Min: {stats['min']:.2f}",
+        xy=(stats["min"], 1),
+        xytext=(stats["min"], 1.1),
+        arrowprops=dict(facecolor="black", shrink=0.05),
+    )
+    plt.annotate(
+        f"Median: {stats['50%']:.2f}",
+        xy=(stats["50%"], 1),
+        xytext=(stats["50%"], 1.1),
+        arrowprops=dict(facecolor="black", shrink=0.05),
+    )
+    plt.annotate(
+        f"Max: {stats['max']:.2f}",
+        xy=(stats["max"], 1),
+        xytext=(stats["max"], 1.1),
+        arrowprops=dict(facecolor="black", shrink=0.05),
+    )
+
+    plt.tight_layout()
+    plt.savefig(output_file)
+    plt.close()
+    logging.info(f"Box plot saved to {output_file}")
+
+
+def summarize_data_per_comm_coll_type(output_root, comm_type, coll_type, output_dir_name):
+    """Summarize parquet data per communication and collective type using DuckDB"""
+    logging.info(f"Summarizing data per comm/coll type for {output_dir_name}, {comm_type} and {coll_type}")
+
+    # Check if there are any parquet files
+    parquet_dir = output_root / "parquet_files"
+    parquet_files = list(parquet_dir.glob("*.parquet"))
+    if not parquet_files:
+        logging.warning(f"No parquet files found for {comm_type} and {coll_type}")
+        return None
+
+    # Clean up invalid/empty parquet files by moving them to a separate directory
+    invalid_dir = parquet_dir / "invalid"
+    invalid_dir.mkdir(exist_ok=True)
+
+    invalid_count = 0
+    for pf in parquet_files:
+        try:
+            # Check file size first
+            if pf.stat().st_size == 0:
+                logging.warning(f"Moving zero-byte parquet file {pf} to invalid directory")
+                pf.rename(invalid_dir / pf.name)
+                invalid_count += 1
+                continue
+
+            # Use pyarrow to check parquet metadata without reading data
+            import pyarrow.parquet as pq
+            parquet_file = pq.ParquetFile(pf)
+            if parquet_file.metadata.num_rows == 0:
+                logging.warning(f"Moving empty parquet file {pf} (0 rows) to invalid directory")
+                pf.rename(invalid_dir / pf.name)
+                invalid_count += 1
+        except Exception as e:
+            logging.warning(f"Moving invalid parquet file {pf} to invalid directory: {e}")
+            pf.rename(invalid_dir / pf.name)
+            invalid_count += 1
+
+    # Check if any valid files remain
+    remaining_files = list(parquet_dir.glob("*.parquet"))
+    if not remaining_files:
+        logging.warning(f"No valid parquet files found for {comm_type} and {coll_type} (moved {invalid_count} invalid files)")
+        return None
+
+    logging.info(f"Found {len(remaining_files)} valid parquet files (moved {invalid_count} invalid files)")
+
+    try:
+        duckdb.execute(
+            f"CREATE OR REPLACE VIEW logs AS SELECT * FROM read_parquet('{parquet_dir}/*.parquet')"
+        )
+        df = duckdb.execute(f"""
+            SELECT
+                id,
+                coll_sn,
+                coll_msg_size_bytes,
+                AVG(coll_busbw_gbs) as mean_coll_busbw_gbs,
+                COUNT(*) as log_count,
+                ARRAY_DISTINCT(LIST(n_ranks)) as n_ranks,
+                ARRAY_DISTINCT(LIST(nnodes)) as nnodes,
+                MIN(dump_timestamp_us) as coll_start_timestamp_us,
+                MAX(dump_timestamp_us) as coll_end_timestamp_us,
+                (MAX(dump_timestamp_us) - MIN(dump_timestamp_us)) as coll_duration_us
+            FROM logs
+            WHERE coll = '{coll_type}' and comm_type = '{comm_type}'
+            GROUP BY id, coll_sn, coll_msg_size_bytes
+            ORDER BY coll_sn
+        """).df()
+    except Exception as e:
+        logging.error(f"Error executing DuckDB query for {comm_type} and {coll_type}: {e}")
+        return None
+
+    if df.empty:
+        logging.info(f"No data for {comm_type} and {coll_type}")
+        return None
+
+    # Add human-readable formatting
+    df["human_readable_coll_msg_size_bytes"] = df["coll_msg_size_bytes"].apply(
+        bytes_to_human_readable
+    )
+
+    # Log example of time range data for first few rows
+    if len(df) > 0:
+        sample_row = df.iloc[0]
+        start_time = timestamp_to_datetime(sample_row['coll_start_timestamp_us'])
+        end_time = timestamp_to_datetime(sample_row['coll_end_timestamp_us'])
+        duration = microseconds_to_human_readable(sample_row['coll_duration_us'])
+        logging.info(f"Example time range - ID: {sample_row['id']}, Coll_SN: {sample_row['coll_sn']}, "
+                     f"Start: {start_time}, End: {end_time}, Duration: {duration}")
+
+    return df
+
+
+def generate_visualizations(df, output_root, comm_type, coll_type):
+    """Generate all visualizations and save CSV files for the processed data"""
+    logging.info(f"Generating visualizations for {comm_type} and {coll_type}")
+
+    summary_dir = output_root / "summary"
+    summary_dir.mkdir(parents=True, exist_ok=True)
+
+    # Scatter Plot for all message sizes
+    output_file = summary_dir / f"scatter_plot_{comm_type}_{coll_type}.png"
+    generate_scatter_plot(df, comm_type, coll_type, output_file)
+
+    # Combined Scatter Plot for all message sizes
+    output_file = summary_dir / f"combined_scatter_plot_{comm_type}_{coll_type}.png"
+    generate_combined_scatter_plot(df, comm_type, coll_type, output_file)
+
+    distinct_msg_sizes = df["coll_msg_size_bytes"].unique()
+    for msg_size in distinct_msg_sizes:
+        hr_msg_size = bytes_to_human_readable(msg_size)
+        msg_size_dir = summary_dir / f"msg_size_{msg_size}_{hr_msg_size}"
+        msg_size_hist_dir = msg_size_dir / "histograms"
+        msg_size_boxplot_dir = msg_size_dir / "boxplots"
+        msg_size_dir.mkdir(parents=True, exist_ok=True)
+        msg_size_hist_dir.mkdir(parents=True, exist_ok=True)
+        msg_size_boxplot_dir.mkdir(parents=True, exist_ok=True)
+
+        df_msg_size = df[df["coll_msg_size_bytes"] == msg_size]
+
+        # Add human-readable time formatting
+        df_msg_size = df_msg_size.copy()
+        df_msg_size["coll_start_datetime"] = df_msg_size["coll_start_timestamp_us"].apply(timestamp_to_datetime)
+        df_msg_size["coll_end_datetime"] = df_msg_size["coll_end_timestamp_us"].apply(timestamp_to_datetime)
+        df_msg_size["coll_duration_human"] = df_msg_size["coll_duration_us"].apply(microseconds_to_human_readable)
+
+        # Histogram
+        output_file = (
+            msg_size_hist_dir / f"histogram_{comm_type}_{coll_type}_{msg_size}.png"
+        )
+        generate_histogram(
+            df_msg_size,
+            comm_type,
+            coll_type,
+            output_file,
+            bytes_to_human_readable(msg_size),
+        )
+
+        # Box Plot
+        output_file = (
+            msg_size_boxplot_dir / f"boxplot_{comm_type}_{coll_type}_{msg_size}.png"
+        )
+        generate_boxplot(
+            df_msg_size,
+            comm_type,
+            coll_type,
+            output_file,
+            bytes_to_human_readable(msg_size),
+        )
+
+        output_file = msg_size_dir / f"summary_{comm_type}_{coll_type}_{msg_size}.csv"
+        df_msg_size.to_csv(output_file, index=False)
+        logging.info(
+            f"Summary for {comm_type}, {coll_type}, and msg_size {msg_size} written to {output_file}"
+        )
+
+
+def generate_summary(output_root, comm_type, coll_type, output_dir_name):
+    """Generate summary by summarizing data per comm/coll type and creating visualizations"""
+    logging.info(f"Generating summary for {output_dir_name}, {comm_type} and {coll_type}")
+
+    # Step 1: Summarize data per communication and collective type
+    df = summarize_data_per_comm_coll_type(output_root, comm_type, coll_type, output_dir_name)
+
+    # Step 2: Generate visualizations if data exists
+    if df is not None:
+        generate_visualizations(df, output_root, comm_type, coll_type)
+    else:
+        logging.warning(f"No data found for {comm_type} and {coll_type} - skipping visualization generation")
+
+
+def generate_summary_wrapper(args):
+    return generate_summary(*args)
+
+
+if __name__ == "__main__":
+    files, output_dir_name = get_log_files_and_output_dir()
+    print(f"Number of log files found: {len(files)}")
+    print(f"Output directory: {output_dir_name}")
+    output_dir = Path(output_dir_name)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    setup_logging(output_dir)
+    create_per_node_parquet_files(files, output_dir)
+    comm_types = ["single-rank", "nvlink-only", "hca-only", "mixed"]
+    coll_types = ["AllReduce", "AllGather", "ReduceScatter", "Broadcast"]
+    summary_args = [
+        (output_dir, comm_type, coll_type, output_dir_name)
+        for comm_type in comm_types
+        for coll_type in coll_types
+    ]
+    max_workers = min(64, len(summary_args), os.cpu_count() or 1)
+    with ProcessPoolExecutor(max_workers=max_workers) as executor:
+        list(
+            tqdm(
+                executor.map(generate_summary_wrapper, summary_args),
+                total=len(summary_args),
+                desc="Generating summaries",
+            )
+        )
+        print("Done!")
@@ -0,0 +1,6 @@
+pandas>=1.3.0
+tqdm>=4.60.0
+duckdb>=0.8.0
+matplotlib>=3.3.0
+pyarrow>=5.0.0
+numpy>=1.21.0
@@ -0,0 +1,198 @@
+#pragma once
+
+#include <pthread.h>
+
+#include "json.h"
+#include "common.h"
+#include "version.h"
+
+// Number of per-channel slots in the kernelCh arrays declared in this header.
+#define MAX_CHANNELS                     64
+
+// Evaluates `call`, stores its result in `res`, and if that result is not
+// inspectorSuccess logs the file/line and error string, then jumps to `label`.
+// NOTE(review): the ';' after `while (0)` means `INS_CHK_GOTO(...);` expands to
+// two statements, so the macro cannot be the sole body of an unbraced
+// if/else. Dropping it is only safe once every call site ends with ';'.
+#define INS_CHK_GOTO(call, res, label)                                  \
+  do {                                                                  \
+    res = call;                                                         \
+    if (inspectorSuccess != res) {                                      \
+      INFO(NCCL_INSPECTOR, "%s:%d -> error %d: %s", __FILE__, __LINE__, res, \
+           inspectorErrorString(res));                                  \
+      goto label;                                                       \
+    }                                                                   \
+  } while (0);
+
+
+// Identifiers for collective and point-to-point functions, with explicit
+// numeric values. ncclNumFuncs equals the number of entries before it.
+// NOTE(review): presumably mirrors NCCL's internal ncclFunc_t — confirm the
+// values stay in sync.
+typedef enum {
+  ncclFuncBroadcast = 0,
+  ncclFuncReduce = 1,
+  ncclFuncAllGather = 2,
+  ncclFuncReduceScatter = 3,
+  ncclFuncAllReduce = 4,
+  ncclFuncSendRecv = 5,
+  ncclFuncSend = 6,
+  ncclFuncRecv = 7,
+  ncclNumFuncs = 8
+} ncclFunc_t;
+
+// Status codes returned by the inspector functions declared in this header.
+// inspectorSuccess (0) is the only value INS_CHK_GOTO does not treat as a
+// failure. Enumerators after it take consecutive values.
+typedef enum {
+  inspectorSuccess = 0,
+  inspectorUninitializedError,
+  inspectorMemoryError,
+  inspectorFileOpenError,
+  inspectorDisabledError,
+  inspectorLockError,
+  inspectorPthreadError,
+  inspectorJsonError,
+  inspectorCudaError,
+  inspectorBadHash,
+  inspectorDeleteUnknownCommError,
+  inspectorAddDuplicateCommError,
+  inspectorNop,
+  inspectorNullTally,
+  inspectorGlobalInitError,
+  inspectorReturn,
+} inspectorResult_t;
+
+// Type of inspectorCompletedCollInfo::timingSource.
+// NOTE(review): presumably records which clock produced execTimeUsecs
+// (GPU kernel, CPU-side kernel, or CPU-side collective timing) — confirm.
+typedef enum {
+  inspectorTimingSourceKernelGpu = 0,
+  inspectorTimingSourceKernelCpu = 1,
+  inspectorTimingSourceCollectiveCpu = 2,
+} inspectorTimingSource_t;
+
+// One recorded trace point: a timestamp and a sequence number.
+struct inspectorEventTraceInfo {
+  uint64_t ts;  // timestamp of the event
+  uint64_t sn;  // sequence number assigned to the event
+};
+
+// Indices into inspectorEventTrkCollInfo::evntTrace.
+// NCCL_INSP_EVT_TRK_COLL_NEVT is the array length, not an event.
+typedef enum {
+  NCCL_INSP_EVT_TRK_COLL_START = 0,
+  NCCL_INSP_EVT_TRK_COLL_STOP = 1,
+  NCCL_INSP_EVT_TRK_COLL_NEVT = 2,
+} inspectorEventTrkColl_t;
+
+// Indices into inspectorEventTrkKernelInfo::evntTrace.
+// NCCL_INSP_EVT_TRK_KERNEL_NEVT is the array length, not an event.
+typedef enum {
+  NCCL_INSP_EVT_TRK_KERNEL_START = 0,
+  NCCL_INSP_EVT_TRK_KERNEL_STOP = 1,
+  NCCL_INSP_EVT_TRK_KERNEL_RECORD = 2,
+  NCCL_INSP_EVT_TRK_KERNEL_NEVT = 3,
+} inspectorEventTrkKernel_t;
+
+// Trace points for one kernel channel, indexed by inspectorEventTrkKernel_t.
+struct inspectorEventTrkKernelInfo {
+  struct inspectorEventTraceInfo evntTrace[NCCL_INSP_EVT_TRK_KERNEL_NEVT];
+};
+
+// Event-trace bookkeeping for one collective: its own trace points plus one
+// set of kernel trace points per channel slot.
+struct inspectorEventTrkCollInfo {
+  // Running event counter for this collective.
+  // NOTE(review): declared int while inspectorEventTraceInfo::sn is uint64_t.
+  int sn;
+  uint32_t nChannels;
+  struct inspectorEventTraceInfo evntTrace[NCCL_INSP_EVT_TRK_COLL_NEVT];  // indexed by inspectorEventTrkColl_t
+  struct inspectorEventTrkKernelInfo kernelCh[MAX_CHANNELS];
+};
+
+// Summary of a completed collective: function, sequence number, message size,
+// execution time and its source, bandwidth figures, and the event trace.
+struct inspectorCompletedCollInfo {
+  ncclFunc_t func;
+  uint64_t sn;
+  size_t msgSizeBytes;
+  uint64_t execTimeUsecs;
+  inspectorTimingSource_t timingSource;
+  double algoBwGbs;
+  double busBwGbs;
+  // Event trace information
+  struct inspectorEventTrkCollInfo collEvtTrk;
+};
+
+// Size in bytes of inspectorCommInfo::commHashStr.
+// NOTE(review): presumably 16 hex digits of the 64-bit hash plus a
+// terminating NUL — confirm against the code that fills the buffer.
+enum {
+  NCCL_COMM_HASH_LENGTH = 17
+};
+
+// Per-communicator inspector state. A pointer to this struct is what the
+// plugin hands back to the caller as its opaque context.
+struct inspectorCommInfo {
+  // NOTE(review): presumably links the communicators into a singly linked
+  // list owned by the global inspector state — confirm.
+  struct inspectorCommInfo* next;
+
+  const char* commName;
+  uint64_t commHash;
+  char commHashStr[NCCL_COMM_HASH_LENGTH];
+  int rank;
+  int nranks;
+  int nnodes;
+
+  bool dump;
+  struct inspectorCompletedCollInfo completedCollInfo;
+  pthread_rwlock_t guard;  // reader/writer lock embedded in this struct
+};
+
+// State for one kernel channel of a collective, with CPU-side start and
+// completion timestamps and GPU clock readings.
+struct inspectorKernelChInfo {
+  // NOTE(review): presumably a profiler event-type tag, as in
+  // inspectorCollInfo::type — confirm.
+  uint64_t type;
+  int refCount; /*unused*/
+  struct inspectorCollInfo *collInfo;  // owning collective
+  uint8_t channelId;
+  uint64_t tsStartUsec;
+  uint64_t tsCompletedUsec;
+  uint64_t startGpuClk;
+  uint64_t stopGpuClk;
+};
+
+// State for one in-flight collective. Instances are heap-allocated and
+// reference-counted via refCount.
+struct inspectorCollInfo {
+  uint64_t type;        // profiler event type tag
+  int refCount;         // outstanding references to this struct
+  struct inspectorCommInfo *commInfo;  // communicator this collective runs on
+  const char* func;     // collective function name
+  uint64_t sn;          // collective sequence number
+  size_t msgSizeBytes;
+  uint64_t tsStartUsec;
+  uint64_t tsCompletedUsec;
+  uint32_t nChannels;
+  uint32_t nKernelChStarted;
+  uint32_t nKernelChCompleted;
+  pthread_rwlock_t guard;  // reader/writer lock embedded in this struct
+  struct inspectorKernelChInfo kernelCh[MAX_CHANNELS];
+  struct inspectorEventTrkCollInfo collEvtTrk;
+};
+
+
+
+// Logger callback used by the VERSION/INFO/WARN macros; declared here and
+// defined in another translation unit.
+extern ncclDebugLogger_t logFn;
+// Logging shorthands that forward to logFn with a fixed level.
+// Note: INFO passes __func__ as the location, while VERSION and WARN pass __FILE__.
+#define VERSION(...) logFn(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
+#define INFO(FLAGS, ...) logFn(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
+#define WARN(...) logFn(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
+
+// Returns the size in bytes of one element of `type`, or -1 when the type is
+// not one of the cases handled below.
+inline int ncclTypeSize(ncclDataType_t type) {
+  int bytes = -1;  // stays -1 for unrecognised types
+  switch (type) {
+  case ncclInt8:
+  case ncclUint8:
+  case ncclFloat8e4m3:
+  case ncclFloat8e5m2:
+    bytes = 1;
+    break;
+  case ncclFloat16:
+  case ncclBfloat16:
+    bytes = 2;
+    break;
+  case ncclInt32:
+  case ncclUint32:
+  case ncclFloat32:
+    bytes = 4;
+    break;
+  case ncclInt64:
+  case ncclUint64:
+  case ncclFloat64:
+    bytes = 8;
+    break;
+  default:
+    break;
+  }
+  return bytes;
+}
+
+// Printable description of `result`; used as the %s argument in INS_CHK_GOTO.
+const char* inspectorErrorString(inspectorResult_t result);
+
+// Helpers operating on the pthread_rwlock_t guards embedded in the structs
+// above.
+// NOTE(review): presumably thin wrappers over the matching pthread_rwlock_*
+// calls that map failures to inspectorResult_t — definitions are not in this
+// header; confirm.
+inspectorResult_t inspectorLockInit(pthread_rwlock_t* lockRef);
+inspectorResult_t inspectorLockDestroy(pthread_rwlock_t* lockRef);
+inspectorResult_t inspectorLockRd(pthread_rwlock_t* lockRef);
+inspectorResult_t inspectorLockWr(pthread_rwlock_t* lockRef);
+inspectorResult_t inspectorUnlockRWLock(pthread_rwlock_t* lockRef);
+// Process-wide setup and teardown of inspector state.
+inspectorResult_t inspectorGlobalInit(int rank);
+inspectorResult_t inspectorGlobalFinalize();
+// Current time as a 64-bit value.
+// NOTE(review): presumably microseconds, given the *Usec field names — confirm.
+uint64_t inspectorGetTime();
+// Registers a communicator and returns its state through `commInfo`;
+// inspectorDelComm releases it again.
+inspectorResult_t inspectorAddComm(struct inspectorCommInfo **commInfo,
+                                   const char* commName, uint64_t commHash,
+                                   int nNodes, int nranks, int rank);
+inspectorResult_t inspectorDelComm(struct inspectorCommInfo *commInfo);
+
+// NOTE(review): presumably copies the results of a finished collective from
+// `collInfo` into `completedColl` — confirm against the definition.
+void inspectorUpdateCollPerf(struct inspectorCompletedCollInfo *completedColl,
+                             struct inspectorCollInfo *collInfo);
+// Maps a datatype name to its ncclDataType_t value.
+ncclDataType_t inspectorStringToDatatype(const char* str);
+
+// NOTE(review): presumably fills algoBwGbs/busBwGbs of `completedColl` for
+// the given collective type — confirm against the definition.
+void inspectorComputeCollBw(struct inspectorCommInfo *commInfo,
+                            struct inspectorCompletedCollInfo *completedColl,
+                            ncclFunc_t collType);
@@ -0,0 +1,493 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <stdio.h>
+#include <pthread.h>
+#include <string.h>
+#include <linux/limits.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include "profiler.h"
+#include "inspector.h"
+
+#define __hidden __attribute__ ((visibility("hidden")))
+
+// Count of plugin init calls not yet balanced by a finalize call; global
+// init runs on the 0 -> 1 transition and global finalize on the return to 0.
+static int gInitialized;
+
+// Serialises updates to gInitialized and the global init/finalize calls.
+static pthread_mutex_t gLock = PTHREAD_MUTEX_INITIALIZER;
+
+
+/*
+ * Description:
+ *   Records an event trace with timestamp and sequence number
+ *
+ * Thread Safety:
+ *   Not thread-safe - must be called with proper locking. This function
+ *   is designed to be called from within locked sections where the
+ *   collective info structure is already protected.
+ *
+ * Input:
+ *   struct inspectorEventTraceInfo* evtTrace - event trace array
+ *   int eventIndex - index in the event trace array (must be valid)
+ *   struct inspectorCollInfo* collInfo - collective info structure (must not be NULL)
+ *
+ * Output:
+ *   Event trace is updated with current timestamp and next sequence
+ *   number from collective
+ *
+ * Return:
+ *   uint64_t - the sequence number assigned to this event
+ *
+ * Preconditions:
+ *   - collInfo must not be NULL
+ *   - eventIndex must be within valid bounds for evtTrace array
+ *   - Function must be called from within a locked section
+ */
+static uint64_t inspectorRecordEventTrace(struct inspectorEventTraceInfo* evtTrace,
+                                          int eventIndex,
+                                          struct inspectorCollInfo* collInfo) {
+  struct inspectorEventTraceInfo* slot = &evtTrace[eventIndex];
+
+  slot->ts = inspectorGetTime();
+  // Advance the per-collective counter first, so the first event gets 1.
+  collInfo->collEvtTrk.sn += 1;
+  slot->sn = collInfo->collEvtTrk.sn;
+
+  return slot->sn;
+}
+
+/*
+ * Description:
+ *
+ *   Initializes the NCCL Inspector plugin and global state for a
+ *   communicator. The first call in the process also runs
+ *   inspectorGlobalInit().
+ *
+ * Thread Safety:
+ *   The global init counter is updated under gLock. The assignment to
+ *   logFn happens before the lock is taken.
+ *
+ * Input (in signature order):
+ *   void** context - pointer to plugin context.
+ *   uint64_t commHash - communicator hash.
+ *   int* eActivationMask - pointer to activation mask output.
+ *   const char* commName - communicator name (may be NULL).
+ *   int nNodes - number of nodes.
+ *   int nranks - number of ranks.
+ *   int rank - rank.
+ *   ncclDebugLogger_t logfn - logger function pointer.
+ *
+ * Output:
+ *   context is reset to nullptr and then handed to inspectorAddComm() to
+ *   be filled in; eActivationMask is set to
+ *   ncclProfileColl | ncclProfileKernelCh when inspectorAddComm() succeeds.
+ *
+ * Return:
+ *   ncclSuccess, or ncclInternalError when global init or
+ *   inspectorAddComm() fails.
+ *
+ */
+__hidden ncclResult_t inspectorPluginInit(void** context, uint64_t commHash,
+                                          int* eActivationMask,
+                                          const char* commName,
+                                          int nNodes, int nranks, int rank,
+                                          ncclDebugLogger_t logfn) {
+  inspectorResult_t res = inspectorSuccess;
+  *context = nullptr;
+  logFn = logfn;
+
+  pthread_mutex_lock(&gLock);
+  // Only the 0 -> 1 transition performs global initialisation.
+  if (++gInitialized == 1) {
+    res = inspectorGlobalInit(rank);
+    if (res != inspectorSuccess) {
+      WARN("Inspector Init Failed %s:%d -> error %d: %s",__FILE__, __LINE__, res,
+           inspectorErrorString(res));
+      gInitialized = 0;
+      pthread_mutex_unlock(&gLock);
+      return ncclInternalError;
+    }
+  }
+  pthread_mutex_unlock(&gLock);
+
+  // NOTE(review): if inspectorAddComm fails, gInitialized stays incremented.
+  // Whether that is balanced depends on the host still calling finalize after
+  // a failed init — confirm.
+  INS_CHK_GOTO(inspectorAddComm((struct inspectorCommInfo **)context,
+                                commName, commHash,
+                                nNodes, nranks, rank), res, success);
+  *eActivationMask = ncclProfileColl | ncclProfileKernelCh;
+  // NOTE(review): commHash is uint64_t but is printed with %lu — this assumes
+  // unsigned long is 64 bits wide.
+  INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d",
+       commName ? commName : "", commHash, nranks, rank);
+// Despite its name, this label is reached on both the success and failure paths.
+success:
+  if (res != inspectorSuccess) {
+    return ncclInternalError;
+  } else {
+    return ncclSuccess;
+  }
+}
+
+/*
+ * Description:
+ *
+ *   Finalizes the NCCL Inspector plugin and global state for a
+ *   communicator.
+ *
+ * Thread Safety:
+ *   Thread-safe (uses mutex for finalization).
+ *
+ * Input:
+ *   void* context - plugin context.
+ *
+ * Output:
+ *   Plugin context is finalized and cleaned up.
+ *
+ * Return:
+ *   ncclResult_t - success or error code.
+ *
+ */
+__hidden ncclResult_t inspectorPluginFinalize(void* context) {
+  // Release this communicator's state first; the result is not checked.
+  struct inspectorCommInfo* commInfo = (struct inspectorCommInfo *)context;
+  inspectorDelComm(commInfo);
+
+  // Drop one init reference; the call that brings the count to zero tears
+  // down the global state while still holding gLock.
+  pthread_mutex_lock(&gLock);
+  gInitialized -= 1;
+  if (gInitialized == 0) {
+    inspectorGlobalFinalize();
+  }
+  pthread_mutex_unlock(&gLock);
+
+  return ncclSuccess;
+}
+
+// Adds one reference to `collInfo`. Takes no lock itself; always returns
+// inspectorSuccess.
+inspectorResult_t inspectorPluginCollInfoRef(struct inspectorCollInfo *collInfo) {
+  ++collInfo->refCount;
+  return inspectorSuccess;
+}
+
+// Adds one reference to `collInfo` while holding its write lock. Always
+// returns inspectorSuccess; lock results are not checked.
+inspectorResult_t inspectorPluginCollInfoRefSafe(struct inspectorCollInfo *collInfo) {
+  pthread_rwlock_t* guard = &collInfo->guard;
+
+  inspectorLockWr(guard);
+  inspectorPluginCollInfoRef(collInfo);
+  inspectorUnlockRWLock(guard);
+
+  return inspectorSuccess;
+}
+
+// Drops one reference to `collInfo` without taking its lock. When the count
+// reaches zero the guard is destroyed, the struct is zeroed and freed, and
+// inspectorReturn is returned to signal that `collInfo` is gone; otherwise
+// returns inspectorSuccess.
+inspectorResult_t inspectorPluginCollInfoDeRef(struct inspectorCollInfo *collInfo) {
+  collInfo->refCount -= 1;
+  if (collInfo->refCount != 0) {
+    return inspectorSuccess;
+  }
+
+  // Last reference dropped: tear the object down.
+  inspectorLockDestroy(&collInfo->guard);
+  memset(collInfo, 0, sizeof(struct inspectorCollInfo));
+  free(collInfo);
+  return inspectorReturn;
+}
+
+// Drops one reference on collInfo while holding collInfo->guard for writing.
+//
+// Returns whatever inspectorPluginCollInfoDeRef returns: inspectorSuccess
+// while references remain, or inspectorReturn when this call released the
+// last reference and collInfo has been freed.
+inspectorResult_t inspectorPluginCollInfoDeRefSafe(struct inspectorCollInfo *collInfo) {
+  inspectorLockWr(&collInfo->guard);
+  inspectorResult_t res = inspectorPluginCollInfoDeRef(collInfo);
+  // On inspectorReturn the guard was destroyed and collInfo freed by the
+  // DeRef above, so unlocking would touch freed memory. Only unlock when the
+  // object is still alive.
+  if (res != inspectorReturn) {
+    inspectorUnlockRWLock(&collInfo->guard);
+  }
+  return res;
+}
+
+/*
+ * Description:
+ *   Allocates and fills in a new inspectorCollInfo structure for a
+ *   collective event.
+ *
+ * Thread Safety:
+ *   Not thread-safe (the new structure is populated without taking its
+ *   guard; the guard is only initialized as the last step).
+ *
+ * Input:
+ *
+ *   struct inspectorCollInfo **collInfo - receives the new collective
+ *   info struct.
+ *   ncclProfilerEventDescr_t *eDescr - event descriptor (its coll
+ *   fields are read).
+ *   struct inspectorCommInfo *commInfo - communicator info stored in
+ *   the new struct.
+ *
+ * Output:
+ *   *collInfo points to the new collective info struct, or is nullptr
+ *   when the allocation failed.
+ *
+ * Return:
+ *   None.
+ */
+static void inspectorPluginCollInfoInit(struct inspectorCollInfo **collInfo,
+                                        ncclProfilerEventDescr_t *eDescr,
+                                        struct inspectorCommInfo *commInfo) {
+  struct inspectorCollInfo *info
+    = (struct inspectorCollInfo*)calloc(1, sizeof(*info));
+  if (!info) {
+    WARN("Inspector: Failed to allocate memory for collective info structure");
+    *collInfo = nullptr;
+    return;
+  }
+
+  info->type = ncclProfileColl;
+
+  // Reference accounting: one self reference, plus one extra reference held
+  // for kernel completion when the collective reports any channels. No lock
+  // is taken because the object has not been handed out yet.
+  info->refCount = 0;
+  inspectorPluginCollInfoRef(info);
+  info->func = eDescr->coll.func;
+  info->sn = eDescr->coll.seqNumber;
+  info->nChannels = eDescr->coll.nChannels;
+  if (info->nChannels > 0) {
+    inspectorPluginCollInfoRef(info);
+  }
+
+  info->tsStartUsec = inspectorGetTime();
+
+  // Message size = element size of the datatype * element count.
+  const auto dtype = inspectorStringToDatatype(eDescr->coll.datatype);
+  info->msgSizeBytes = ncclTypeSize(dtype) * eDescr->coll.count;
+
+  info->commInfo = commInfo;
+  info->collEvtTrk.sn = 0;
+  info->collEvtTrk.nChannels = info->nChannels;
+  inspectorRecordEventTrace(info->collEvtTrk.evntTrace,
+                            NCCL_INSP_EVT_TRK_COLL_START, info);
+
+  inspectorLockInit(&info->guard);
+  *collInfo = info;
+}
+
+/*
+ * Description:
+ *
+ *   Initializes a new inspectorKernelChInfo structure for a kernel
+ *   channel event.
+ *
+ * Thread Safety:
+ *   Not thread-safe (initializes kernel channel info within a
+ *   collective info structure).
+ *
+ * Input:
+ *   struct inspectorKernelChInfo **kernelChInfo - pointer to output
+ *   kernel channel info struct.
+ *   ncclProfilerEventDescr_t *eDescr - event descriptor.
+ *
+ * Output:
+ *
+ *   kernelChInfo is set to the new kernel channel info struct.
+ *
+ * Return:
+ *   None.
+ */
+static void inspectorPluginKernelChInfoInit(struct inspectorKernelChInfo **kernelChInfo,
+                                            ncclProfilerEventDescr_t *eDescr) {
+  // Without a parent object there is nothing to attach to; *kernelChInfo is
+  // left untouched in every early-return path below.
+  if (!eDescr->parentObj) {
+    return;
+  }
+
+  // The parent's leading 64-bit word is read as its event type tag.
+  const uint64_t parentType = *(uint64_t*)eDescr->parentObj;
+  if (parentType != ncclProfileColl) {
+    return;
+  }
+
+  struct inspectorCollInfo *collInfo = (struct inspectorCollInfo*)eDescr->parentObj;
+  if (collInfo->type != ncclProfileColl) {
+    return;
+  }
+
+  inspectorLockWr(&collInfo->guard);
+
+  const auto channelId = eDescr->kernelCh.channelId;
+  inspectorRecordEventTrace(collInfo->collEvtTrk.kernelCh[channelId].evntTrace,
+                            NCCL_INSP_EVT_TRK_KERNEL_START,
+                            collInfo);
+
+  // The kernel-channel slot lives inside the parent collective's array.
+  struct inspectorKernelChInfo *chInfo = &collInfo->kernelCh[channelId];
+  chInfo->type = ncclProfileKernelCh;
+  chInfo->channelId = channelId;
+  chInfo->startGpuClk = eDescr->kernelCh.pTimer;
+  if (chInfo->stopGpuClk == 0) {
+    // No stop clock recorded yet: take the reference that the
+    // "record kernel stop" event releases.
+    inspectorPluginCollInfoRef(collInfo);
+  }
+
+  chInfo->tsStartUsec = inspectorGetTime();
+  if (collInfo->nKernelChStarted == 0) {
+    // First channel to start defines the collective's start timestamp.
+    collInfo->tsStartUsec = chInfo->tsStartUsec;
+  }
+  collInfo->nKernelChStarted += 1;
+
+  // Reference released by the matching kernel-channel stop event.
+  inspectorPluginCollInfoRef(collInfo);
+  chInfo->collInfo = collInfo;
+
+  *kernelChInfo = chInfo;
+  inspectorUnlockRWLock(&collInfo->guard);
+}
+/*
+ * Description:
+ *
+ *   Starts a profiling event for the NCCL Inspector plugin.
+ *
+ * Thread Safety:
+ *   Thread-safe (allocates and initializes event structures).
+ *
+ * Input:
+ *   void* context - plugin context.
+ *   void** eHandle - pointer to event handle output.
+ *   ncclProfilerEventDescr_t* eDescr - event descriptor.
+ *
+ * Output:
+ *   eHandle is set to the new event structure.
+ *
+ * Return:
+ *   ncclResult_t - success or error code.
+ *
+ */
+__hidden ncclResult_t inspectorPluginStartEvent(void* context,
+                                                void** eHandle,
+                                                ncclProfilerEventDescr_t* eDescr) {
+  if (!context || !eDescr) {
+    INFO(NCCL_INIT, "Profiler/Plugin: context/eDescr NULL for start event %s", __func__);
+    return ncclSuccess;
+  }
+
+  // Start from "no handle"; event types this plugin does not track keep it.
+  *eHandle = nullptr;
+
+  if (eDescr->type == ncclProfileColl) {
+    // Collective event: allocate a fresh collective info bound to this comm.
+    struct inspectorCollInfo *collEvent = nullptr;
+    inspectorPluginCollInfoInit(&collEvent, eDescr,
+                                (struct inspectorCommInfo*)context);
+    *eHandle = collEvent;
+    return ncclSuccess;
+  }
+
+  if (eDescr->type == ncclProfileKernelCh) {
+    // Kernel channel event: the handle is a slot inside the parent collective.
+    struct inspectorKernelChInfo *kernelChEvent = nullptr;
+    inspectorPluginKernelChInfoInit(&kernelChEvent, eDescr);
+    *eHandle = kernelChEvent;
+  }
+
+  return ncclSuccess;
+}
+
+/*
+ * Description:
+ *
+ *   Stops a profiling event for the NCCL Inspector plugin.
+ *
+ * Thread Safety:
+ *
+ *   Thread-safe (updates event state and performance info).
+ *
+ * Input:
+ *
+ *   void *eHandle - event handle.
+ *
+ * Output:
+ *
+ *   Event is stopped and performance info may be updated.
+ *
+ * Return:
+ *   ncclResult_t - success or error code.
+ *
+ */
+__hidden ncclResult_t inspectorPluginStopEvent(void *eHandle) {
+
+  if (eHandle == nullptr) {
+    INFO(NCCL_INIT,
+         "Profiler/Plugin: Event Handle NULL for stop event %s", __func__);
+    return ncclSuccess;
+  }
+  // The handle's leading 64-bit word is read as the event type tag.
+  uint64_t type = *(uint64_t *)eHandle;
+  inspectorResult_t res = inspectorSuccess;
+
+  if (type == ncclProfileColl) {
+    struct inspectorCollInfo *collInfo = (struct inspectorCollInfo *)eHandle;
+    // Record collective stop event
+    inspectorLockWr(&collInfo->guard);
+    inspectorRecordEventTrace(collInfo->collEvtTrk.evntTrace,
+                              NCCL_INSP_EVT_TRK_COLL_STOP,
+                              collInfo);
+    res = inspectorPluginCollInfoDeRef(collInfo);
+    if (res == inspectorReturn) {
+      // collInfo (and its guard) was freed by the DeRef: do not unlock.
+      return ncclSuccess;
+    }
+    inspectorUnlockRWLock(&collInfo->guard);
+    return ncclSuccess;
+  } else if (type == ncclProfileKernelCh) {
+    struct inspectorKernelChInfo *kernelChInfo
+      = (struct inspectorKernelChInfo *)eHandle;
+    struct inspectorCollInfo *collInfo = kernelChInfo->collInfo;
+    if (collInfo && collInfo->type == ncclProfileColl) {
+      inspectorLockWr(&collInfo->guard);
+      struct inspectorEventTraceInfo *krnlEvtTrk =
+        collInfo->collEvtTrk.kernelCh[kernelChInfo->channelId].evntTrace;
+      inspectorRecordEventTrace(krnlEvtTrk,
+                                NCCL_INSP_EVT_TRK_KERNEL_STOP,
+                                collInfo);
+      kernelChInfo->tsCompletedUsec = inspectorGetTime();
+      collInfo->nKernelChCompleted += 1;
+
+      // Release the reference taken when this kernel channel started.
+      res = inspectorPluginCollInfoDeRef(collInfo);
+      if (res == inspectorReturn) {
+        // collInfo was freed here, before the completion bookkeeping below
+        // could run; the guard is gone, so return without unlocking.
+        WARN("NCCL Inspector unnatural return: inspectorPluginStopEvent:ncclProfileKernelCh");
+        return ncclSuccess;
+      }
+      // All started channels have completed and every channel of the
+      // collective has reported: publish the collective's performance data.
+      if ((collInfo->nKernelChCompleted == collInfo->nKernelChStarted)
+          && (collInfo->nKernelChCompleted == collInfo->nChannels)) {
+        struct inspectorCompletedCollInfo completedColl;
+        // Cache commInfo now: collInfo may be freed by the DeRef below.
+        struct inspectorCommInfo *commInfo = collInfo->commInfo;
+        collInfo->tsCompletedUsec = kernelChInfo->tsCompletedUsec;
+        inspectorUpdateCollPerf(&completedColl, collInfo);
+
+        // Release the extra "kernel completion" reference.
+        res = inspectorPluginCollInfoDeRef(collInfo);
+        if (res != inspectorReturn) {
+          // Object still alive, so its guard is still valid to unlock.
+          inspectorUnlockRWLock(&collInfo->guard);
+        }
+        if (commInfo != nullptr) {
+          // Only the local completedColl copy is used from here on.
+          inspectorLockWr(&commInfo->guard);
+          inspectorComputeCollBw(commInfo,
+                                 &completedColl,
+                                 completedColl.func);
+          memcpy(&commInfo->completedCollInfo,
+                 &completedColl,
+                 sizeof(struct inspectorCompletedCollInfo));
+          commInfo->dump = true;
+          inspectorUnlockRWLock(&commInfo->guard);
+        }
+        return ncclSuccess;
+      }
+      inspectorUnlockRWLock(&collInfo->guard);
+    }
+    return ncclSuccess;
+  }
+  return ncclSuccess;
+}
+
+/*
+ * Description:
+ *
+ *   Records the state of a profiling event for the NCCL Inspector
+ *   plugin.
+ *
+ * Thread Safety:
+ *
+ *   Thread-safe (updates event state as needed).
+ *
+ * Input:
+ *   void* eHandle - event handle.
+ *   ncclProfilerEventState_t eState - event state.
+ *   ncclProfilerEventStateArgs_t* eStateArgs - event state arguments.
+ *
+ * Output:
+ *   Event state is updated as needed.
+ *
+ * Return:
+ *   ncclResult_t - success or error code.
+ *
+ */
+__hidden ncclResult_t inspectorPluginRecordEventState(void* eHandle,
+                                                      ncclProfilerEventState_t eState,
+                                                      ncclProfilerEventStateArgs_t* eStateArgs) {
+  if (eHandle == nullptr || eStateArgs == nullptr) {
+    return ncclSuccess;
+  }
+
+  // Only the kernel-channel "stop" state on a kernel-channel handle is
+  // handled; every other combination is a no-op.
+  const uint64_t type = *(uint64_t *)eHandle;
+  if (type != ncclProfileKernelCh || eState != ncclProfilerKernelChStop) {
+    return ncclSuccess;
+  }
+
+  struct inspectorKernelChInfo *kernelChInfo = (struct inspectorKernelChInfo *)eHandle;
+  struct inspectorCollInfo *collInfo = kernelChInfo->collInfo;
+  if (collInfo == nullptr || collInfo->type != ncclProfileColl) {
+    return ncclSuccess;
+  }
+
+  inspectorLockWr(&collInfo->guard);
+  inspectorRecordEventTrace(collInfo->collEvtTrk.kernelCh[kernelChInfo->channelId].evntTrace,
+                            NCCL_INSP_EVT_TRK_KERNEL_RECORD,
+                            collInfo);
+  kernelChInfo->stopGpuClk = eStateArgs->kernelCh.pTimer;
+
+  // A start clock was recorded, so release one reference on the parent
+  // collective. If that frees collInfo, its guard is gone: return without
+  // unlocking.
+  if (kernelChInfo->startGpuClk != 0
+      && inspectorPluginCollInfoDeRef(collInfo) == inspectorReturn) {
+    WARN("NCCL Inspector unnatural return: inspectorPluginRecordEventState");
+    return ncclSuccess;
+  }
+
+  inspectorUnlockRWLock(&collInfo->guard);
+  return ncclSuccess;
+}
+
+// Profiler plugin descriptor object: the plugin name followed by this
+// file's entry points, listed positionally.
+// NOTE(review): the initializer order must match the field order of
+// ncclProfiler_t (declared in profiler_v5.h, not visible here) — confirm.
+ncclProfiler_t ncclProfiler_v5 = {
+  "Inspector",                       // plugin name
+  inspectorPluginInit,
+  inspectorPluginStartEvent,
+  inspectorPluginStopEvent,
+  inspectorPluginRecordEventState,
+  inspectorPluginFinalize,
+};
@@ -0,0 +1,496 @@
+#include "json.h"
+#include <assert.h>
+#include <math.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// Maps a jsonResult_t to its enumerator name as a static string. Values with
+// no matching enumerator yield "unknown json error".
+const char* jsonErrorString(jsonResult_t res) {
+  switch (res) {
+  case jsonSuccess:                   return "jsonSuccess";
+  case jsonFileError:                 return "jsonFileError";
+  case jsonUnknownStateError:         return "jsonUnknownStateError";
+  case jsonEmptyStateError:           return "jsonEmptyStateError";
+  case jsonExpectedNonNoneStateError: return "jsonExpectedNonNoneStateError";
+  case jsonMemoryError:               return "jsonMemoryError";
+  case jsonStringOverflowError:       return "jsonStringOverflowError";
+  case jsonStringBadChar:             return "jsonStringBadChar";
+  case jsonLockError:                 return "jsonLockError";
+  default:                            break;
+  }
+  return "unknown json error";
+}
+
+// Per-output-file writer state. It maintains a stack of jsonState_t values
+// describing where in the document we are currently writing (inside an
+// object, inside a list, right after a key, ...).
+typedef struct jsonFileOutput {
+  jsonState_t* states; // State stack storage; NULL until the first push
+  size_t state_cap; // Allocated stack capacity
+  size_t state_n;   // # of items in the stack.
+  FILE* fp;         // Output stream opened by jsonInitFileOutput
+  pthread_mutex_t mutex; // Taken/released by jsonLockOutput/jsonUnlockOutput
+} jsonFileOutput;
+
+// Creates a JSON writer that writes to 'outfile' (opened with mode "w").
+//
+// On success stores the new writer in *jfo and returns jsonSuccess; release
+// it with jsonFinalizeFileOutput. On any failure *jfo is set to NULL, nothing
+// is left allocated, and one of these is returned:
+//   jsonMemoryError - the writer could not be allocated
+//   jsonLockError   - the mutex could not be initialized
+//   jsonFileError   - the output file could not be opened
+jsonResult_t jsonInitFileOutput(jsonFileOutput** jfo, const char* outfile) {
+  *jfo = NULL;
+
+  jsonFileOutput* new_jfo = (jsonFileOutput*)malloc(sizeof(jsonFileOutput));
+  if (new_jfo == NULL) {
+    return jsonMemoryError;
+  }
+  if (pthread_mutex_init(&new_jfo->mutex, NULL) != 0) {
+    free(new_jfo);
+    return jsonLockError;
+  }
+  new_jfo->states = NULL;
+  new_jfo->state_cap = 0;
+  new_jfo->state_n = 0;
+  new_jfo->fp = fopen(outfile, "w");
+  if (new_jfo->fp == NULL) {
+    // The mutex was initialized above; destroy it before dropping the writer.
+    pthread_mutex_destroy(&new_jfo->mutex);
+    free(new_jfo);
+    return jsonFileError;
+  }
+  *jfo = new_jfo;
+  return jsonSuccess;
+}
+
+// Writes a single newline character to the output. The write result is not
+// checked; always returns jsonSuccess.
+jsonResult_t jsonNewline(jsonFileOutput* jfo) {
+  fputc('\n', jfo->fp);
+  return jsonSuccess;
+}
+
+// Flushes buffered output to the underlying file. The flush result is not
+// checked; always returns jsonSuccess.
+jsonResult_t jsonFlushOutput(jsonFileOutput* jfo) {
+  FILE* stream = jfo->fp;
+  (void)fflush(stream);
+  return jsonSuccess;
+}
+
+// Locks the writer's mutex. Returns jsonLockError if the lock call fails.
+jsonResult_t jsonLockOutput(jsonFileOutput* jfo) {
+  const int rc = pthread_mutex_lock(&jfo->mutex);
+  return (rc == 0) ? jsonSuccess : jsonLockError;
+}
+
+// Unlocks the writer's mutex. Returns jsonLockError if the unlock call fails.
+jsonResult_t jsonUnlockOutput(jsonFileOutput* jfo) {
+  const int rc = pthread_mutex_unlock(&jfo->mutex);
+  return (rc == 0) ? jsonSuccess : jsonLockError;
+}
+
+// Destroys a writer created by jsonInitFileOutput: destroys its mutex, frees
+// the state stack, closes the file and frees the writer itself. All
+// resources are released even when a step fails; jfo must not be used
+// afterwards.
+//
+// Returns jsonLockError if the mutex could not be destroyed, otherwise
+// jsonFileError if closing the file failed (buffered output may be lost),
+// otherwise jsonSuccess.
+jsonResult_t jsonFinalizeFileOutput(jsonFileOutput* jfo) {
+  // Really should probably complain if we aren't in a valid state
+
+  jsonResult_t res = jsonSuccess;
+  if (pthread_mutex_destroy(&jfo->mutex) != 0) {
+    // Keep going: the file and the state stack still have to be released.
+    res = jsonLockError;
+  }
+
+  free(jfo->states);  // free(NULL) is a no-op
+  jfo->states = NULL;
+  jfo->state_cap = 0;
+  jfo->state_n = 0;
+
+  if (jfo->fp != NULL) {
+    // fclose flushes pending output, so a failure here means data loss.
+    if (fclose(jfo->fp) != 0 && res == jsonSuccess) {
+      res = jsonFileError;
+    }
+    jfo->fp = NULL;
+  }
+
+  free(jfo);
+  return res;
+}
+
+// Copies one multi-byte (2-, 3- or 4-byte) UTF-8 sequence starting at 'in'
+// into 'out'.
+//
+// The sequence length is taken from the lead byte; every following byte must
+// have the form 10xxxxxx. Returns the number of bytes copied, or 0 (nothing
+// written) when the lead byte is not a 2/3/4-byte lead, a continuation byte
+// is malformed, or the sequence does not fit in 'out_lim' bytes.
+static int utf8copy(unsigned char* out, int out_lim, const unsigned char* in) {
+  const unsigned char lead = in[0];
+
+  int seq_len;
+  if ((lead & 0xE0) == 0xC0) {
+    seq_len = 2;
+  } else if ((lead & 0xF0) == 0xE0) {
+    seq_len = 3;
+  } else if ((lead & 0xF8) == 0xF0) {
+    seq_len = 4;
+  } else {
+    // Not the start of a multi-byte sequence.
+    return 0;
+  }
+
+  // Validate continuation bytes in order, stopping at the first bad one so a
+  // terminating NUL inside a truncated sequence is never read past.
+  for (int k = 1; k < seq_len; ++k) {
+    if ((in[k] & 0xC0) != 0x80) {
+      return 0;
+    }
+  }
+
+  if (out_lim < seq_len) {
+    return 0;
+  }
+
+  memcpy(out, in, (size_t)seq_len);
+  return seq_len;
+}
+
+// Copies the NUL-terminated string 'in' into 'out' (capacity 'lim' bytes,
+// terminating NUL included), escaping it for use inside a JSON string:
+//   - '"', '\' and '/' are prefixed with a backslash,
+//   - tab and newline become \t and \n,
+//   - other bytes in 0x20..0x7F are copied unchanged,
+//   - structurally valid 2- to 4-byte UTF-8 sequences are copied unchanged.
+//
+// Returns jsonSuccess when all of 'in' was copied, jsonStringOverflowError
+// when 'out' is too small, and jsonStringBadChar for any other control
+// character (<= 0x1F) or a byte sequence utf8copy rejects. Whenever lim > 0,
+// 'out' is left NUL-terminated holding what was copied before the stop.
+static jsonResult_t sanitizeJson(unsigned char out[], int lim, const unsigned char* in) {
+  if (lim <= 0) {
+    // No room even for the terminator; do not touch 'out'.
+    return jsonStringOverflowError;
+  }
+  int c = 0;
+  while (*in) {
+    // Need room for at least one output byte plus the terminating NUL.
+    if (c + 1 >= lim) {
+      out[c] = 0;
+      return jsonStringOverflowError;
+    }
+    switch (*in) {
+    case '"':
+    case '\\':
+    case '/':
+    case '\t':
+    case '\n':
+      // An escape takes two output bytes and the terminating NUL must still
+      // fit after them, so c + 2 has to stay strictly below lim.
+      if (c + 2 >= lim) {
+        out[c] = 0;
+        return jsonStringOverflowError;
+      }
+
+      out[c++] = '\\';
+      if (*in == '\n') {
+        out[c++] = 'n';
+      } else if (*in == '\t') {
+        out[c++] = 't';
+      } else {
+        out[c++] = *in;
+      }
+      ++in;
+      break;
+    default:
+      if (*in <= 0x1F) {
+        // Control characters other than tab/newline are not supported.
+        out[c] = 0;
+        return jsonStringBadChar;
+      } else if (*in <= 0x7F) {
+        out[c++] = *in;
+        ++in;
+      } else {
+        // Multi-byte UTF-8: the limit passed down reserves one byte for the
+        // terminating NUL.
+        const int utf8len = utf8copy(out + c, lim - c - 1, in);
+        if (utf8len == 0) {
+          out[c] = 0;
+          return jsonStringBadChar;
+        }
+        c += utf8len;
+        in += utf8len;
+      }
+      break;
+    }
+  }
+  out[c] = 0;
+  return jsonSuccess;
+}
+
+// Returns the larger of two size_t values (a when they are equal).
+static size_t max(size_t a, size_t b) {
+  return (a < b) ? b : a;
+}
+
+// Push state onto the state stack, growing the storage when needed.
+// Because JSON_NONE is a pseudo-state, it cannot be pushed
+// (jsonExpectedNonNoneStateError). Returns jsonMemoryError if the stack
+// cannot grow; in that case the existing stack is left intact.
+static jsonResult_t jsonPushState(jsonFileOutput* jfo, jsonState_t state) {
+  if (state == JSON_NONE) {
+    return jsonExpectedNonNoneStateError;
+  }
+  if (jfo->state_cap <= (jfo->state_n + 1)) {
+    const size_t new_cap = max((size_t)16, jfo->state_cap * 2);
+    // Grow through a temporary: on failure realloc leaves the old block
+    // valid, so the current pointer and capacity must not be overwritten.
+    jsonState_t* grown = (jsonState_t*)realloc(jfo->states, sizeof(jsonState_t) * new_cap);
+    if (grown == NULL) {
+      return jsonMemoryError;
+    }
+    jfo->states = grown;
+    jfo->state_cap = new_cap;
+  }
+  jfo->states[jfo->state_n++] = state;
+  return jsonSuccess;
+}
+
+// Peek at the state on top of the stack; an empty stack reads as JSON_NONE.
+static jsonState_t jsonCurrState(const jsonFileOutput* jfo) {
+  return (jfo->state_n > 0) ? jfo->states[jfo->state_n - 1] : JSON_NONE;
+}
+
+// Overwrite the top of the stack with 'state' (equivalent to a pop followed
+// by a push). JSON_NONE is rejected with jsonExpectedNonNoneStateError and an
+// empty stack with jsonEmptyStateError.
+static jsonResult_t jsonReplaceState(jsonFileOutput* jfo, jsonState_t state) {
+  if (state == JSON_NONE) {
+    return jsonExpectedNonNoneStateError;
+  }
+  if (jfo->state_n > 0) {
+    const size_t top = jfo->state_n - 1;
+    jfo->states[top] = state;
+    return jsonSuccess;
+  }
+  return jsonEmptyStateError;
+}
+
+// Remove and return the state on top of the stack. An empty stack is left
+// untouched and reported as JSON_NONE.
+static jsonState_t jsonPopState(jsonFileOutput* jfo) {
+  if (jfo->state_n > 0) {
+    jfo->state_n -= 1;
+    return jfo->states[jfo->state_n];
+  }
+  return JSON_NONE;
+}
+
+// Emit a key and its ':' separator. The key is sanitized first.
+// This is only acceptable if the top state is an object; otherwise
+// jsonUnknownStateError is returned.
+// Emit a ',' separator if we aren't the first item.
+// If the key cannot be sanitized, the sanitizer's error is returned and
+// nothing is written and no state is changed. On success JSON_KEY is pushed
+// so that the next value call is accepted.
+jsonResult_t jsonKey(jsonFileOutput* jfo, const char* name) {
+  const jsonState_t curr = jsonCurrState(jfo);
+  if (curr != JSON_OBJECT_EMPTY && curr != JSON_OBJECT_SOME) {
+    return jsonUnknownStateError;
+  }
+
+  // Sanitize before emitting anything, so a rejected key cannot leave a
+  // dangling ',' in the file or flip the object to "has items".
+  unsigned char tmp[2048];
+  const jsonResult_t res = sanitizeJson(tmp, sizeof(tmp), (const unsigned char*)name);
+  if (res != jsonSuccess) {
+    return res;
+  }
+
+  if (curr == JSON_OBJECT_EMPTY) {
+    const jsonResult_t rep = jsonReplaceState(jfo, JSON_OBJECT_SOME);
+    if (rep != jsonSuccess) {
+      return rep;
+    }
+  } else {
+    fprintf(jfo->fp, ",");
+  }
+  fprintf(jfo->fp, "\"%s\":", tmp);
+  // Report a failed push instead of silently continuing without JSON_KEY.
+  return jsonPushState(jfo, JSON_KEY);
+}
+
+// Shared prologue for every value writer.
+// A value is only acceptable after a key, at top level, or inside a list;
+// any other state yields jsonUnknownStateError.
+// Emits the preceding ',' when inside a list that already has an item.
+static jsonResult_t jsonValHelper(jsonFileOutput* jfo) {
+  const jsonState_t curr = jsonCurrState(jfo);
+
+  if (curr == JSON_LIST_EMPTY) {
+    // First list item: no separator, but the list is no longer empty.
+    jsonReplaceState(jfo, JSON_LIST_SOME);
+    return jsonSuccess;
+  }
+  if (curr == JSON_LIST_SOME) {
+    fputc(',', jfo->fp);
+    return jsonSuccess;
+  }
+  if (curr == JSON_KEY) {
+    // The value consumes the pending key.
+    jsonPopState(jfo);
+    return jsonSuccess;
+  }
+  if (curr == JSON_NONE) {
+    return jsonSuccess;
+  }
+  return jsonUnknownStateError;
+}
+
+// Open an object: writes '{' as a value and pushes JSON_OBJECT_EMPTY.
+jsonResult_t jsonStartObject(jsonFileOutput* jfo) {
+  const jsonResult_t status = jsonValHelper(jfo);
+  if (status == jsonSuccess) {
+    fputc('{', jfo->fp);
+    return jsonPushState(jfo, JSON_OBJECT_EMPTY);
+  }
+  return status;
+}
+
+// Close an object: pops the top state and writes '}' if that state was an
+// object state; otherwise returns jsonUnknownStateError (the state stays
+// popped).
+jsonResult_t jsonFinishObject(jsonFileOutput* jfo) {
+  const jsonState_t popped = jsonPopState(jfo);
+  if (popped != JSON_OBJECT_EMPTY && popped != JSON_OBJECT_SOME) {
+    return jsonUnknownStateError;
+  }
+  fputc('}', jfo->fp);
+  return jsonSuccess;
+}
+
+// Open a list: writes '[' as a value and pushes JSON_LIST_EMPTY.
+jsonResult_t jsonStartList(jsonFileOutput* jfo) {
+  const jsonResult_t status = jsonValHelper(jfo);
+  if (status == jsonSuccess) {
+    fputc('[', jfo->fp);
+    return jsonPushState(jfo, JSON_LIST_EMPTY);
+  }
+  return status;
+}
+
+// Close a list: pops the top state and writes ']' if that state was a list
+// state; otherwise returns jsonUnknownStateError (the state stays popped).
+jsonResult_t jsonFinishList(jsonFileOutput* jfo) {
+  const jsonState_t popped = jsonPopState(jfo);
+  if (popped != JSON_LIST_EMPTY && popped != JSON_LIST_SOME) {
+    return jsonUnknownStateError;
+  }
+  fputc(']', jfo->fp);
+  return jsonSuccess;
+}
+
+// Write the JSON literal null as a value.
+jsonResult_t jsonNull(jsonFileOutput* jfo) {
+  const jsonResult_t status = jsonValHelper(jfo);
+  if (status == jsonSuccess) {
+    fputs("null", jfo->fp);
+  }
+  return status;
+}
+
+// Write a (sanitized) string value, enclosed in double quotes.
+// A NULL 'str' is written as the JSON literal null.
+// If the string cannot be sanitized, the sanitizer's error is returned and
+// nothing is written and no state is changed.
+jsonResult_t jsonStr(jsonFileOutput* jfo, const char* str) {
+  if (str == NULL) {
+    // Propagate jsonNull's result rather than always reporting success.
+    return jsonNull(jfo);
+  }
+  // Sanitize before jsonValHelper: it may emit a ',' or consume the pending
+  // key, which must not happen for a string we end up rejecting.
+  unsigned char tmp[2048];
+  const jsonResult_t san_res = sanitizeJson(tmp, sizeof(tmp), (const unsigned char*)str);
+  if (san_res != jsonSuccess) {
+    return san_res;
+  }
+  const jsonResult_t res = jsonValHelper(jfo);
+  if (res != jsonSuccess) {
+    return res;
+  }
+  fprintf(jfo->fp, "\"%s\"", tmp);
+  return jsonSuccess;
+}
+
+// Write a bool as the quoted string "true" or "false" (it goes through
+// jsonStr, so it is not emitted as a bare JSON literal).
+jsonResult_t jsonBool(jsonFileOutput* jfo, bool val) {
+  const char* text = val ? "true" : "false";
+  return jsonStr(jfo, text);
+}
+
+// Write a signed int value in decimal.
+jsonResult_t jsonInt(jsonFileOutput* jfo, const int val) {
+  const jsonResult_t status = jsonValHelper(jfo);
+  if (status == jsonSuccess) {
+    fprintf(jfo->fp, "%d", val);
+  }
+  return status;
+}
+
+// Write an unsigned 32-bit integer value in decimal.
+jsonResult_t jsonUint32(jsonFileOutput* jfo, const uint32_t val) {
+  const jsonResult_t status = jsonValHelper(jfo);
+  if (status == jsonSuccess) {
+    fprintf(jfo->fp, "%u", val);
+  }
+  return status;
+}
+
+
+// Write an unsigned 64-bit integer value in decimal.
+jsonResult_t jsonUint64(jsonFileOutput* jfo, const uint64_t val) {
+  const jsonResult_t res = jsonValHelper(jfo);
+  if (res != jsonSuccess) {
+    return res;
+  }
+  // uint64_t is not 'unsigned long' on every platform, so "%lu" is not a
+  // portable conversion for it; widen explicitly and print with "%llu".
+  fprintf(jfo->fp, "%llu", (unsigned long long)val);
+  return jsonSuccess;
+}
+
+// Write a size_t value in decimal.
+jsonResult_t jsonSize_t(jsonFileOutput* jfo, const size_t val) {
+  const jsonResult_t status = jsonValHelper(jfo);
+  if (status == jsonSuccess) {
+    fprintf(jfo->fp, "%zu", val);
+  }
+  return status;
+}
+
+// Write a double value using "%lf" (six digits after the decimal point).
+// JSON has no representation for non-finite numbers, so NaN and the
+// infinities are written as the quoted strings "nan", "inf" and "-inf" to
+// keep the document parseable.
+jsonResult_t jsonDouble(jsonFileOutput* jfo, const double val) {
+  const jsonResult_t res = jsonValHelper(jfo);
+  if (res != jsonSuccess) {
+    return res;
+  }
+  if (val != val) {
+    // Only NaN compares unequal to itself.
+    fprintf(jfo->fp, "\"nan\"");
+  } else if (isinf(val)) {
+    // "%lf" would print a bare inf/-inf token, which is not valid JSON.
+    fputs(val > 0 ? "\"inf\"" : "\"-inf\"", jfo->fp);
+  } else {
+    fprintf(jfo->fp, "%lf", val);
+  }
+  return jsonSuccess;
+}
+
+#ifdef DO_JSON_TEST
+// Self-test: writes a small document to test.json through the public API,
+// reads it back and compares it with a reference string.
+// compile with
+// gcc json.cc -Iinclude/ -DDO_JSON_TEST -o json_test
+// run with:
+// ./json_test
+// if something fails, it will print out the error
+// if it all works, print out "output matches reference"
+#define JSONCHECK(expr)                                                                            \
+  do {                                                                                             \
+    const jsonResult_t res = (expr);                                                               \
+    if (res != jsonSuccess) {                                                                      \
+      fprintf(stderr, "jsonError: %s\n", jsonErrorString(res));                                    \
+      exit(1);                                                                                     \
+    }                                                                                              \
+  } while (0)
+
+int main() {
+
+  const char refstr[] =
+      "{\"number\":123,\"utfstring\":\"∮ E⋅da = Q,  n → ∞, ∑ f(i) = ∏ g(i), ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ "
+      "¬β = ¬(¬α ∨ β),\",\"list\":[\"true\",null,9423812381231,3123111,0.694234]}";
+
+  jsonFileOutput* jfo;
+  JSONCHECK(jsonInitFileOutput(&jfo, "test.json"));
+  JSONCHECK(jsonStartObject(jfo));
+  JSONCHECK(jsonKey(jfo, "number"));
+  JSONCHECK(jsonInt(jfo, 123));
+  JSONCHECK(jsonKey(jfo, "utfstring"));
+  JSONCHECK(
+      jsonStr(jfo, "∮ E⋅da = Q,  n → ∞, ∑ f(i) = ∏ g(i), ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β),"));
+  JSONCHECK(jsonKey(jfo, "list"));
+  JSONCHECK(jsonStartList(jfo));
+  JSONCHECK(jsonBool(jfo, true));
+  JSONCHECK(jsonNull(jfo));
+  JSONCHECK(jsonUint64(jfo, 9423812381231ULL));
+  JSONCHECK(jsonSize_t(jfo, 3123111));
+  JSONCHECK(jsonDouble(jfo, 0.69423413));
+  JSONCHECK(jsonFinishList(jfo));
+  JSONCHECK(jsonFinishObject(jfo));
+  JSONCHECK(jsonFinalizeFileOutput(jfo));
+
+  FILE* fp = fopen("test.json", "r");
+  if (fp == NULL) {
+    fprintf(stderr, "could not reopen test.json\n");
+    return 1;
+  }
+
+  // Expected payload length. refstr's terminating NUL is excluded because
+  // the file on disk does not contain one.
+  const size_t reflen = sizeof(refstr) - 1;
+
+  // Zero-filled and two bytes larger than the payload: reading up to
+  // reflen + 1 bytes exposes over-long output, and the last byte always
+  // stays NUL so the buffer is safe to print with %s.
+  char buffer[sizeof(refstr) + 1] = {0};
+
+  const size_t nread = fread(buffer, sizeof(char), sizeof(buffer) - 1, fp);
+
+  fclose(fp);
+
+  if (nread == reflen && memcmp(buffer, refstr, reflen) == 0) {
+    printf("output matches reference\n");
+  } else {
+    printf("output    %s\nreference %s\n", buffer, refstr);
+    return 1;
+  }
+
+  return 0;
+}
+
+#endif
@@ -0,0 +1,83 @@
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stddef.h>
+
+// Writer position states, kept on a stack inside jsonFileOutput.
+typedef enum {
+  JSON_NONE, // A pseudo-state meaning that the document is empty
+  JSON_KEY,          // A key was just written; the next value consumes it
+  JSON_OBJECT_EMPTY, // Inside an object that has no keys yet
+  JSON_OBJECT_SOME,  // Inside an object with at least one key (next key needs ',')
+  JSON_LIST_EMPTY,   // Inside a list that has no items yet
+  JSON_LIST_SOME,    // Inside a list with at least one item (next item needs ',')
+} jsonState_t;
+
+// Result codes returned by the JSON writer functions; jsonErrorString maps
+// each one to its name.
+typedef enum {
+  jsonSuccess,
+  jsonFileError,                 // The output file could not be opened
+  jsonUnknownStateError,         // Operation not valid in the current writer state
+  jsonEmptyStateError,           // State replacement attempted on an empty stack
+  jsonExpectedNonNoneStateError, // JSON_NONE passed where a real state is required
+  jsonStringOverflowError,       // Sanitized string does not fit the internal buffer
+  jsonStringBadChar,             // String contains an unsupported character/byte sequence
+  jsonMemoryError,               // Memory allocation failed
+  jsonLockError,                 // A pthread mutex operation failed
+} jsonResult_t;
+
+const char *jsonErrorString(jsonResult_t res);
+
+typedef struct jsonFileOutput jsonFileOutput;
+
+jsonResult_t jsonLockOutput(jsonFileOutput *jfo);
+
+jsonResult_t jsonUnlockOutput(jsonFileOutput *jfo);
+
+jsonResult_t jsonInitFileOutput(jsonFileOutput **jfo,
+                                const char *outfile);
+
+jsonResult_t jsonFinalizeFileOutput(jsonFileOutput *jfo);
+
+jsonResult_t jsonNewline(jsonFileOutput *jfo);
+jsonResult_t jsonFlushOutput(jsonFileOutput *jfo);
+
+// Emit a key and separator. Sanitize the key.
+// This is only acceptable if the top state is an object.
+// Emit a ',' separator if we aren't the first item.
+jsonResult_t jsonKey(jsonFileOutput *jfo, const char *name);
+
+// Start an object
+jsonResult_t jsonStartObject(jsonFileOutput *jfo);
+
+// Close an object
+jsonResult_t jsonFinishObject(jsonFileOutput *jfo);
+
+// Start a list
+jsonResult_t jsonStartList(jsonFileOutput *jfo);
+
+// Close a list
+jsonResult_t jsonFinishList(jsonFileOutput *jfo);
+
+// Emit a null value
+jsonResult_t jsonNull(jsonFileOutput *jfo);
+
+// Write a (sanitized) string
+jsonResult_t jsonStr(jsonFileOutput *jfo, const char *str);
+
+// Write a bool as "true" or "false" strings.
+jsonResult_t jsonBool(jsonFileOutput *jfo, bool val);
+
+// Write an integer value
+jsonResult_t jsonInt(jsonFileOutput *jfo, const int val);
+
+// Write an unsigned 32-bit integer value
+jsonResult_t jsonUint32(jsonFileOutput *jfo, const uint32_t val);
+
+// Write an unsigned 64-bit integer value
+jsonResult_t jsonUint64(jsonFileOutput *jfo, const uint64_t val);
+
+// Write a size_t value
+jsonResult_t jsonSize_t(jsonFileOutput *jfo, const size_t val);
+
+// Write a double value
+jsonResult_t jsonDouble(jsonFileOutput *jfo, const double val);
@@ -0,0 +1,73 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef COMMON_H_
+#define COMMON_H_
+
+/* typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; */
+/* typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; */
+
+/* Data types */
+/* Element type identifiers. Several names are aliases that share one value
+ * (e.g. ncclInt8/ncclChar, ncclFloat32/ncclFloat); ncclNumTypes is the number
+ * of distinct values (0..11).
+ * NOTE(review): presumably a local copy of the NCCL public ncclDataType_t —
+ * confirm the values stay in sync with the library. */
+typedef enum { ncclInt8       = 0, ncclChar       = 0,
+               ncclUint8      = 1,
+               ncclInt32      = 2, ncclInt        = 2,
+               ncclUint32     = 3,
+               ncclInt64      = 4,
+               ncclUint64     = 5,
+               ncclFloat16    = 6, ncclHalf       = 6,
+               ncclFloat32    = 7, ncclFloat      = 7,
+               ncclFloat64    = 8, ncclDouble     = 8,
+               ncclBfloat16   = 9,
+               ncclFloat8e4m3 = 10,
+               ncclFloat8e5m2 = 11,
+               ncclNumTypes   = 12
+} ncclDataType_t;
+
+/* Log levels; this is the type of the 'level' argument of ncclDebugLogger_t. */
+typedef enum {
+  NCCL_LOG_NONE = 0,
+  NCCL_LOG_VERSION = 1,
+  NCCL_LOG_WARN = 2,
+  NCCL_LOG_INFO = 3,
+  NCCL_LOG_ABORT = 4,
+  NCCL_LOG_TRACE = 5
+} ncclDebugLogLevel;
+
+/* Result codes returned by the plugin entry points. ncclNumResults is the
+ * number of codes defined before it, not a result itself.
+ * NOTE(review): presumably a local copy of the NCCL public ncclResult_t —
+ * confirm the values stay in sync with the library. */
+typedef enum { ncclSuccess                 =  0,
+               ncclUnhandledCudaError      =  1,
+               ncclSystemError             =  2,
+               ncclInternalError           =  3,
+               ncclInvalidArgument         =  4,
+               ncclInvalidUsage            =  5,
+               ncclRemoteError             =  6,
+               ncclInProgress              =  7,
+               ncclNumResults              =  8 } ncclResult_t;
+
+
+/* Logging subsystem flags. Every named subsystem is a distinct single bit, so
+ * values can be OR-ed together; NCCL_ALL (~0) has every bit set. */
+typedef enum {
+  NCCL_INIT = 0x1,
+  NCCL_COLL = 0x2,
+  NCCL_P2P = 0x4,
+  NCCL_SHM = 0x8,
+  NCCL_NET = 0x10,
+  NCCL_GRAPH = 0x20,
+  NCCL_TUNING = 0x40,
+  NCCL_ENV = 0x80,
+  NCCL_ALLOC = 0x100,
+  NCCL_CALL = 0x200,
+  NCCL_PROXY = 0x400,
+  NCCL_NVLS = 0x800,
+  NCCL_BOOTSTRAP = 0x1000,
+  NCCL_REG = 0x2000,
+  NCCL_PROFILE = 0x4000,
+  NCCL_RAS = 0x8000,
+  NCCL_INSPECTOR = 0x100000, // big number to avoid short-term conflicts
+  NCCL_ALL = ~0
+} ncclDebugLogSubSys;
+
+
+/* Pointer to a printf-style logging function: log level, flags word, source
+ * file and line of the call site, then a format string with its variadic
+ * arguments.
+ * NOTE(review): 'flags' presumably carries ncclDebugLogSubSys bits — confirm. */
+typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
+
+#endif
@@ -0,0 +1,85 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_H_
+#define PROFILER_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "common.h"
+
+// Profiler event types. Each type is a distinct single bit, so several types
+// can be combined into one mask with bitwise OR.
+enum {
+  ncclProfileGroup          = (1 << 0),  // group event type
+  ncclProfileColl           = (1 << 1),  // host collective call event type
+  ncclProfileP2p            = (1 << 2),  // host point-to-point call event type
+  ncclProfileProxyOp        = (1 << 3),  // proxy operation event type
+  ncclProfileProxyStep      = (1 << 4),  // proxy step event type
+  ncclProfileProxyCtrl      = (1 << 5),  // proxy control event type
+  ncclProfileKernelCh       = (1 << 6),  // kernel channel event type
+  ncclProfileNetPlugin      = (1 << 7),  // network plugin-defined, events
+  ncclProfileGroupApi       = (1 << 8),  // Group API events
+  ncclProfileCollApi        = (1 << 9),  // Collective API events
+  ncclProfileP2pApi         = (1 << 10), // Point-to-Point API events
+  ncclProfileKernelLaunch   = (1 << 11), // Kernel launch events
+};
+
+// Profiler event states. Enumerators are grouped by category rather than by
+// numeric order (e.g. 19 and 20 are declared among lower values); every
+// value is assigned explicitly.
+typedef enum {
+  ncclProfilerProxyOpSendPosted        = 0,  // deprecated in v4
+  ncclProfilerProxyOpSendRemFifoWait   = 1,  // deprecated in v4
+  ncclProfilerProxyOpSendTransmitted   = 2,  // deprecated in v4
+  ncclProfilerProxyOpSendDone          = 3,  // deprecated in v4
+  ncclProfilerProxyOpRecvPosted        = 4,  // deprecated in v4
+  ncclProfilerProxyOpRecvReceived      = 5,  // deprecated in v4
+  ncclProfilerProxyOpRecvTransmitted   = 6,  // deprecated in v4
+  ncclProfilerProxyOpRecvDone          = 7,  // deprecated in v4
+  ncclProfilerProxyOpInProgress_v4     = 19,
+
+  /* Legacy proxy profiler states */
+  ncclProfilerProxyStepSendGPUWait     = 8,
+  ncclProfilerProxyStepSendPeerWait_v4 = 20,
+  ncclProfilerProxyStepSendWait        = 9,
+  ncclProfilerProxyStepRecvWait        = 10,
+  ncclProfilerProxyStepRecvFlushWait   = 11,
+  ncclProfilerProxyStepRecvGPUWait     = 12,
+
+  /* Legacy proxy control states */
+  ncclProfilerProxyCtrlIdle            = 13,
+  ncclProfilerProxyCtrlActive          = 14,
+  ncclProfilerProxyCtrlSleep           = 15,
+  ncclProfilerProxyCtrlWakeup          = 16,
+  ncclProfilerProxyCtrlAppend          = 17,
+  ncclProfilerProxyCtrlAppendEnd       = 18,
+
+  /* Network defined events states */
+  ncclProfilerNetPluginUpdate          = 21,
+
+  /* Kernel event states */
+  ncclProfilerKernelChStop             = 22,
+
+  /* Group API States */
+  ncclProfilerEndGroupApiStart         = 23,
+  ncclProfilerBeginGroupApiEnd         = 24
+} ncclProfilerEventState_t;
+
+// The versioned state type names v1..v5 are all aliases of the single
+// ncclProfilerEventState_t enum above.
+typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
+typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
+typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
+typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t;
+typedef ncclProfilerEventState_t ncclProfilerEventState_v5_t;
+
+#include "profiler_v5.h"
+#include "profiler_v4.h"
+#include "profiler_v3.h"
+#include "profiler_v2.h"
+#include "profiler_v1.h"
+#include "profiler_net.h"
+
+// The unversioned type names resolve to the v5 definitions.
+typedef ncclProfiler_v5_t ncclProfiler_t;
+typedef ncclProfilerEventDescr_v5_t ncclProfilerEventDescr_t;
+typedef ncclProfilerEventStateArgs_v5_t ncclProfilerEventStateArgs_t;
+
+#endif // end include guard
@@ -0,0 +1,19 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_NET_H_
+#define PROFILER_NET_H_
+
+#define NCCL_PROFILER_NET_VER_BITS  (16)  // width, in bits, of the low field of the combined value
+#define NCCL_PROFILER_NET_VER_MASK  (~0U >> NCCL_PROFILER_NET_VER_BITS)  // low bits: 0x0000FFFF when unsigned int is 32 bits
+#define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS)  // remaining high bits: 0xFFFF0000 when unsigned int is 32 bits
+
+typedef enum {  // network type tags; each value sits above the low NCCL_PROFILER_NET_VER_BITS bits
+  NCCL_PROFILER_NET_TYPE_IB   = (1U << NCCL_PROFILER_NET_VER_BITS),  // 0x10000
+  NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS),  // 0x20000
+} ncclProfilerNetType;
+
+#endif
@@ -0,0 +1,112 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V1_H_
+#define PROFILER_V1_H_
+
+#include <stdint.h>
+#include <stddef.h>
+#include <sys/types.h>
+
+
+typedef struct {  // v1 event descriptor handed to ncclProfiler_v1_t::startEvent
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
+  int rank;                     // originating rank
+  union {  // anonymous union: presumably the member in use is selected by `type` - TODO confirm against the caller
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      uint8_t func;             // numeric code in v1 (v2 carries this as a string)
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      uint8_t datatype;         // numeric code in v1 (v2 carries this as a string)
+      uint32_t op;
+      size_t trafficBytes;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      uint8_t algo;             // numeric code in v1 (v2 carries this as a string)
+      uint8_t proto;            // numeric code in v1 (v2 carries this as a string)
+      int isCollnet;
+      int isNvls;
+    } coll;
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint8_t func;             // numeric code in v1 (v2 carries this as a string)
+      void* buff;
+      uint8_t datatype;         // numeric code in v1 (v2 carries this as a string)
+      size_t count;
+      int peer;
+    } p2p;
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+  };
+} ncclProfilerEventDescr_v1_t;
+
+typedef union {  // optional argument of ncclProfiler_v1_t::recordEventState
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v1_t;
+
+typedef struct {  // v1 profiler plugin interface: a name plus five callbacks
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_v1_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eState    : event state transition
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v1_t;
+
+#endif
@@ -0,0 +1,108 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V2_H_
+#define PROFILER_V2_H_
+
+#include <stdint.h>
+#include <stddef.h>
+#include <sys/types.h>
+
+typedef struct {  // v2 event descriptor handed to ncclProfiler_v2_t::startEvent
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
+  int rank;                     // originating rank
+  union {  // anonymous union: presumably the member in use is selected by `type` - TODO confirm against the caller
+    struct {                    // unlike v1, this struct has no op, isCollnet or isNvls fields
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      const char* func;         // string here; v1 used a uint8_t
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;     // string here; v1 used a uint8_t
+      size_t trafficBytes;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      const char* algo;         // string here; v1 used a uint8_t
+      const char* proto;        // string here; v1 used a uint8_t
+    } coll;
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      const char* func;         // string here; v1 used a uint8_t
+      void* buff;
+      const char* datatype;     // string here; v1 used a uint8_t
+      size_t count;
+      int peer;
+    } p2p;
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+  };
+} ncclProfilerEventDescr_v2_t;
+
+typedef union {  // optional argument of ncclProfiler_v2_t::recordEventState; same members as the v1 union
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v2_t;
+
+typedef struct {  // v2 profiler plugin interface: a name plus five callbacks
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_v2_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eState    : event state transition
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v2_t;
+
+#endif
@@ -0,0 +1,116 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V3_H_
+#define PROFILER_V3_H_
+
+#include <stdint.h>
+#include <stddef.h>
+#include <sys/types.h>
+
+typedef struct {  // v3 event descriptor handed to ncclProfiler_v3_t::startEvent
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
+  int rank;                     // originating rank
+  union {  // anonymous union: presumably the member in use is selected by `type` - TODO confirm against the caller
+    struct {                    // unlike v2, this struct has no trafficBytes field
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+    } coll;
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+    } p2p;
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+
+    struct {                    // member not present in the v2 descriptor
+      uint8_t channelId;
+    } kernelCh;
+
+    struct {                    // member not present in the v2 descriptor
+      int64_t id;
+      void* data;
+    } netPlugin;
+  };
+} ncclProfilerEventDescr_v3_t;
+
+typedef union {  // optional argument of ncclProfiler_v3_t::recordEventState; same members as the v1/v2 unions
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v3_t;
+
+typedef struct {  // v3 profiler plugin interface: a name plus five callbacks
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_v3_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eState    : event state transition
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v3_t;
+
+#endif
@@ -0,0 +1,127 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V4_H_
+#define PROFILER_V4_H_
+
+#include <stdint.h>
+#include <stddef.h>
+#include <sys/types.h>
+
+typedef struct {  // v4 event descriptor handed to ncclProfiler_v4_t::startEvent
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
+  int rank;                     // originating rank
+  union {  // anonymous union: presumably the member in use is selected by `type` - TODO confirm against the caller
+    struct {                    // no name/commHash here (v3 had them); v4 init() takes commName/commHash instead
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      uint8_t nChannels;        // v3 named the field in this position nMaxChannels
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+    } coll;
+
+    struct {                    // no name/commHash here (v3 had them)
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+      uint8_t nChannels;        // field not present in the v3 p2p struct
+    } p2p;
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+
+    struct {
+      uint8_t channelId;
+      uint64_t pTimer;          // start timestamp from GPU globaltimer
+    } kernelCh;
+
+    struct {
+      int64_t id;
+      void* data;
+    } netPlugin;
+  };
+} ncclProfilerEventDescr_v4_t;
+
+typedef union {  // optional argument of ncclProfiler_v4_t::recordEventState
+  struct {       // v1-v3 instead had a proxyOp member carrying transSize and steps
+    size_t transSize;
+  } proxyStep;
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+
+  struct {       // member not present in the v1-v3 unions
+    void* data;
+  } netPlugin;
+
+  struct {       // member not present in the v1-v3 unions
+    uint64_t pTimer;
+  } kernelCh;
+} ncclProfilerEventStateArgs_v4_t;
+
+typedef struct {  // v4 profiler plugin interface: a name plus five callbacks
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  //  - commName       : user assigned communicator name
+  //  - commHash       : communicator id
+  //  - nNodes         : number of nodes in communicator
+  //  - nranks         : number of ranks in communicator
+  //  - rank           : rank identifier in communicator
+  //  - logfn          : logger function
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_v4_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eState    : event state transition
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v4_t;
+
+#endif
@@ -0,0 +1,151 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V5_H_
+#define PROFILER_V5_H_
+
+typedef struct {  // v5 event descriptor handed to ncclProfiler_v5_t::startEvent
+  uint64_t type;                // event type descriptor: ncclProfileColl, ... (uint64_t here; v1-v4 use uint8_t)
+  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
+  int rank;                     // originating rank
+  union {  // anonymous union: presumably the member in use is selected by `type` - TODO confirm against the caller
+    struct {                    // member not present in the v4 descriptor
+      bool graphCaptured;       // NOTE(review): this header has no #includes; bool/size_t/pid_t/uint64_t must come from the includer
+      int groupDepth;
+    } groupApi;
+
+    struct {                    // member not present in the v4 descriptor
+      const char* func;
+      size_t count;
+      const char* datatype;
+      int root;
+      void* stream;
+      bool graphCaptured;
+    } collApi;
+
+    struct {                    // member not present in the v4 descriptor
+      const char* func;
+      size_t count;
+      const char* datatype;
+      void* stream;
+      bool graphCaptured;
+    } p2pApi;
+
+    struct {                    // member not present in the v4 descriptor
+      void* stream;
+    } kernelLaunch;
+
+    struct {
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      uint8_t nChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+      void* parentGroup; // for backward compatibility with v4
+    } coll;
+
+    struct {
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+      uint8_t nChannels;
+      void* parentGroup; // for backward compatibility with v4
+    } p2p;
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+
+    struct {
+      uint8_t channelId;
+      uint64_t pTimer;          // start timestamp from GPU globaltimer
+    } kernelCh;
+
+    struct {
+      int64_t id;
+      void* data;
+    } netPlugin;
+  };
+} ncclProfilerEventDescr_v5_t;
+
+typedef union {  // optional argument of ncclProfiler_v5_t::recordEventState; same members as the v4 union
+  struct {
+    size_t transSize;
+  } proxyStep;
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+
+  struct {
+    void* data;
+  } netPlugin;
+
+  struct {
+    uint64_t pTimer;
+  } kernelCh;
+} ncclProfilerEventStateArgs_v5_t;
+
+typedef struct {  // v5 profiler plugin interface: a name plus five callbacks
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  //  - commId         : communicator id (second argument here; v4 passed commHash after commName)
+  //  - commName       : user assigned communicator name
+  //  - nNodes         : number of nodes in communicator
+  //  - nranks         : number of ranks in communicator
+  //  - rank           : rank identifier in communicator
+  //  - logfn          : logger function
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_v5_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v5_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eState    : event state transition
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v5_t eState, ncclProfilerEventStateArgs_v5_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v5_t;
+
+#endif
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NCCL_TYPES_H_
+#define NCCL_TYPES_H_
+
+/* Data types */
+typedef enum { ncclInt8       = 0, ncclChar       = 0,   // two names on one line share the same value
+               ncclUint8      = 1,
+               ncclInt32      = 2, ncclInt        = 2,
+               ncclUint32     = 3,
+               ncclInt64      = 4,
+               ncclUint64     = 5,
+               ncclFloat16    = 6, ncclHalf       = 6,
+               ncclFloat32    = 7, ncclFloat      = 7,
+               ncclFloat64    = 8, ncclDouble     = 8,
+               ncclBfloat16   = 9,
+} ncclDataType_t;
+
+#endif
@@ -0,0 +1,12 @@
+#ifndef VERSION_H
+#define VERSION_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+const char* get_git_version_info(void);  // (void) gives C callers a real prototype; presumably returns a git version string - TODO confirm at the definition
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif // VERSION_H
@@ -1,8 +1,8 @@
 diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc
-index 7af56a6c..5c3e3d46 100644
+index 9bfd8dcf..4d3f0a08 100644
 --- a/src/transport/net_ib.cc
 +++ b/src/transport/net_ib.cc
-@@ -28,6 +28,7 @@
+@@ -29,6 +29,7 @@
 
 #include "ibvwrap.h"
 #include "mlx5/mlx5dvwrap.h"
@@ -10,9 +10,9 @@ index 7af56a6c..5c3e3d46 100644
 #include "graph/xml.h"
 
 #define MAXSUFFIXSIZE 16
-@@ -107,9 +108,31 @@ struct ncclIbMergedDev ncclIbMergedDevs[MAX_IB_VDEVS];
+@@ -110,16 +111,38 @@ struct ncclIbMergedDev ncclIbMergedDevs[MAX_IB_VDEVS];
 struct ncclIbDev ncclIbDevs[MAX_IB_DEVS];
- pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER;
+ static std::mutex ncclIbMutex;
 static int ncclIbRelaxedOrderingEnabled = 0;
 +static bool rcclAinicRoce = 0;
 +static bool rcclCtsInlineData = 0;
@@ -35,6 +35,13 @@ index 7af56a6c..5c3e3d46 100644
 +static ncclChannelToUd nccl_channel_ud_map[MAXCHANNELS][ncclIbChannelTypeMax];
 +static bool nccl_channel_last_ud[MAX_IB_DEVS][ncclIbChannelTypeMax];
 
+ // With ncclNet_v11_t the NCCL core initializes the network plugin per-communicator
+ // rather than once for all communicators. However, the internal plugin implementation
+ // still assumes the plugin is initialized only once across all communicators. The ref
+ // counter makes sure the plugin internally initializes only once. When per communicator
+ // context support is added to the plugin the ref counter can be removed.
+ static int netRefCount;
+ 
 #define NCCL_IB_LLSTR(ll) (((ll) == IBV_LINK_LAYER_INFINIBAND) ? "IB" : (((ll) == IBV_LINK_LAYER_ETHERNET) ? "RoCE" : "UNSPECIFIED"))
 
 +#define NCCL_CTS_QP_SLOT_INVALID 0xFF
@@ -42,7 +49,7 @@ index 7af56a6c..5c3e3d46 100644
 #define NCCL_IB_SL_DEFAULT 0
 #define NCCL_IB_TC_DEFAULT 0
 
-@@ -131,6 +154,13 @@ NCCL_PARAM(IbEceEnable,"IB_ECE_ENABLE",1);
+@@ -141,6 +164,13 @@ NCCL_PARAM(IbEceEnable,"IB_ECE_ENABLE",1);
 NCCL_PARAM(IbDataDirect,"IB_DATA_DIRECT",1);
 NCCL_PARAM(IbQpsPerConn, "IB_QPS_PER_CONNECTION", 1);
 RCCL_PARAM(IbQpsPerP2p, "IB_QPS_PER_P2P", 0);
@@ -56,7 +63,7 @@ index 7af56a6c..5c3e3d46 100644
 
 static ncclResult_t ncclIbStatsInit(struct ncclIbStats* stat) {
   __atomic_store_n(&stat->fatalErrorCount, 0, __ATOMIC_RELAXED);
-@@ -630,6 +660,10 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr
+@@ -779,6 +809,10 @@ ncclResult_t ncclIbInit(void** ctx, uint64_t commId, ncclNetCommConfig_t* config
   static int shownIbHcaEnv = 0;
   if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
   if(wrap_mlx5dv_symbols() != ncclSuccess) { INFO(NCCL_NET, "NET/IB : Failed to open mlx5dv symbols. Advance features like CX-8 Direct-NIC will be disabled."); }
@@ -67,7 +74,7 @@ index 7af56a6c..5c3e3d46 100644
 
   // Detect IB cards
   int nIbDevs = 0;
-@@ -783,6 +817,24 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr
+@@ -944,6 +978,23 @@ ncclResult_t ncclIbInit(void** ctx, uint64_t commId, ncclNetCommConfig_t* config
     INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? "[RO]" : "",
           ncclIbIfName, ncclSocketToString(&ncclIbIfAddr, addrline));
 
@@ -88,11 +95,10 @@ index 7af56a6c..5c3e3d46 100644
 +           "IB Use Inline: enabled; GDR Flush: disabled", rcclCtsInlineData ? "Enabled": "Disabled",
 +           rcclCtsOffloadEnabled ? "Enabled": "Disabled");
 +    }
-+
-     pthread_mutex_unlock(&ncclIbLock);
   }
 exit:
-@@ -1112,6 +1164,8 @@ struct ncclIbListenComm {
+   ibContext.trafficClass = config->trafficClass;
+@@ -1271,6 +1322,8 @@ struct ncclIbListenComm {
   struct ncclIbCommStage stage;
 };
 
@@ -101,7 +107,7 @@ index 7af56a6c..5c3e3d46 100644
 struct alignas(64) ncclIbSendFifo {
   uint64_t addr;
   uint64_t size;
-@@ -1122,10 +1176,21 @@ struct alignas(64) ncclIbSendFifo {
+@@ -1281,10 +1334,21 @@ struct alignas(64) ncclIbSendFifo {
   char padding[16];
 };
 
@@ -123,7 +129,7 @@ index 7af56a6c..5c3e3d46 100644
 };
 
 struct ncclIbRemSizesFifo {
-@@ -1172,6 +1237,7 @@ struct ncclIbSendComm {
+@@ -1331,6 +1395,7 @@ struct ncclIbSendComm {
   struct ncclIbNetCommBase base;
   // Start with fifo and ibv structs as they have alignment restrictions
   struct ncclIbSendFifo fifo[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
@@ -131,7 +137,7 @@ index 7af56a6c..5c3e3d46 100644
   struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS];
   struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS + 1];
   // Each dev correlates to a mergedIbDev
-@@ -1187,6 +1253,7 @@ struct ncclIbSendComm {
+@@ -1346,6 +1411,7 @@ struct ncclIbSendComm {
 static_assert((sizeof(struct ncclIbNetCommBase) % 32) == 0, "ncclIbNetCommBase size must be 32-byte multiple to ensure fifo is at proper offset");
 static_assert((offsetof(struct ncclIbSendComm, fifo) % 32) == 0, "ncclIbSendComm fifo must be 32-byte aligned");
 static_assert((sizeof(struct ncclIbSendFifo) % 32) == 0, "ncclIbSendFifo element size must be 32-byte multiples");
@@ -139,7 +145,7 @@ index 7af56a6c..5c3e3d46 100644
 static_assert((offsetof(struct ncclIbSendComm, sges) % 32) == 0, "sges must be 32-byte aligned");
 static_assert((offsetof(struct ncclIbSendComm, wrs) % 32) == 0, "wrs must be 32-byte aligned");
 
-@@ -1201,6 +1268,7 @@ struct ncclIbGpuFlush {
+@@ -1360,6 +1426,7 @@ struct ncclIbGpuFlush {
 
 struct ncclIbRemFifo {
   struct ncclIbSendFifo elems[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
@@ -147,8 +153,8 @@ index 7af56a6c..5c3e3d46 100644
   uint64_t fifoTail;
   uint64_t addr;
   uint32_t flags;
-@@ -1265,20 +1333,59 @@ returning:
-   return res;
+@@ -1415,20 +1482,59 @@ ncclResult_t ncclIbDestroyBase(struct ncclIbNetCommDevBase* base) {
+   return ncclSuccess;
 }
 
 -ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, int access_flags, void* qp_context, struct ncclIbQp* qp) {
@@ -209,7 +215,7 @@ index 7af56a6c..5c3e3d46 100644
   struct ibv_qp_attr qpAttr;
   memset(&qpAttr, 0, sizeof(struct ibv_qp_attr));
   qpAttr.qp_state = IBV_QPS_INIT;
-@@ -1288,6 +1395,9 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base,
+@@ -1438,6 +1544,9 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base,
   NCCLCHECK(wrap_ibv_modify_qp(qp->qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS));
   TRACE(NCCL_NET, "NET/IB : ncclIbCreateQp port=%d dev=%d devName=%s ndevs=%d nmdevs=%d qpn=%u pkey=%u pd=%p",
     ib_port, base->ibDevN, ncclIbDevs[base->ibDevN].devName, ncclNIbDevs, ncclNMergedIbDevs, qp->qp->qp_num, qpAttr.pkey_index, base->pd);
@@ -219,16 +225,16 @@ index 7af56a6c..5c3e3d46 100644
   return ncclSuccess;
 }
 
-@@ -1371,7 +1481,7 @@ fail:
+@@ -1521,7 +1630,7 @@ fail:
   goto exit;
 }
 
-ncclResult_t ncclIbConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
-+ncclResult_t ncclIbConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
+-ncclResult_t ncclIbConnect(void* ctx, int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
+ncclResult_t ncclIbConnect(void* ctx, int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
   ncclResult_t ret = ncclSuccess;
   struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
   struct ncclIbCommStage* stage = &handle->stage;
-@@ -1379,8 +1489,13 @@ ncclResult_t ncclIbConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHan
+@@ -1529,8 +1638,13 @@ ncclResult_t ncclIbConnect(void* ctx, int dev, void* opaqueHandle, void** sendCo
   int ready;
   uint8_t link_layer = IBV_LINK_LAYER_UNSPECIFIED;
   int isP2p = 0; 
@@ -242,7 +248,7 @@ index 7af56a6c..5c3e3d46 100644
   if (stage->state == ncclIbCommStateConnect)      goto ib_connect_check;
   if (stage->state == ncclIbCommStateSendDevList)  goto ib_send_dev_list;
   if (stage->state == ncclIbCommStateRecvDevList)  goto ib_recv_dev_list;
-@@ -1461,7 +1576,7 @@ ib_recv_dev_list:
+@@ -1612,7 +1726,7 @@ ib_recv_dev_list:
   for (int q = 0; q < comm->base.nqps; q++) {
     ncclIbSendCommDev* commDev = comm->devs + devIndex;
     ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN;
@@ -251,7 +257,7 @@ index 7af56a6c..5c3e3d46 100644
     comm->base.qps[q].devIndex = devIndex;
     meta.qpInfo[q].qpn      = comm->base.qps[q].qp->qp_num;
     meta.qpInfo[q].devIndex = comm->base.qps[q].devIndex;
-@@ -1486,7 +1601,11 @@ ib_recv_dev_list:
+@@ -1637,7 +1751,11 @@ ib_recv_dev_list:
     devInfo->lid           = ibDev->portAttr.lid;
     devInfo->ibv_dev_index = commDev->base.ibDevN;
     // Prepare my fifo
@@ -264,10 +270,10 @@ index 7af56a6c..5c3e3d46 100644
     devInfo->fifoRkey = commDev->fifoMr->rkey;
 
     // Pack local GID info
-@@ -1528,7 +1647,11 @@ ib_recv_dev_list:
-       return ncclInternalError;
+@@ -1680,7 +1798,11 @@ ib_recv_dev_list:
     }
   }
+   config = (ncclNetCommConfig_t*)ctx;
 -  meta.fifoAddr = (uint64_t)comm->fifo;
 +  if (rcclCtsInlineData) {
 +    meta.fifoAddr = (uint64_t)comm->fifo_inline;
@@ -277,7 +283,7 @@ index 7af56a6c..5c3e3d46 100644
   meta.sl = (ncclParamIbSl() != -1) ? ncclParamIbSl() : (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF) ? config->trafficClass : NCCL_IB_SL_DEFAULT;
   meta.tc = (ncclParamIbTc() != -1) ? ncclParamIbTc() : (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF) ? config->trafficClass : NCCL_IB_TC_DEFAULT;
   strncpy(meta.devName, mergedDev->devName, MAX_MERGED_DEV_NAME);
-@@ -1673,18 +1796,22 @@ ncclResult_t ncclIbCheckVProps(ncclNetVDeviceProps_t* vProps1, ncclNetVDevicePro
+@@ -1825,18 +1947,22 @@ ncclResult_t ncclIbCheckVProps(ncclNetVDeviceProps_t* vProps1, ncclNetVDevicePro
   return ncclSuccess;
 }
 
@@ -302,7 +308,7 @@ index 7af56a6c..5c3e3d46 100644
   if (stage->state == ncclIbCommStateAccept)   goto ib_accept_check;
   if (stage->state == ncclIbCommStateRecvDevList) goto ib_recv_dev_list;
   if (stage->state == ncclIbCommStateSendDevList) goto ib_send_dev_list;
-@@ -1814,7 +1941,7 @@ ib_recv:
+@@ -1966,7 +2092,7 @@ ib_recv:
     // Local ibDevN
     ibDevN = rComm->devs[devIndex].base.ibDevN;
     ibDev = ncclIbDevs + ibDevN;
@@ -311,7 +317,7 @@ index 7af56a6c..5c3e3d46 100644
     qp->devIndex = devIndex;
     devIndex = (devIndex + 1) % rComm->base.vProps.ndevs;
 
-@@ -1840,16 +1967,22 @@ ib_recv:
+@@ -1992,16 +2118,22 @@ ib_recv:
 
   useDmaBuf  = (ncclIbDmaBufSupport(lComm->dev) == ncclSuccess);
   rComm->flushEnabled = ((ncclIbGdrSupport() == ncclSuccess || useDmaBuf)
@@ -337,7 +343,7 @@ index 7af56a6c..5c3e3d46 100644
 
     // Allocate Flush dummy buffer for GPU Direct RDMA
     if (rComm->flushEnabled) {
-@@ -1887,7 +2020,7 @@ ib_recv:
+@@ -2039,7 +2171,7 @@ ib_recv:
       rCommDev->gpuFlush.sge.addr = (uint64_t)&rComm->gpuFlushHostMem;
       rCommDev->gpuFlush.sge.length = 1;
       rCommDev->gpuFlush.sge.lkey = rCommDev->gpuFlush.hostMr->lkey;
@@ -346,7 +352,7 @@ index 7af56a6c..5c3e3d46 100644
       struct ncclIbDevInfo devInfo;
       devInfo.lid         = ibDev->portAttr.lid;
       devInfo.link_layer  = ibDev->portAttr.link_layer;
-@@ -2115,10 +2248,15 @@ ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) {
+@@ -2257,10 +2389,15 @@ ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) {
 
 NCCL_PARAM(IbSplitDataOnQps, "IB_SPLIT_DATA_ON_QPS", 0);
 
@@ -364,7 +370,7 @@ index 7af56a6c..5c3e3d46 100644
   if (nreqs > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
 
   uint64_t wr_id = 0ULL;
-@@ -2130,7 +2268,11 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
+@@ -2272,7 +2409,11 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
     sge->addr=(uintptr_t)reqs[r]->send.data;
     wr->opcode = IBV_WR_RDMA_WRITE;
     wr->send_flags = 0;
@@ -377,7 +383,7 @@ index 7af56a6c..5c3e3d46 100644
     wr->next = wr + 1;
     wr_id += (reqs[r] - comm->base.reqs) << (r*8);
 #ifdef NCCL_ENABLE_NET_PROFILING
-@@ -2141,7 +2283,7 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
+@@ -2283,7 +2424,7 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
   // Write size as immediate data. In the case of multi-send, only write
   // 0 or 1 as size to indicate whether there was data sent or received.
   uint32_t immData = 0;
@@ -386,7 +392,7 @@ index 7af56a6c..5c3e3d46 100644
     immData = reqs[0]->send.size;
   } else {
     int* sizes = comm->remSizesFifo.elems[slot];
-@@ -2151,22 +2293,24 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
+@@ -2293,22 +2434,24 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
   }
 
   struct ibv_send_wr* lastWr = comm->wrs+nreqs-1;
@@ -424,7 +430,7 @@ index 7af56a6c..5c3e3d46 100644
   lastWr->next = NULL;
   lastWr->send_flags = IBV_SEND_SIGNALED;
 
-@@ -2182,7 +2326,11 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
+@@ -2324,7 +2467,11 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
       //ncclIbAddEvent(reqs[r], devIndex, &comm->devs[devIndex].base);
 
       // Select proper rkey (needed even for 0-size send)
@@ -437,7 +443,7 @@ index 7af56a6c..5c3e3d46 100644
 
       int chunkSize = DIVUP(DIVUP(reqs[r]->send.size, nqps), align) * align;
       int length = std::min(reqs[r]->send.size-reqs[r]->send.offset, chunkSize);
-@@ -2198,7 +2346,7 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
+@@ -2340,7 +2487,7 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
       }
     }
 
@@ -446,7 +452,7 @@ index 7af56a6c..5c3e3d46 100644
       // Also make sure lastWr writes remote sizes using the right lkey
       comm->remSizesFifo.sge.lkey = comm->remSizesFifo.mrs[devIndex]->lkey;
       lastWr->wr.rdma.rkey = comm->remSizesFifo.rkeys[devIndex];
-@@ -2256,32 +2404,46 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void*
+@@ -2398,32 +2545,46 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void*
   NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__));
 
   struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandle;
@@ -511,7 +517,7 @@ index 7af56a6c..5c3e3d46 100644
     }
 
     struct ncclIbRequest* req;
-@@ -2325,10 +2487,12 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void*
+@@ -2467,10 +2628,12 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void*
     }
 
     TIME_START(0);
@@ -526,7 +532,7 @@ index 7af56a6c..5c3e3d46 100644
     memset(reqs, 0, NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbRequest*));
     comm->fifoHead++;
     TIME_STOP(0);
-@@ -2341,30 +2505,60 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void*
+@@ -2483,30 +2646,60 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void*
 
 ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, size_t* sizes, int* tags, void** mhandles, struct ncclIbRequest* req) {
   struct ibv_send_wr wr;
@@ -566,10 +572,7 @@ index 7af56a6c..5c3e3d46 100644
     struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandles[i];
 +    if (rcclCtsInlineData) {
 +      localElemCtsInline[i].addr = (uint64_t)data[i];
- 
-    // Send all applicable rkeys
-    for (int j = 0; j < comm->base.vProps.ndevs; j++)
-      localElem[i].rkeys[j] = mhandleWrapper->mrs[j]->rkey;
+
 +      // Send all applicable rkeys
 +      for (int j = 0; j < comm->base.vProps.ndevs; j++)
 +        localElemCtsInline[i].rkeys[j] = mhandleWrapper->mrs[j]->rkey;
@@ -583,14 +586,17 @@ index 7af56a6c..5c3e3d46 100644
 +    } else {
 +      localElem[i].addr = (uint64_t)data[i];
 
+-    // Send all applicable rkeys
+-    for (int j = 0; j < comm->base.vProps.ndevs; j++)
+-      localElem[i].rkeys[j] = mhandleWrapper->mrs[j]->rkey;
+      // Send all applicable rkeys
+      for (int j = 0; j < comm->base.vProps.ndevs; j++)
+        localElem[i].rkeys[j] = mhandleWrapper->mrs[j]->rkey;
+ 
 -    localElem[i].nreqs = n;
 -    localElem[i].size = sizes[i]; // Sanity/Debugging
 -    localElem[i].tag = tags[i];
 -    localElem[i].idx = comm->remFifo.fifoTail+1;
-+      // Send all applicable rkeys
-+      for (int j = 0; j < comm->base.vProps.ndevs; j++)
-+        localElem[i].rkeys[j] = mhandleWrapper->mrs[j]->rkey;
-+
 +      localElem[i].nreqs = n;
 +      localElem[i].size = sizes[i]; // Sanity/Debugging
 +      localElem[i].tag = tags[i];
@@ -600,7 +606,7 @@ index 7af56a6c..5c3e3d46 100644
   }
   wr.wr.rdma.remote_addr = comm->remFifo.addr + slot*NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbSendFifo);
 
-@@ -2372,8 +2566,12 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz
+@@ -2514,8 +2707,12 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz
   wr.wr.rdma.rkey = comm->base.remDevs[ctsQp->remDevIdx].fifoRkey;
 
   // Set the correct sge properties
@@ -615,7 +621,7 @@ index 7af56a6c..5c3e3d46 100644
   wr.sg_list = &comm->devs[ctsQp->devIndex].fifoSge;
   wr.num_sge = 1;
 
-@@ -2403,7 +2601,13 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz
+@@ -2545,7 +2742,13 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz
   //
   // slot == devIndex - When writing to fifo slot N, and this QP lives on device index N, it should send signalled.
   // This works out that each fifo posting QP gets drained
@@ -630,7 +636,7 @@ index 7af56a6c..5c3e3d46 100644
     wr.send_flags |= IBV_SEND_SIGNALED;
     wr.wr_id = req - comm->base.reqs;
     ncclIbAddEvent(req, ctsQp->devIndex, &comm->devs[ctsQp->devIndex].base);
-@@ -2418,10 +2622,16 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz
+@@ -2560,10 +2763,16 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz
 
   comm->remFifo.fifoTail++;
 
@@ -647,7 +653,7 @@ index 7af56a6c..5c3e3d46 100644
   struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
   if (comm->base.ready == 0) {
     WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0");
-@@ -2431,6 +2641,11 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int*
+@@ -2573,6 +2782,11 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int*
   if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
   NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__));
 
@@ -659,7 +665,7 @@ index 7af56a6c..5c3e3d46 100644
   struct ncclIbRequest* req;
   NCCLCHECK(ncclIbGetRequest(&comm->base, &req));
   req->type = NCCL_NET_IB_REQ_RECV;
-@@ -2444,50 +2659,64 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int*
+@@ -2586,50 +2800,64 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int*
     req->devBases[i] = &comm->devs[i].base;
   }
 
@@ -756,7 +762,7 @@ index 7af56a6c..5c3e3d46 100644
 }
 
 ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
-@@ -2556,6 +2785,8 @@ static int getReqQpIndex(struct ncclIbRequest* req, int request, int qpNumber) {
+@@ -2698,6 +2926,8 @@ static int getReqQpIndex(struct ncclIbRequest* req, int request, int qpNumber) {
 }
 #endif
 
@@ -765,7 +771,7 @@ index 7af56a6c..5c3e3d46 100644
 ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
   struct ncclIbRequest *r = (struct ncclIbRequest*)request;
   *done = 0;
-@@ -2589,13 +2820,18 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
+@@ -2731,13 +2961,18 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
 
     int totalWrDone = 0;
     int wrDone = 0;
@@ -786,7 +792,7 @@ index 7af56a6c..5c3e3d46 100644
         totalWrDone += wrDone;
         if (wrDone == 0) { TIME_CANCEL(3); } else { TIME_STOP(3); }
         if (wrDone == 0) continue;
-@@ -2742,7 +2978,7 @@ ncclResult_t rcclNetP2pPolicy(void* handle, int isP2p) {
+@@ -2889,7 +3124,7 @@ ncclResult_t rcclNetP2pPolicy(void* handle, int isP2p) {
 }
 
 ncclNet_t ncclNetIb = {
@@ -179,4 +179,4 @@ When developing new tuner plugins:
 - [NCCL Documentation](https://docs.nvidia.com/deeplearning/nccl/)
 - Example plugin implementations in this directory

-For questions and support, refer to the NCCL community resources and documentation.
+For questions and support, refer to the NCCL community resources and documentation.
@@ -0,0 +1,49 @@
+# Compiled shared objects and binaries
+*.so
+*.o
+*.a
+*.out
+*.exe
+*.dll
+*.dylib
+*.bin
+*.elf
+
+# Python cache
+__pycache__/
+*.pyc
+*.pyo
+
+# Build and test artifacts
+/build/
+*.log
+*.tmp
+*.swp
+
+# Ignore all CSV files except scripts/sample_performance_data.csv
+*.csv
+!scripts/sample_performance_data.csv
+
+# Ignore all .conf files except nccl_tuner.conf
+*.conf
+!nccl_tuner.conf
+
+my_configs
+
+# Ignore test binary
+test/test_plugin
+
+# Editor/OS files
+.DS_Store
+Thumbs.db
+
+# Backup files
+*~
+*.bak
+
+# Ignore by convention
+*.old
+*.orig
+
+# Git
+.git/
@@ -0,0 +1,26 @@
+# C source files that make up the example tuner plugin (listed explicitly)
+set(SRC_FILES
+    ${CMAKE_CURRENT_SOURCE_DIR}/plugin.c
+)
+
+# Build the plugin as a shared library
+add_library(nccl-tuner-example SHARED ${SRC_FILES})
+
+# The plugin headers live in the nccl/ subdirectory next to this file
+target_include_directories(nccl-tuner-example PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/nccl
+)
+
+# Name the output libnccl-tuner-example.so (matches the Makefile) and place it under test/unit/plugins
+set_target_properties(nccl-tuner-example PROPERTIES
+    OUTPUT_NAME "nccl-tuner-example"
+    PREFIX "lib"
+    POSITION_INDEPENDENT_CODE ON
+    LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/test/unit/plugins
+)
+
+# Clean target (equivalent to the Makefile clean target); the path must match LIBRARY_OUTPUT_DIRECTORY above
+add_custom_target(clean-tuner-lib
+    COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_BINARY_DIR}/test/unit/plugins/libnccl-tuner-example.so
+    COMMENT "Cleaning libnccl-tuner-example.so"
+)
@@ -45,6 +45,40 @@ typedef enum {

 #define NCCL_ALGO_PROTO_IGNORE -1.0

+#define NCCL_HW_NVLINK 0  // hardware-link indices: first dimension of hwLatencies
+#define NCCL_HW_PCI 1
+#define NCCL_HW_NET 2
+#define NCCL_NUM_HW_LINKS 3  // number of hardware-link indices above
+
+#define NCCL_VOLTA_COMPCAP_IDX 0  // compute-capability indices: first dimension of the *Bws tables
+#define NCCL_AMPERE_COMPCAP_IDX 1
+#define NCCL_HOPPER_COMPCAP_IDX 2
+#define NCCL_BLACKWELL_COMPCAP_IDX 3
+#define NCCL_NUM_COMPCAPS 4  // number of compute-capability indices above
+
+#define NCCL_TUNING_SCALE_1NODE 0  // tuning-scale indices: second dimension of the *Bws tables
+#define NCCL_TUNING_SCALE_2NODES 1
+#define NCCL_TUNING_SCALE_4NODES 2
+#define NCCL_NUM_TUNING_SCALES 3  // number of tuning-scale indices above
+
+typedef struct {
+  int nNvlDomains;                    // total number of NVLink domains
+  int minRanksPerNvlDomain;           // smallest per-domain rank count across all NVLink domains
+  int maxRanksPerNvlDomain;           // largest per-domain rank count across all NVLink domains
+} ncclNvlDomainInfo_v5_t;
+
+typedef struct {
+  double baseLatencies [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];  // indexed [algorithm][protocol]
+  double hwLatencies [NCCL_NUM_HW_LINKS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];  // indexed [hw link][algorithm][protocol]; presumably microseconds - TODO confirm
+
+  double llMaxBws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES];  // indexed [compcap idx][tuning scale]
+  double perChMaxRingLL128Bws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES];  // indexed [compcap idx][tuning scale]; presumably GB/s - TODO confirm
+  double perChMaxTreeLL128Bws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES];  // indexed [compcap idx][tuning scale]
+  double perChMaxTreeBws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES];  // indexed [compcap idx][tuning scale]; presumably GB/s - TODO confirm
+
+
+} ncclTunerConstants_v5_t;
+
 // API to be implemented by external tuner
 typedef struct {
  // Name of the tuner
@@ -52,12 +86,17 @@ typedef struct {

  // Initializes tuner states.
  // Inputs:
+  //   - commId: communicator identifier
  //   - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner.
  //   - nNodes: number of nodes in current communicator.
  //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+  //   - nvlDomainInfo: NVL domain information struct
  // Outputs:
  //   - context: tuner context object
-  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
+  // Input/Output:
+  //   - constants: constants of NCCL's internal tuning model; the plugin may read and overwrite them
+  ncclResult_t (*init)(void** ctx, uint64_t commId, size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction,
+                      ncclNvlDomainInfo_v5_t* nvlDomainInfo, ncclTunerConstants_v5_t* constants);

  // Gets info (algo, protocol, number of ctas and threads) for a given collective.
  // Inputs:
@@ -87,11 +126,13 @@ typedef struct {

  // Terminates the plugin and cleans up any resources that the plugin allocated.
  // context: tuner context object
-  ncclResult_t (*destroy)(void* context);
-} ncclTuner_v4_t;
+  ncclResult_t (*finalize)(void* context);
+} ncclTuner_v5_t;

-typedef ncclTuner_v4_t ncclTuner_t;
+typedef ncclTuner_v5_t ncclTuner_t;
+typedef ncclNvlDomainInfo_v5_t ncclNvlDomainInfo_t;
+typedef ncclTunerConstants_v5_t ncclTunerConstants_t;

-#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4"
+#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v5"

 #endif
@@ -51,6 +51,7 @@ typedef struct {
  size_t nRanks;
  size_t nNodes;
  ncclDebugLogger_t logFunction;
+  ncclNvlDomainInfo_v5_t nvlDomainInfo;
 } TunerContext;

 // Parse collective type from string
@@ -289,7 +290,25 @@ static ncclResult_t loadConfig(TunerContext* ctx, const char* filename) {
  return ncclSuccess;
 }

-__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) {
+__hidden ncclResult_t pluginInit(void** context, uint64_t commId, size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction,
+                                 ncclNvlDomainInfo_v5_t* nvlDomainInfo, ncclTunerConstants_v5_t* constants) {
+
+  if (NULL != constants) {
+    // NCCL constants tuning
+    // Tune NCCL's internal tuning model to improve base algo/proto selection.
+    // Note: Example numbers are for reference only.
+    //       Actual numbers may vary depending on the hardware and network topology.
+    //       These numbers are not guaranteed to be optimal for all cases.
+    // Limit the tree bandwidth to 15GB/s
+    constants->perChMaxTreeBws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] = 15.0;
+
+    // Limit the ring bandwidth to 20GB/s
+    constants->perChMaxRingLL128Bws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] = 20.0;
+
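+    // Set the NCCL_ALGO_NVLS network latency to 24us (NOTE(review): this comment used to say "NVLSTree" - confirm NCCL_ALGO_NVLS_TREE was not intended)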
+    constants->hwLatencies[NCCL_HW_NET][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] = 24.0;
+  }
+  
  TunerContext* ctx = (TunerContext*)malloc(sizeof(TunerContext));
  if (!ctx) return ncclSystemError;

@@ -299,10 +318,16 @@ __hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t
  ctx->nRanks = nRanks;
  ctx->nNodes = nNodes;
  ctx->logFunction = logFunction;
+  if (nvlDomainInfo) {
+    ctx->nvlDomainInfo = *nvlDomainInfo;
+  } else {
+    memset(&ctx->nvlDomainInfo, 0, sizeof(ncclNvlDomainInfo_v5_t));
+  }

  if (logFunction) {
    logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
-                "TUNER/ExamplePlugin: Initializing tuner for %zu nodes, %zu ranks", nNodes, nRanks);
+                "TUNER/ExamplePlugin: Initializing tuner for %zu nodes, %zu ranks, %d NVL domains",
+                nNodes, nRanks, ctx->nvlDomainInfo.nNvlDomains);
  }

  // Try to load config file from environment variable or default location
@@ -435,7 +460,7 @@ __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size
  return ncclSuccess;
 }

-__hidden ncclResult_t pluginDestroy(void* context) {
+__hidden ncclResult_t pluginFinalize(void* context) {
  if (context) {
    TunerContext* ctx = (TunerContext*)context;
    if (ctx->configs) {
@@ -446,11 +471,12 @@ __hidden ncclResult_t pluginDestroy(void* context) {
  return ncclSuccess;
 }

+
 #define PLUGIN_NAME "Example"

-const ncclTuner_v4_t ncclTunerPlugin_v4 = {
+const ncclTuner_v5_t ncclTunerPlugin_v5 = {
  .name = PLUGIN_NAME,
  .init = pluginInit,
  .getCollInfo = pluginGetCollInfo,
-  .destroy = pluginDestroy
+  .finalize = pluginFinalize
 };
@@ -98,12 +98,12 @@ int test_plugin_init() {
  void* context = NULL;

  // Test successful initialization
-  ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
+  ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, NULL);
  TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed");
  TEST_ASSERT(context != NULL, "Context should be allocated");

  // Clean up
-  pluginDestroy(context);
+  pluginFinalize(context);
  TEST_PASS();
 }

@@ -123,11 +123,11 @@ int test_config_parsing_valid() {
  setenv("NCCL_TUNER_CONFIG_FILE", "test_valid.conf", 1);

  void* context = NULL;
-  ncclResult_t result = pluginInit(16, 2, mock_logger, &context);
+  ncclResult_t result = pluginInit(&context, 0, 16, 2, mock_logger, NULL, NULL);
  TEST_ASSERT(result == ncclSuccess, "Plugin init with valid config should succeed");

  // Clean up
-  pluginDestroy(context);
+  pluginFinalize(context);
  unlink("test_valid.conf");
  unsetenv("NCCL_TUNER_CONFIG_FILE");
  TEST_PASS();
@@ -144,12 +144,12 @@ int test_config_parsing_invalid() {
  setenv("NCCL_TUNER_CONFIG_FILE", "test_invalid.conf", 1);

  void* context = NULL;
-  ncclResult_t result = pluginInit(8, 1, mock_logger, &context);
+  ncclResult_t result = pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);
  // Should still succeed but with no valid configs loaded
  TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed even with invalid config");

  // Clean up
-  pluginDestroy(context);
+  pluginFinalize(context);
  unlink("test_invalid.conf");
  unsetenv("NCCL_TUNER_CONFIG_FILE");
  TEST_PASS();
@@ -165,7 +165,7 @@ int test_collective_matching() {
  setenv("NCCL_TUNER_CONFIG_FILE", "test_match.conf", 1);

  void* context = NULL;
-  pluginInit(8, 1, mock_logger, &context);
+  pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);

  // Create mock cost table
  float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
@@ -209,7 +209,7 @@ int test_collective_matching() {
  TEST_ASSERT(nChannels == 4, "Should set 4 channels");

  // Clean up
-  pluginDestroy(context);
+  pluginFinalize(context);
  unlink("test_match.conf");
  unsetenv("NCCL_TUNER_CONFIG_FILE");
  TEST_PASS();
@@ -226,7 +226,7 @@ int test_size_matching() {
  setenv("NCCL_TUNER_CONFIG_FILE", "test_size.conf", 1);

  void* context = NULL;
-  pluginInit(8, 1, mock_logger, &context);
+  pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);

  float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
@@ -280,7 +280,7 @@ int test_size_matching() {
  TEST_ASSERT(nChannels == 8, "Large: Should set 8 channels");

  // Clean up
-  pluginDestroy(context);
+  pluginFinalize(context);
  unlink("test_size.conf");
  unsetenv("NCCL_TUNER_CONFIG_FILE");
  TEST_PASS();
@@ -298,7 +298,7 @@ int test_topology_matching() {

  // Test with single node setup
  void* context1 = NULL;
-  pluginInit(8, 1, mock_logger, &context1);  // 8 ranks, 1 node
+  pluginInit(&context1, 0, 8, 1, mock_logger, NULL, NULL);  // 8 ranks, 1 node

  float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
@@ -316,11 +316,11 @@ int test_topology_matching() {
  TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Single node: Should match tree config");
  TEST_ASSERT(nChannels == 2, "Single node: Should set 2 channels");

-  pluginDestroy(context1);
+  pluginFinalize(context1);

  // Test with 4 nodes, 32 ranks setup
  void* context2 = NULL;
-  pluginInit(32, 4, mock_logger, &context2);  // 32 ranks, 4 nodes
+  pluginInit(&context2, 0, 32, 4, mock_logger, NULL, NULL);  // 32 ranks, 4 nodes

  for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
    for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
@@ -349,7 +349,7 @@ int test_default_channels() {
  setenv("NCCL_TUNER_CONFIG_FILE", "test_default.conf", 1);

  void* context = NULL;
-  pluginInit(8, 1, mock_logger, &context);
+  pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);

  float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
@@ -369,7 +369,7 @@ int test_default_channels() {
  TEST_ASSERT(nChannels == 1, "Should keep default channels (1) when config has -1");

  // Clean up
-  pluginDestroy(context);
+  pluginFinalize(context);
  unlink("test_default.conf");
  unsetenv("NCCL_TUNER_CONFIG_FILE");
  TEST_PASS();
@@ -386,7 +386,7 @@ int test_regbuff_matching() {
  setenv("NCCL_TUNER_CONFIG_FILE", "test_regbuff.conf", 1);

  void* context = NULL;
-  pluginInit(8, 1, mock_logger, &context);
+  pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);

  float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
@@ -437,7 +437,7 @@ int test_regbuff_matching() {
  TEST_ASSERT(nChannels == 8, "Any regBuff: Should set 8 channels");

  // Clean up
-  pluginDestroy(context);
+  pluginFinalize(context);
  unlink("test_regbuff.conf");
  unsetenv("NCCL_TUNER_CONFIG_FILE");
  TEST_PASS();
@@ -454,7 +454,7 @@ int test_pipeops_matching() {
  setenv("NCCL_TUNER_CONFIG_FILE", "test_pipeops.conf", 1);

  void* context = NULL;
-  pluginInit(8, 1, mock_logger, &context);
+  pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);

  float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
@@ -504,7 +504,7 @@ int test_pipeops_matching() {
  TEST_ASSERT(nChannels == 8, "Any pipeOps: Should set 8 channels");

  // Clean up
-  pluginDestroy(context);
+  pluginFinalize(context);
  unlink("test_pipeops.conf");
  unsetenv("NCCL_TUNER_CONFIG_FILE");
  TEST_PASS();
@@ -519,7 +519,7 @@ int test_no_match_fallback() {
  setenv("NCCL_TUNER_CONFIG_FILE", "test_fallback.conf", 1);

  void* context = NULL;
-  pluginInit(8, 1, mock_logger, &context);
+  pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);

  float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
@@ -543,7 +543,7 @@ int test_no_match_fallback() {
  TEST_ASSERT(nChannels == 1, "Should use default channels");

  // Clean up
-  pluginDestroy(context);
+  pluginFinalize(context);
  unlink("test_fallback.conf");
  unsetenv("NCCL_TUNER_CONFIG_FILE");
  TEST_PASS();
@@ -593,7 +593,7 @@ int test_large_config() {

  // Initialize plugin with large config
  void* context = NULL;
-  ncclResult_t result = pluginInit(16, 4, mock_logger, &context);
+  ncclResult_t result = pluginInit(&context, 0, 16, 4, mock_logger, NULL, NULL);
  TEST_ASSERT(result == ncclSuccess, "Plugin init with large config should succeed");
  TEST_ASSERT(context != NULL, "Context should be allocated");

@@ -652,7 +652,7 @@ int test_large_config() {
  TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with large config set");

  // Clean up
-  pluginDestroy(context);
+  pluginFinalize(context);
  unlink(large_config_file);
  unsetenv("NCCL_TUNER_CONFIG_FILE");

@@ -684,7 +684,7 @@ int test_very_large_config_stress() {

  // Test initialization with stress config
  void* context = NULL;
-  ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
+  ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, NULL);
  TEST_ASSERT(result == ncclSuccess, "Plugin should handle very large config files");

  TunerContext* ctx = (TunerContext*)context;
@@ -705,7 +705,7 @@ int test_very_large_config_stress() {
  }

  // Clean up
-  pluginDestroy(context);
+  pluginFinalize(context);
  unlink(stress_config_file);
  unsetenv("NCCL_TUNER_CONFIG_FILE");

@@ -726,7 +726,7 @@ int test_empty_config() {
  setenv("NCCL_TUNER_CONFIG_FILE", empty_config_file, 1);

  void* context = NULL;
-  ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
+  ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, NULL);
  TEST_ASSERT(result == ncclSuccess, "Plugin should handle empty config files");

  TunerContext* ctx = (TunerContext*)context;
@@ -751,13 +751,134 @@ int test_empty_config() {
  TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with empty config");

  // Clean up
-  pluginDestroy(context);
+  pluginFinalize(context);
  unlink(empty_config_file);
  unsetenv("NCCL_TUNER_CONFIG_FILE");

  TEST_PASS();
 }

+// Test NVLink domain info handling: pluginInit must copy the caller's info into its context
+int test_nvl_domain_info() {
+  printf("Testing NVLink domain info handling...\n");
+
+  // Test NVLink domain structure with min/max ranks per domain
+  ncclNvlDomainInfo_v5_t nvl_domain = {
+    .nNvlDomains = 2, // 2 nodes = 2 domains
+    .minRanksPerNvlDomain = 3, // minimum ranks across all domains (bottleneck)
+    .maxRanksPerNvlDomain = 5  // maximum ranks across all domains (capacity)
+  };
+
+  void* context = NULL;
+  ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, &nvl_domain, NULL);
+  TEST_ASSERT(result == ncclSuccess, "Plugin init with NVLink domains should succeed");
+  TEST_ASSERT(context != NULL, "Context should be allocated");
+
+  // Check the copy stored in the plugin context; asserting on nvl_domain itself could never fail
+  TunerContext* ctx = (TunerContext*)context;
+  TEST_ASSERT(ctx->nvlDomainInfo.nNvlDomains == 2, "Should have 2 domains (nodes)");
+  TEST_ASSERT(ctx->nvlDomainInfo.minRanksPerNvlDomain == 3, "Should have minimum 3 ranks per domain");
+  TEST_ASSERT(ctx->nvlDomainInfo.maxRanksPerNvlDomain == 5, "Should have maximum 5 ranks per domain");
+  // Clean up
+  pluginFinalize(context);
+  printf("NVLink domain info test passed!\n");
+  TEST_PASS();
+}
+
+int test_tuner_constants() {  // Checks that pluginInit writes its example overrides into a caller-supplied constants struct
+  // Start every listed entry at the sentinel -1.0 so values written by pluginInit stand out
+  ncclTunerConstants_v5_t constants = {
+    // Base latencies: [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]
+    .baseLatencies = {
+      {-1.0, -1.0, -1.0},    // NCCL_ALGO_TREE: LL, LL128, Simple
+      {-1.0, -1.0, -1.0},    // NCCL_ALGO_RING: LL, LL128, Simple
+      {-1.0, -1.0, -1.0},   // NCCL_ALGO_COLLNET_DIRECT
+      {-1.0, -1.0, -1.0},   // NCCL_ALGO_COLLNET_CHAIN
+      {-1.0, -1.0, -1.0},    // NCCL_ALGO_NVLS
+      {-1.0, -1.0, -1.0},    // NCCL_ALGO_NVLS_TREE
+      {-1.0, -1.0, -1.0}     // NCCL_ALGO_PAT
+    },
+
+    // Hardware latencies: [NCCL_NUM_HW_LINKS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]
+    .hwLatencies = {
+      // NCCL_HW_NVLINK
+      {
+        {-1.0, -1.0, -1.0},    // TREE
+        {-1.0, -1.0, -1.0},    // RING
+        {-1.0, -1.0, -1.0},    // COLLNET_DIRECT
+        {-1.0, -1.0, -1.0},    // COLLNET_CHAIN
+        {-1.0, -1.0, -1.0},    // NVLS
+        {-1.0, -1.0, -1.0},    // NVLS_TREE
+        {-1.0, -1.0, -1.0}     // PAT
+      },
+      // NCCL_HW_PCI
+      {
+        {-1.0, -1.0, -1.0},   // TREE
+        {-1.0, -1.0, -1.0},    // RING
+        {-1.0, -1.0, -1.0},  // COLLNET_DIRECT
+        {-1.0, -1.0, -1.0},  // COLLNET_CHAIN
+        {-1.0, -1.0, -1.0},     // NVLS
+        {-1.0, -1.0, -1.0},   // NVLS_TREE
+        {-1.0, -1.0, -1.0}   // PAT
+      },
+      // NCCL_HW_NET
+      {
+        {-1.0, -1.0, -1.0},  // TREE
+        {-1.0, -1.0, -1.0},  // RING
+        {-1.0, -1.0, -1.0},  // COLLNET_DIRECT
+        {-1.0, -1.0, -1.0},  // COLLNET_CHAIN
+        {-1.0, -1.0, -1.0},  // NVLS
+        {-1.0, -1.0, -1.0},  // NVLS_TREE
+        {-1.0, -1.0, -1.0}   // PAT
+      }
+    },
+
+    // LL maximum bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]
+    .llMaxBws = {
+      {-1.0, -1.0, -1.0},  // Volta: 1node, 2nodes, 4nodes
+      {-1.0, -1.0, -1.0},  // Ampere: 1node, 2nodes, 4nodes
+      {-1.0, -1.0, -1.0},  // Hopper: 1node, 2nodes, 4nodes
+      {-1.0, -1.0, -1.0}   // Blackwell: 1node, 2nodes, 4nodes
+    },
+
+    // Per-channel maximum Ring LL128 bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]
+    .perChMaxRingLL128Bws = {
+      {-1.0, -1.0, -1.0},   // Volta: 1node, 2nodes, 4nodes
+      {-1.0, -1.0, -1.0},  // Ampere: 1node, 2nodes, 4nodes
+      {-1.0, -1.0, -1.0},  // Hopper: 1node, 2nodes, 4nodes
+      {-1.0, -1.0, -1.0}   // Blackwell: 1node, 2nodes, 4nodes
+    },
+
+    // Per-channel maximum Tree LL128 bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]
+    .perChMaxTreeLL128Bws = {
+      {-1.0, -1.0, -1.0},    // Volta: 1node, 2nodes, 4nodes
+      {-1.0, -1.0, -1.0},   // Ampere: 1node, 2nodes, 4nodes
+      {-1.0, -1.0, -1.0},  // Hopper: 1node, 2nodes, 4nodes
+      {-1.0, -1.0, -1.0}   // Blackwell: 1node, 2nodes, 4nodes
+    },
+
+    // Per-channel maximum Tree bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]
+    .perChMaxTreeBws = {
+      {-1.0, -1.0, -1.0},  // Volta: 1node, 2nodes, 4nodes
+      {-1.0, -1.0, -1.0},  // Ampere: 1node, 2nodes, 4nodes
+      {-1.0, -1.0, -1.0},  // Hopper: 1node, 2nodes, 4nodes
+      {-1.0, -1.0, -1.0}   // Blackwell: 1node, 2nodes, 4nodes
+    }
+  };
+
+  void* context = NULL;
+  ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, &constants);
+  TEST_ASSERT(result == ncclSuccess, "Plugin init with constants should succeed");
+
+  // Check the three entries that pluginInit overwrites when it is given a non-NULL constants pointer
+  TEST_ASSERT(constants.perChMaxTreeBws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] == 15.0, "Tree bandwidth should be 15GB/s");
+  TEST_ASSERT(constants.perChMaxRingLL128Bws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] == 20.0, "Ring bandwidth should be 20GB/s");
+  TEST_ASSERT(constants.hwLatencies[NCCL_HW_NET][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] == 24.0, "NVLSTree base network latency should be 24us");  // NOTE(review): message says NVLSTree but the index is NCCL_ALGO_NVLS - confirm which is intended
+
+  // Clean up
+  pluginFinalize(context);
+  TEST_PASS();
+}
+
 // Test runner function pointer type
 typedef int (*TestFunction)(void);

@@ -783,6 +904,8 @@ TestCase test_cases[] = {
  {"large-config", test_large_config, "Large configuration files (dynamic allocation)"},
  {"stress-config", test_very_large_config_stress, "Very large configuration stress test"},
  {"empty-config", test_empty_config, "Empty configuration file handling"},
+  {"nvl-domain", test_nvl_domain_info, "NVL domain info handling"},
+  {"constants", test_tuner_constants, "Tuner constants initialization"},
  {NULL, NULL, NULL} // End marker
 };

@@ -826,6 +949,7 @@ int main(int argc, char* argv[]) {
  if (argc == 1) {
    // No arguments - run all tests
    for (int i = 0; test_cases[i].name != NULL; i++) {
+      printf("Running test: %s\n", test_cases[i].name);
      total++;
      passed += test_cases[i].func();
    }
@@ -26,7 +26,7 @@ install_dependencies=false
 install_library=false
 install_prefix="${ROCM_PATH}"
 log_trace=false
-msccl_kernel_enabled=true
+msccl_kernel_enabled=false
 mscclpp_enabled=false
 enable_mscclpp_clip=false
 num_parallel_jobs=$(nproc)
@@ -56,7 +56,7 @@ function display_help()
    echo "       --debug                 Build debug library"
    echo "       --enable_backtrace      Build with custom backtrace support"
    echo "       --disable-colltrace     Build without collective trace"
-    echo "       --disable-msccl-kernel  Build without MSCCL kernels"
+    echo "       --enable-msccl-kernel   Build with MSCCL kernels"
    echo "       --dump-asm              Disassemble code and dump assembly with inline code"
    echo "       --enable-mscclpp        Build with MSCCL++ support"
    echo "       --enable-mscclpp-clip   Build MSCCL++ with clip wrapper on bfloat16 and half addition routines"
@@ -32,13 +32,8 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)

 # You should define NVCC_GENCODE in your environment to the minimal set
 # of archs to reduce compile time.
-CUDA8_GENCODE = -gencode=arch=compute_50,code=sm_50 \
-                -gencode=arch=compute_60,code=sm_60 \
+CUDA8_GENCODE = -gencode=arch=compute_60,code=sm_60 \
                -gencode=arch=compute_61,code=sm_61
-ifeq ($(shell test "0$(CUDA_MAJOR)" -lt 12; echo $$?),0)
-# SM35 is deprecated from CUDA12.0 onwards
-CUDA8_GENCODE += -gencode=arch=compute_35,code=sm_35
-endif
 CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
 CUDA10_GENCODE = -gencode=arch=compute_75,code=sm_75
 CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
-NCCL_MINOR   := 27
-NCCL_PATCH   := 7
+NCCL_MINOR   := 28
+NCCL_PATCH   := 3
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
@@ -10,7 +10,7 @@ build : debian.build txz.build

 BUILDDIR ?= $(abspath ../build)
 ABSBUILDDIR := $(abspath $(BUILDDIR))
-TARGETS := debian txz
+TARGETS := debian txz doc
 all:   ${TARGETS:%=%.build}
 prep:  ${TARGETS:%=%.prep}
 build: ${TARGETS:%=%.build}
@@ -1,4 +1,4 @@
 bin/ncclras /usr/bin
-include/nccl.h /usr/include
+include/* /usr/include
 lib/libnccl.so /usr/lib/${pkg:MultiArch}
 lib/libnccl_static.a /usr/lib/${pkg:MultiArch}
@@ -47,8 +47,8 @@ ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_li
 # devel
 install -m 755 -d $RPM_BUILD_ROOT/%{_bindir}
 install -m 755 -d $RPM_BUILD_ROOT/%{_includedir}
+cp -a include/* $RPM_BUILD_ROOT/%{_includedir}/
 install -m 755 bin/ncclras $RPM_BUILD_ROOT/%{_bindir}
-install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir}
 ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so

 # static
@@ -67,7 +67,7 @@ rm -rf $RPM_BUILD_ROOT
 %doc LICENSE.txt
 %defattr(-,root,root,-)
 %{_bindir}/ncclras
-%{_includedir}/nccl.h
+%{_includedir}/*
 %{_libdir}/libnccl.so

 %files static
@@ -22,7 +22,7 @@ prep: $(TXZTARGETS)
 build: prep
 	$(MAKE) -C ../../src clean
 	@printf "Building source tar.xz package\n"
-	(cd $(BUILDDIR); bash srctxz/create_srctxz.sh)
+	(cd $(BUILDDIR); SRCTXZ_APITESTS=$(SRCTXZ_APITESTS) bash srctxz/create_srctxz.sh)
 	mkdir -p $(PKGDIR)
 	mv $(BUILDDIR)/../../nccl-src*.txz $(PKGDIR)

@@ -28,8 +28,34 @@ NCCL_SUFFIX=${nccl:Suffix}
 NCCL_BUILD=${pkg:Revision}

 NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}"
+if [ "${SRCTXZ_APITESTS}" = "1" ]; then
+  NCCLNAME+="-apitest"
+fi

-tar --exclude build \
+EXCLUDE_TEST=""  # Start empty: the lines below only append, so an inherited environment value must not leak into tar.
+INCLUDE_TEST_ENTRIES=("apitest" "googletest" "gtest.mk")  # Entries of test/ kept when SRCTXZ_APITESTS=1.
+
+if [ "${SRCTXZ_APITESTS}" = "1" ]; then
+  # Exclude all entries inside test folder except those in INCLUDE_TEST_ENTRIES
+  for entry in $(ls "$NCCLDIR/test"); do
+    if [[ ! " ${INCLUDE_TEST_ENTRIES[@]} " =~ " $entry " ]]; then
+      EXCLUDE_TEST+=" --exclude $NCCLDIR/test/$entry"
+    fi
+  done
+else
+  # Exclude the entire test directory
+  EXCLUDE_TEST+=" --exclude test"
+fi
+
+tar --exclude fortran \
+    --exclude doc \
+    --exclude plc \
+    --exclude build \
    --exclude ".git*" \
+    --exclude share \
+    --exclude ompi \
+    --exclude ext-net \
    --exclude pkg/srctxz \
+    --exclude docker \
+    $EXCLUDE_TEST \
    --transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $NCCLDIR
@@ -0,0 +1,180 @@
+# Source files
+set(LIBSRCFILES
+    bootstrap.cc
+    channel.cc
+    ce_coll.cc
+    collectives.cc
+    debug.cc
+    enqueue.cc
+    group.cc
+    init.cc
+    init_nvtx.cc
+    proxy.cc
+    transport.cc
+    mnnvl.cc
+    allocator.cc
+    sym_kernels.cc
+    dev_runtime.cc
+)
+
+# Add compatibility shim if using static cudart
+if(CUDARTLIB STREQUAL "cudart_static")
+    list(APPEND LIBSRCFILES enhcompat.cc)
+endif()
+
+# Configure pkg-config file
+configure_file(
+    ${CMAKE_CURRENT_SOURCE_DIR}/nccl.pc.in
+    ${CMAKE_BINARY_DIR}/lib/pkgconfig/nccl.pc
+    @ONLY
+)
+
+# Add files from subdirectories
+add_subdirectory(transport)
+add_subdirectory(misc)
+add_subdirectory(register)
+add_subdirectory(graph)
+add_subdirectory(plugin)
+add_subdirectory(device)
+add_subdirectory(nccl_device)
+add_subdirectory(ras)
+add_subdirectory(scheduler)
+
+add_compile_options(-fmacro-prefix-map=${CMAKE_CURRENT_SOURCE_DIR}/=)
+
+# Add all source files
+list(APPEND LIBSRCFILES
+    ${TRANSPORT_SOURCES}
+    ${MISC_SOURCES}
+    ${REGISTER_SOURCES}
+    ${GRAPH_SOURCES}
+    ${PLUGIN_SOURCES}
+    ${RAS_SOURCES}
+    ${SYM_SOURCES}
+    ${SCHEDULER_SOURCES}
+)
+
+###################### Create a shared NCCL library ############################
+add_library(nccl SHARED)
+
+target_sources(nccl PRIVATE ${LIBSRCFILES})
+
+# Include directories
+target_include_directories(nccl PUBLIC
+    ${CMAKE_CURRENT_SOURCE_DIR}/device
+    ${CMAKE_CURRENT_SOURCE_DIR}/include
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/plugin
+    ${CUDAToolkit_INCLUDE_DIRS}
+    ${CUDAToolkit_INCLUDE_DIRS}/cccl
+)
+
+add_custom_command( # Generate the public nccl.h by substituting the version placeholders in nccl.h.in.
+    OUTPUT ${CMAKE_BINARY_DIR}/include/nccl.h
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/include
+    COMMAND sed -e "s/\\\$$\\{nccl:Major\\}/${NCCL_MAJOR}/g"
+                -e "s/\\\$$\\{nccl:Minor\\}/${NCCL_MINOR}/g"
+                -e "s/\\\$$\\{nccl:Patch\\}/${NCCL_PATCH}/g"
+                -e "s/\\\$$\\{nccl:Suffix\\}/${NCCL_SUFFIX}/g"
+                -e "s/\\\$$\\{nccl:Version\\}/${NCCL_VERSION_CODE}/g"
+                ${CMAKE_CURRENT_SOURCE_DIR}/nccl.h.in > ${CMAKE_BINARY_DIR}/include/nccl.h
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/nccl.h.in # Regenerate when the template changes; OUTPUT already declares the produced file.
+)
+
+add_custom_target(nccl_header DEPENDS ${CMAKE_BINARY_DIR}/include/nccl.h)
+
+add_dependencies(nccl nccl_header)
+
+# Set version and output name
+set_target_properties(nccl PROPERTIES
+    VERSION ${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}
+    SOVERSION ${NCCL_MAJOR}
+    OUTPUT_NAME "nccl"
+    PREFIX "lib"
+)
+
+# Set CUDA specific flags
+set_target_properties(nccl PROPERTIES
+    CUDA_SEPARABLE_COMPILATION ON
+    CUDA_RESOLVE_DEVICE_SYMBOLS ON
+    CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}"
+    POSITION_INDEPENDENT_CODE ON
+)
+
+# Link libraries
+target_link_libraries(nccl
+    PRIVATE
+    nccl_device
+    pthread
+    rt
+    dl
+    ${CUDAToolkit_LIBRARIES}
+    ${EXTRA_LIBS}
+)
+
+# Set output directories for nccl shared library
+set_target_properties(nccl PROPERTIES
+    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
+)
+
+###################### Create a ras binary executable ############################
+set(RAS_BINSRCFILES ras/client.cc)
+
+add_executable(ncclras ${RAS_BINSRCFILES})
+
+target_include_directories(ncclras PUBLIC
+    ${CMAKE_BINARY_DIR}/include
+    ${CUDAToolkit_INCLUDE_DIRS}
+)
+
+add_dependencies(ncclras nccl_header)
+
+target_link_libraries(ncclras
+    PRIVATE
+    pthread
+    rt
+    dl
+)
+
+# Set output directory for ncclras executable
+set_target_properties(ncclras PROPERTIES
+    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
+)
+
+###################### Create a static NCCL library ############################
+add_library(nccl_static STATIC ${LIBSRCFILES})
+
+# Include directories
+target_include_directories(nccl_static PUBLIC
+    ${CMAKE_CURRENT_SOURCE_DIR}/device
+    ${CMAKE_CURRENT_SOURCE_DIR}/include
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/plugin
+    ${CUDAToolkit_INCLUDE_DIRS}
+    ${CUDAToolkit_INCLUDE_DIRS}/cccl
+)
+
+# Add dependency on nccl_header
+add_dependencies(nccl_static nccl_header)
+
+# Link libraries
+target_link_libraries(nccl_static
+    PRIVATE
+    nccl_device
+    pthread
+    rt
+    dl
+    ${CUDAToolkit_LIBRARIES}
+    ${EXTRA_LIBS}
+)
+
+# Set CUDA specific flags
+set_target_properties(nccl_static PROPERTIES
+    CUDA_SEPARABLE_COMPILATION ON
+    CUDA_RESOLVE_DEVICE_SYMBOLS ON
+    CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}"
+    POSITION_INDEPENDENT_CODE ON
+)
+
+# Set output directory for nccl_static library
+set_target_properties(nccl_static PROPERTIES
+    ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
+)
@@ -7,10 +7,12 @@ include ../makefiles/common.mk
 include ../makefiles/version.mk

 ##### src files
-INCEXPORTS  := nccl.h
+INCEXPORTS  := nccl.h nccl_device.h \
+	$(patsubst include/%,%,$(wildcard include/nccl_device/*.h include/nccl_device/impl/*.h))
+
 LIBSRCFILES := \
 	bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
-	init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc allocator.cc symmetric.cc \
+	init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc allocator.cc dev_runtime.cc sym_kernels.cc ce_coll.cc \
 	$(wildcard graph/*.cc) \
 	$(wildcard misc/*.cc) \
 	$(wildcard transport/*.cc) \
@@ -19,6 +21,8 @@ LIBSRCFILES := \
 	$(wildcard plugin/net/*.cc) \
 	$(wildcard plugin/tuner/*.cc) \
 	$(wildcard plugin/profiler/*.cc) \
+	$(wildcard nccl_device/*.cc) \
+	$(wildcard scheduler/*.cc) \
 	$(filter-out ras/client.cc,$(wildcard ras/*.cc))
 BINSRCFILES := ras/client.cc

@@ -123,6 +127,16 @@ $(INCDIR)/nccl_%.h : include/nccl_%.h
 	mkdir -p $(INCDIR)
 	install -m 644 $< $@

+$(INCDIR)/nccl_device/%.h: include/nccl_device/%.h
+	@printf "Grabbing   %-35s > %s\n" $< $@
+	mkdir -p $(INCDIR)/nccl_device
+	install -m 644 $< $@
+
+$(INCDIR)/nccl_device/impl/%.h: include/nccl_device/impl/%.h
+	@printf "Grabbing   %-35s > %s\n" $< $@
+	mkdir -p $(INCDIR)/nccl_device/impl
+	install -m 644 $< $@
+
 $(PKGDIR)/%.pc : %.pc
 	@printf "Grabbing   %-35s > %s\n" $< $@
 	mkdir -p $(PKGDIR)
@@ -149,7 +163,7 @@ install : build
 	mkdir -p $(PREFIX)/bin
 	cp -P -v $(BUILDDIR)/lib/lib* $(PREFIX)/lib/
 	cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/
-	cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
+	cp -v -r $(BUILDDIR)/include/* $(PREFIX)/include/
 	cp -v $(BUILDDIR)/bin/ncclras $(PREFIX)/bin/

 FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|gdrwrap.h|nccl.h')
@@ -7,10 +7,11 @@
 #include "comm.h"
 #include "transport.h"
 #include "group.h"
+#include "nvtx.h"

 NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size);
 ncclResult_t  ncclMemAlloc_impl(void **ptr, size_t size) {
-  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  NCCL_NVTX3_FUNC_RANGE;
  ncclResult_t ret = ncclSuccess;

 #if ROCM_VERSION >= 70000
@@ -99,7 +100,7 @@ fail:

 NCCL_API(ncclResult_t, ncclMemFree, void *ptr);
 ncclResult_t  ncclMemFree_impl(void *ptr) {
-  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  NCCL_NVTX3_FUNC_RANGE;
  ncclResult_t ret = ncclSuccess;
  int saveDevice;

@@ -129,70 +130,339 @@ fail:
  goto exit;
 }

-// This is a collective function and should be called by all ranks in the communicator
-ncclResult_t ncclCommSymmetricAllocInternal(struct ncclComm* comm, size_t size, size_t alignment, void** symPtr) {
-  ncclResult_t ret = ncclSuccess;
-  void* regSymAddr = NULL;
-  size_t allocSize = size;
-  size_t granularity;
-  CUdevice cuDev;
-  CUmemAllocationProp memprop = {};
-  CUmemGenericAllocationHandle memHandle;
-  int bit = 0, cnt = 0;
+////////////////////////////////////////////////////////////////////////////////
+// ncclSpace:
+//
+// This datastructure "cuts" the line of non-negative integers into segments
+// which alternate between "full" (allocated) and "empty" (not allocated). The
+// cuts are sorted ascending. The segment after the last cut must be empty
+// (the unallocated frontier). Knowing this we can deduce whether the segment
+// ending at cut[i] is full or empty with this formula:
+//   isFull(i) = (i%2 != ncuts%2)

-  // aligment must be power of 2 as an input
-  while (bit < sizeof(size_t) * 8) {
-    if (alignment & (1L << bit)) cnt++;
-    if (cnt == 2) {
-      WARN("rank %d alignment %ld is not power of 2", comm->rank, alignment);
-      goto fail;
+void ncclSpaceConstruct(struct ncclSpace* a) { // Initializes `*a` by zero-filling every byte of the struct.
+  memset(a, 0, sizeof(*a));
+}
+
+void ncclSpaceDestruct(struct ncclSpace* a) { // Releases the cuts array; no other field of `*a` is modified.
+  free(a->cuts);
+}
+
+static void insertSegment(struct ncclSpace* a, int index, int64_t lo, int64_t hi) { // Inserts cuts `lo`,`hi` before `index`, then drops redundant cuts from `index` onward.
+  // Insert space for two cuts in `a->cuts[]` before `index`.
+  if (a->count + 2 > a->capacity) {
+    a->capacity *= 2;
+    if (a->capacity == 0) a->capacity = 16; // First growth starts from an empty array.
+    int64_t* cuts1 = (int64_t*)malloc(a->capacity*sizeof(int64_t)); // NOTE(review): result is not checked for null.
+    for (int i=0; i < index; i++) cuts1[i] = a->cuts[i]; // Cuts before `index` keep their position.
+    for (int i=index; i < a->count; i++) cuts1[i+2] = a->cuts[i]; // Cuts from `index` on move up by two.
+    free(a->cuts);
+    a->cuts = cuts1;
+  } else {
+    for (int i=a->count-1; index <= i; i--) a->cuts[i+2] = a->cuts[i]; // Shift in place, highest index first.
+  }
+  a->cuts[index+0] = lo;
+  a->cuts[index+1] = hi;
+  a->count += 2;
+
+  // Filter pairs of adjacent repeated values from cuts[]. Since these mark
+  // boundaries where segments transition between full<->empty, dropping such a
+  // pair fuses two adjacent segments together. Examples:
+  //   [1,2,3,3,4] -> [1,2,4]
+  //   [1,2,3,3,3,4] -> [1,2,3,4] // have to leave one 3 because it's a full<->empty transition
+  //   [1,2,3,3,3,3,4] -> [1,2,4]
+  // Leading zeros don't have to be in pairs, they are always dropped:
+  //   [0,1,2] -> [1,2]
+  //   [0,0,1,2] -> [1,2]
+  int r = index, w = index; // Read and write cursors.
+  int64_t prev = r==0 ? 0 : a->cuts[r-1]; // Seeding with 0 at the start makes a leading zero cut count as a repeat.
+  while (r < a->count) {
+    int64_t cur = a->cuts[r++];
+    a->cuts[w++] = cur;
+    if (prev == cur) { // Repeated value is an empty segment which can be deleted.
+      // Erase last two cuts or just one if we're at the start.
+      w -= w==1 ? 1 : 2;
+      // Zeros can only occur at the beginning (due to being sorted). We want to
+      // drop any number of zeros, but only even numbers of other repeated values.
+      // So set to zero here, which will make prev=0, thus if next value is zero
+      // it will be dropped but if it's not zero then it will need to begin a new
+      // pair to be dropped.
+      cur = 0;
     }
-    bit++;
+    prev = cur;
   }
-  // temporarily align the alignment to NCCL_REC_PAGE_SIZE
-  ALIGN_SIZE(alignment, NCCL_REC_PAGE_SIZE);
-
-  CUCHECKGOTO(cuDeviceGet(&cuDev, comm->cudaDev), ret, fail);
-  memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
-  memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-  memprop.requestedHandleType = ncclCuMemHandleType;
-  memprop.location.id = cuDev;
-  CUCHECKGOTO(cuMemGetAllocationGranularity(&granularity, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail);
-  ALIGN_SIZE(allocSize, granularity);
-
-  CUCHECKGOTO(cuMemCreate(&memHandle, allocSize, &memprop, 0), ret, fail);
-  ALIGN_SIZE(comm->symAllocHead, alignment);
-  NCCLCHECKGOTO(ncclIpcSymmetricMap(comm, comm->symAllocHead, allocSize, memHandle, &regSymAddr), ret, fail);
-  NCCLCHECKGOTO(ncclNvlsSymmetricMap(comm, comm->symAllocHead, allocSize, regSymAddr), ret, fail);
-  NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
-  comm->symAllocHead += allocSize;
-  *symPtr = regSymAddr;
-
-exit:
-  return ret;
-fail:
-  *symPtr = NULL;
-  goto exit;
+  a->count = w; // Everything past the write cursor has been compacted away.
 }

-ncclResult_t ncclCommSymmetricFreeInternal(struct ncclComm* comm, void* symPtr) {
-  CUmemGenericAllocationHandle handle;
-  size_t size = 0;
-  ncclResult_t ret = ncclSuccess;
-  int saveDev = comm->cudaDev;
-  CUDACHECKGOTO(cudaGetDevice(&saveDev), ret, fail);
-  if (ncclCuMemEnable()) {
-    CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
-    CUCHECKGOTO(cuMemRetainAllocationHandle(&handle, symPtr), ret, fail);
-    CUCHECKGOTO(cuMemRelease(handle), ret, fail);
-    CUCHECKGOTO(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)symPtr), ret, fail);
-    NCCLCHECKGOTO(ncclNvlsSymmetricFree(comm, size, symPtr), ret, fail);
-    NCCLCHECKGOTO(ncclIpcSymmetricFree(comm, size, symPtr), ret, fail);
-    CUCHECKGOTO(cuMemRelease(handle), ret, fail);
+ncclResult_t ncclSpaceAlloc( // First-fit allocation of `size` units below `limit`; writes the chosen offset to *outOffset.
+    struct ncclSpace* a, int64_t limit, int64_t size, int align, // `limit` bounds only the final (frontier) empty segment.
+    int64_t* outOffset // Written only on success.
+  ) {
+  // When allocating we try to locate the first empty segment which can hold
+  // the allocation and move its lower cut upward.
+  int i = a->count%2; // First empty segment ends at cuts[i]
+  size_t off;
+  while (i <= a->count) { // i == count is the frontier segment after the last cut.
+    size_t lo = i == 0 ? 0 : a->cuts[i-1];
+    size_t hi = i == a->count ? limit : a->cuts[i];
+    off = alignUp(lo, align); // presumably rounds `lo` up to a multiple of `align` -- confirm against alignUp's definition.
+    if (off + size <= hi) {
+      *outOffset = off;
+      if (i == 0 || off + size == hi) { // Slow path required.
+        insertSegment(a, i, off, off+size);
+      } else { // We can just append to the end of a full segment.
+        a->cuts[i-1] = off + size; // Any alignment gap [lo, off) becomes part of the full segment below.
+      }
+      return ncclSuccess;
+    }
+    i += 2; // Next empty segment
   }
-exit:
-  CUDACHECK(cudaSetDevice(saveDev));
-  return ret;
-fail:
-  goto exit;
+  WARN("Allocation failed. No suitable space found to accommodate size=0x%lx within limit=0x%lx", (long)size, (long)limit);
+  return ncclInternalError; // No empty segment, including the frontier, could hold the request.
+}
+
+ncclResult_t ncclSpaceFree(struct ncclSpace* a, int64_t offset, int64_t size) { // Marks [offset, offset+size) as empty again.
+  if (a->count == 0 || a->cuts[a->count-1] <= offset) { // Nothing allocated, or offset is at/after the last cut.
+    WARN("No allocation found at offset=0x%lx", (long)offset);
+    return ncclInternalError;
+  }
+
+  // This could be binary search, but since allocate is linear there's no point.
+  int i = 1 - a->count%2; // First full segment ends at cuts[i]
+  while (a->cuts[i] <= offset) i += 2; // Terminates: the check above guarantees the last cut is > offset.
+
+  int64_t lo = i==0 ? 0 : a->cuts[i-1];
+  int64_t hi = a->cuts[i];
+
+  if (offset < lo || hi < offset + size) { // The range must lie inside the single full segment [lo, hi).
+    WARN("Given size=0x%lx extends beyond allocation.", (long)size);
+    return ncclInternalError;
+  }
+
+  // First try the two fast cases which just shrink a segment from one side.
+  if (i != 0 && lo == offset && offset + size != hi) {
+    a->cuts[i-1] = offset + size; // Bring bottom up.
+  } else if (lo != offset && offset + size == hi) {
+    a->cuts[i] = offset; // Bring top down.
+  } else { // Slow path.
+    insertSegment(a, i, offset, offset+size); // Splits the segment, or removes it when the whole segment is freed.
+  }
+  return ncclSuccess;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// ncclShadowPool:
+
+struct ncclShadowPage { // A contiguous block of (at most) 64 objects
+  struct ncclShadowPage* next; // Link in the pool's list of pages that still have free slots.
+  int objSize; // Size in bytes of each slot in this page.
+  uint64_t freeMask; // Bit i set means slot i is free; 0 means the page is full.
+  void* devObjs; // Device allocation holding the slots; slot i starts at devObjs + i*objSize.
+};
+struct ncclShadowObject { // Hash-table entry pairing one device object with its host shadow.
+  struct ncclShadowObject* next; // Next entry in the same hash bucket.
+  void* devObj; // Device address; also the hash key.
+  void* hostObj; // Host shadow buffer, placed after this struct in the same malloc block.
+  struct ncclShadowPage* page; // null if not allocated in page but directly in CUDA mempool.
+};
+
+void ncclShadowPoolConstruct(struct ncclShadowPool* pool) { // Puts the pool in its empty state; memPool is left unset here.
+  pool->hbits = 0; // 0 means "not yet initialized"; the first ncclShadowPoolAlloc sets it to 4.
+  pool->count = 0;
+  pool->table = nullptr;
+  pool->pages = nullptr;
+}
+
+ncclResult_t ncclShadowPoolDestruct(struct ncclShadowPool* pool) { // Frees every object, page, the hash table and the CUDA memory pool.
+  if (pool->hbits != 0) { // hbits stays 0 until the first allocation, in which case there is nothing to release.
+    cudaStream_t stream;
+    CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); // Temporary stream for the async frees below.
+
+    if (pool->count != 0) {
+      for (int i=0; i < 1<<pool->hbits; i++) { // Visit every hash bucket.
+        struct ncclShadowObject* obj = pool->table[i];
+        while (obj != nullptr) {
+          struct ncclShadowPage* page = obj->page;
+          if (page != nullptr) {
+            if (page->freeMask == 0) { // Put full pages back into page list.
+              page->freeMask = 1; // Non-zero so another object of the same page does not relink it.
+              page->next = pool->pages;
+              pool->pages = page;
+            }
+          } else {
+            cudaFreeAsync(obj->devObj, stream); // Object was allocated directly from the memory pool.
+          }
+          struct ncclShadowObject* next = obj->next; // Read the link before freeing the node.
+          free(obj);
+          obj = next;
+        }
+      }
+    }
+    free(pool->table);
+
+    while (pool->pages != nullptr) { // Every page is in this list by now, including the relinked full ones.
+      cudaFreeAsync(pool->pages->devObjs, stream);
+      struct ncclShadowPage* next = pool->pages->next;
+      free(pool->pages);
+      pool->pages = next;
+    }
+
+    cudaStreamSynchronize(stream); // NOTE(review): results of the free/sync/destroy calls are not checked.
+    cudaStreamDestroy(stream);
+    cudaMemPoolDestroy(pool->memPool);
+  }
+  return ncclSuccess;
+}
+
+static int hashBucket(int hbits, void* devObj) { // Maps a device pointer to a bucket index in [0, 1<<hbits).
+  uintptr_t h = reinterpret_cast<uintptr_t>(devObj);
+  h ^= h>>32; // Fold the upper half of the address into the lower half.
+  h *= 0x9e3779b97f4a7c13; // Multiplicative mixing with a 64-bit odd constant.
+  return (uint64_t)h >> (64-hbits); // Keep the top hbits bits; hbits must be non-zero for this shift to be defined.
+}
+
+static void hashInsert(struct ncclShadowPool* pool, struct ncclShadowObject* obj) { // Pushes `obj` at the head of its bucket's chain.
+  int b = hashBucket(pool->hbits, obj->devObj); // Uses the pool's current hbits, so it also serves rehashing.
+  obj->next = pool->table[b];
+  pool->table[b] = obj;
+}
+
+ncclResult_t ncclShadowPoolAlloc( // Allocates a zeroed device object of `size` bytes plus a zeroed host shadow of the same size.
+    struct ncclShadowPool* pool, size_t size, void** outDevObj, void** outHostObj, // Either out pointer may be null.
+    cudaStream_t stream // Device allocations and memsets are enqueued on this stream.
+  ) {
+  if (size == 0) { // Zero-sized request: report null pointers and leave the pool untouched.
+    if (outDevObj) *outDevObj = nullptr;
+    if (outHostObj) *outHostObj = nullptr;
+    return ncclSuccess;
+  }
+
+  int hbits = pool->hbits;
+  if (hbits == 0) { // First allocation: create the CUDA memory pool and a 16-bucket hash table.
+    cudaMemPoolProps props = {};
+    props.allocType = cudaMemAllocationTypePinned;
+    props.handleTypes = cudaMemHandleTypeNone;
+    props.location.type = cudaMemLocationTypeDevice;
+    CUDACHECK(cudaGetDevice(&props.location.id)); // Checked so a failure cannot create the pool on an unintended device id.
+    CUDACHECK(cudaMemPoolCreate(&pool->memPool, &props));
+
+    pool->hbits = hbits = 4;
+    pool->table = (struct ncclShadowObject**)malloc(sizeof(struct ncclShadowObject*)<<hbits);
+    for (int i=0; i < 1<<hbits; i++) pool->table[i] = nullptr;
+  }
+
+  // Check for hash table size increase before inserting. Maintain 2:1 object:bucket ratio.
+  if (pool->count+1 > 2<<hbits) {
+    struct ncclShadowObject** table0 = pool->table;
+    struct ncclShadowObject** table1 = (struct ncclShadowObject**)malloc(sizeof(struct ncclShadowObject*)<<(hbits+1));
+    pool->table = table1; // Install the new table first so hashInsert() below targets it.
+    pool->hbits = hbits+1;
+    for (int i1=0; i1 < 2<<hbits; i1++) table1[i1] = nullptr;
+    for (int i0=0; i0 < 1<<hbits; i0++) { // Rehash every chain of the old table.
+      struct ncclShadowObject* obj = table0[i0];
+      while (obj) {
+        struct ncclShadowObject* next = obj->next; // hashInsert overwrites obj->next.
+        hashInsert(pool, obj);
+        obj = next;
+      }
+    }
+    hbits += 1; // match pool->hbits
+    free(table0);
+  }
+
+  struct ncclShadowPage* page;
+  void *devObj;
+  if ((64<<10)/size >= 3) { // Small objects (at least three fit in 64 KiB) are carved out of shared pages.
+    int shift = std::max<int>(0, (int)log2Down(size) + 1 - 4); // presumably log2Down is floor(log2) -- confirm in its definition.
+    int pageObjSize = ((size + (1<<shift)-1)>>shift)<<shift; // Round size up to a multiple of 1<<shift.
+    struct ncclShadowPage** pagePtr = &pool->pages;
+    while (true) { // Look for a page of matching slot size with a free slot; create one if none exists.
+      page = *pagePtr;
+      if (page == nullptr) {
+        size_t pageSize = std::min<size_t>(64<<10, 64*pageObjSize);
+        CUDACHECK(cudaMallocFromPoolAsync(&devObj, pageSize, pool->memPool, stream)); // Before linking: a failure must not leave a page with garbage devObjs in the list.
+        CUDACHECK(cudaMemsetAsync(devObj, 0, pageSize, stream));
+        page = (struct ncclShadowPage*)malloc(sizeof(struct ncclShadowPage));
+        page->objSize = pageObjSize;
+        page->freeMask = uint64_t(-1)>>(64 - pageSize/pageObjSize); // One set bit per slot.
+        page->devObjs = devObj;
+        page->next = pool->pages;
+        pool->pages = page; // fall through...
+      }
+      if (page->objSize == pageObjSize) {
+        int slot = popFirstOneBit(&page->freeMask); // presumably clears and returns the lowest set bit -- confirm in its definition.
+        devObj = (char*)page->devObjs + slot*pageObjSize;
+        if (page->freeMask == 0) *pagePtr = page->next; // Remove full page from list.
+        break;
+      }
+      pagePtr = &page->next;
+    }
+  } else { // Large objects get their own allocation from the memory pool.
+    page = nullptr;
+    CUDACHECK(cudaMallocFromPoolAsync(&devObj, size, pool->memPool, stream));
+    CUDACHECK(cudaMemsetAsync(devObj, 0, size, stream));
+  }
+
+  struct ncclShadowObject* obj = (struct ncclShadowObject*)malloc( // Node and host shadow share one malloc block.
+    sizeof(struct ncclShadowObject) + /*padding=*/alignof(max_align_t)-1 + size
+  );
+  obj->page = page;
+  obj->devObj = devObj;
+  obj->hostObj = alignUp((char*)(obj+1), alignof(max_align_t)); // Shadow starts at the first aligned address after the node.
+  memset(obj->hostObj, 0, size);
+  hashInsert(pool, obj);
+  pool->count += 1;
+  if (outDevObj) *outDevObj = devObj;
+  if (outHostObj) *outHostObj = obj->hostObj;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclShadowPoolFree(struct ncclShadowPool* pool, void* devObj, cudaStream_t stream) { // Releases the object registered under `devObj`.
+  if (devObj == nullptr) return ncclSuccess;
+
+  struct ncclShadowObject* none = nullptr; // Empty stand-in bucket: a never-used pool has hbits==0 and no table to index.
+  struct ncclShadowObject** pobj = pool->hbits == 0 ? &none : &pool->table[hashBucket(pool->hbits, devObj)];
+  while (true) { // Walk the bucket chain, keeping a pointer to the link so the node can be unlinked.
+    if (*pobj == nullptr) {
+      WARN("Device object does not exist in shadow pool.");
+      return ncclInternalError;
+    }
+    if ((*pobj)->devObj == devObj) break;
+    pobj = &(*pobj)->next;
+  }
+  struct ncclShadowObject* obj = *pobj;
+  *pobj = obj->next; // Unlink from the hash chain.
+  if (obj->page != nullptr) {
+    if (obj->page->freeMask == 0) { // A full page is not in pool->pages; relink it now that a slot frees up.
+      obj->page->next = pool->pages;
+      pool->pages = obj->page;
+    }
+    int slot = ((char*)obj->devObj - (char*)obj->page->devObjs)/obj->page->objSize;
+    obj->page->freeMask |= uint64_t(1)<<slot; // Mark the slot free; the page's device memory is kept.
+  } else {
+    CUDACHECK(cudaFreeAsync(devObj, stream)); // Object had its own allocation from the memory pool.
+  }
+  free(obj); // Also releases the host shadow, which lives in the same block.
+  pool->count -= 1;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclShadowPoolToHost(struct ncclShadowPool* pool, void* devObj, void** hostObj) { // Looks up the host shadow registered for `devObj`.
+  if (devObj == nullptr) { // Null maps to null.
+    *hostObj = nullptr;
+    return ncclSuccess;
+  }
+
+  bool unused = pool->hbits == 0; // Never-used pool: there is no table, and hashBucket() would shift by 64 bits.
+  struct ncclShadowObject* obj = unused ? nullptr : pool->table[hashBucket(pool->hbits, devObj)];
+  while (true) { // Walk the bucket chain until the device address matches.
+    if (obj == nullptr) {
+      WARN("Device object does not exist in shadow pool.");
+      return ncclInternalError;
+    }
+    if (obj->devObj == devObj) break;
+    obj = obj->next;
+  }
+  *hostObj = obj->hostObj; // Written only on success or for a null devObj.
+  return ncclSuccess;
 }
@@ -15,6 +15,7 @@
 #include "signals.h" // [RCCL]
 #include "param.h"
 #include "ras.h"
+#include <mutex>

 #define BOOTSTRAP_N_CHECK_ABORT           10000
 #define BOOTSTRAP_TAG_CONNECT             (0x1 << 31)
@@ -86,13 +87,13 @@ struct bootstrapRootArgs {
 static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1];
 static union ncclSocketAddress bootstrapNetIfAddr;
 static int bootstrapNetInitDone = 0;
-pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;
+static std::mutex bootstrapNetMutex;

 NCCL_PARAM(BootstrapNetEnable,"OOB_NET_ENABLE", 0);

 ncclResult_t bootstrapNetInit() {
  if (bootstrapNetInitDone == 0) {
-    pthread_mutex_lock(&bootstrapNetLock);
+    std::lock_guard<std::mutex> lock(bootstrapNetMutex);
    if (bootstrapNetInitDone == 0) {
      const char* env = ncclGetEnv("NCCL_COMM_ID");
      int nIfs = 0;
@@ -100,21 +101,18 @@ ncclResult_t bootstrapNetInit() {
        union ncclSocketAddress remoteAddr;
        if (ncclSocketGetAddrFromString(&remoteAddr, env) != ncclSuccess) {
          WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
-          pthread_mutex_unlock(&bootstrapNetLock);
          return ncclInvalidArgument;
        }
        NCCLCHECK(ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE,
                                               &nIfs));
        if (nIfs <= 0) {
          WARN("NET/Socket : No usable listening interface found");
-          pthread_mutex_unlock(&bootstrapNetLock);
          return ncclSystemError;
        }
      } else {
        NCCLCHECK(ncclFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1, &nIfs));
        if (nIfs <= 0) {
          WARN("Bootstrap : no socket interface found");
-          pthread_mutex_unlock(&bootstrapNetLock);
          return ncclInvalidUsage;
        }
      }
@@ -124,7 +122,6 @@ ncclResult_t bootstrapNetInit() {
      INFO(NCCL_BOOTSTRAP, "Bootstrap: Using%s", line);
      bootstrapNetInitDone = 1;
    }
-    pthread_mutex_unlock(&bootstrapNetLock);
  }
  return ncclSuccess;
 }
@@ -486,7 +483,7 @@ static ncclResult_t getUDS(uint64_t* peerUDS) {
 static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
  static int devOOB = -1;
  if (devOOB < 0) {
-    pthread_mutex_lock(&bootstrapNetLock);
+    std::lock_guard<std::mutex> lock(bootstrapNetMutex);
    if (devOOB < 0) {
      const char* userIfEnv = ncclGetEnv("NCCL_OOB_NET_IFNAME");
      if (userIfEnv && strlen(userIfEnv) > 0) {
@@ -517,7 +514,6 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
            WARN("no device found matching %s%s, verify NCCL_OOB_NET_IFNAME", searchExact ? "exactly " : "", userIfEnv);
          else
            WARN("no device found after excluding %s%s, verify NCCL_OOB_NET_IFNAME", searchExact ? "exactly " : "", userIfEnv);
-          pthread_mutex_unlock(&bootstrapNetLock);
          return ncclInvalidArgument;
        }
      } else {
@@ -530,13 +526,12 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
      bool hasProp = res == ncclSuccess;
      INFO(NCCL_BOOTSTRAP, "Bootstrap: Using %s:%d", (hasProp) ? props.name : "N/A", (hasProp) ? props.port : -1);
    }
-    pthread_mutex_unlock(&bootstrapNetLock);
  }
  *dev = devOOB;
  return ncclSuccess;
 }

-static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* listen, char peerHandle[NCCL_NET_HANDLE_MAXSIZE],
+static ncclResult_t netRingConnect(void* ctx, ncclNet_t* net, struct bootstrapListen_t* listen, char peerHandle[NCCL_NET_HANDLE_MAXSIZE],
                                   void** sendComm, ncclNetDeviceHandle_t** sendDevHandle,
                                   void** recvComm, ncclNetDeviceHandle_t** recvDevHandle, volatile uint32_t* abortFlag) {

@@ -544,7 +539,7 @@ static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* lis
  do {
    NCCLCHECK(checkAbort(abortFlag, &abortCounter));
    if (!*sendComm)
-      NCCLCHECK(net->connect(listen->net.dev, NULL, peerHandle, sendComm, sendDevHandle));
+      NCCLCHECK(net->connect(ctx, listen->net.dev, peerHandle, sendComm, sendDevHandle));
    if (!*recvComm)
      NCCLCHECK(net->accept(listen->net.comm, recvComm, recvDevHandle));
  } while (!*sendComm || !*recvComm);
@@ -660,7 +655,7 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
  if (ncclParamBootstrapNetEnable()) {
    // Create net interface for other ranks to contact me (all gather)
    NCCLCHECK(netGetDevice(rank, comm, &STATE_LISTEN(state, net.dev)));
-    NCCLCHECK(state->net->listen(STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)));
+    NCCLCHECK(state->net->listen(comm->netContext, STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)));
    memcpy(info.connectInfo.handle, STATE_LISTEN(state, net.handle), NCCL_NET_HANDLE_MAXSIZE);
  } else {
    // create socket for ring neightbor to contact mee
@@ -714,7 +709,7 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {

  // accept and connect the ring network
  if (ncclParamBootstrapNetEnable()) {
-    NCCLCHECK(netRingConnect(state->net, &state->listen, nextPeer.handle,
+    NCCLCHECK(netRingConnect(comm->netContext, state->net, &state->listen, nextPeer.handle,
                             &STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle),
                             &STATE_RING(state, net.recvComm), &STATE_RING(state, net.recvDevHandle), state->abortFlag));
  } else {
@@ -807,7 +802,7 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo
  // create a handle for the others to reach out to me
  if (ncclParamBootstrapNetEnable()) {
    NCCLCHECKGOTO(netGetDevice(rank, comm, &STATE_LISTEN(state, net.dev)), ret, fail);
-    NCCLCHECKGOTO(state->net->listen(STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)), ret, fail);
+    NCCLCHECKGOTO(state->net->listen(comm->netContext, STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)), ret, fail);
    memcpy(info.handle, STATE_LISTEN(state, net.handle), NCCL_NET_HANDLE_MAXSIZE);
  } else {
    // create socket for ring neightbor to contact mee
@@ -826,7 +821,7 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo
  NCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, BOOTSTRAP_TAG_COMMSPLIT, &info, sizeof(union ringConnectInfo)), ret, fail);
  NCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, BOOTSTRAP_TAG_COMMSPLIT, &nextPeer, sizeof(union ringConnectInfo)), ret, fail);
  if (ncclParamBootstrapNetEnable()) {
-    NCCLCHECKGOTO(netRingConnect(state->net, &state->listen, nextPeer.handle,
+    NCCLCHECKGOTO(netRingConnect(comm->netContext, state->net, &state->listen, nextPeer.handle,
                                 &STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle),
                                 &STATE_RING(state, net.recvComm), &STATE_RING(state, net.recvDevHandle), state->abortFlag),
                  ret, fail);
@@ -0,0 +1,615 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "register_inline.h"
+#include <cuda.h>
+#include "rocmwrap.h"
+#include "ce_coll.h"
+#include "alloc.h"
+
+// Flag value written to and awaited on the ready/complete slots while a CUDA graph is being captured (used instead of the per-call sequence number)
+static const uint32_t GRAPH_SYNC_VALUE = 1;
+
+// Defaults for intra-batch synchronization, meant to improve CE collective performance at large scale
+// Default number of copies issued between two intra-batch synchronizations (copied into ceColl.intraBatchSyncFreq by ncclCeInit)
+static const uint32_t CE_COLL_INTRA_BATCH_SYNC_FREQ = 8;
+// Default total-size threshold, 512 MiB, from which intra-batch synchronization is considered (copied into ceColl.intraBatchSyncMsgThreshold)
+static const uint64_t CE_COLL_INTRA_BATCH_SYNC_MSG_THRESHOLD = 512*1024*1024;
+
+ncclResult_t ncclCeInit(struct ncclComm* comm) {
+  ncclResult_t ret = ncclSuccess;
+  // Creates the CE sync window: two 16-byte-aligned uint32_t arrays (ready flags, then complete flags) with one slot per rank.
+  uint8_t* ceDevBase = nullptr;
+  size_t ceDevBaseSize = alignUp(comm->nRanks*sizeof(uint32_t), 16) * 2;
+  ncclWindow_vidmem* ceWinDev = nullptr;
+  ncclWindow_vidmem* ceWinDevHost = nullptr;
+  // Ensure symmetric memory runtime is initialized
+  NCCLCHECKGOTO(ncclDevrInitOnce(comm), ret, fail);
+  // Allocate and register memory for the symmetric memory
+  NCCLCHECKGOTO(ncclMemAlloc((void**)&ceDevBase, ceDevBaseSize), ret, fail);
+  NCCLCHECKGOTO(ncclDevrWindowRegisterInGroup(comm, ceDevBase, ceDevBaseSize, NCCL_WIN_COLL_SYMMETRIC, &ceWinDev), ret, fail);
+  NCCLCHECKGOTO(ncclShadowPoolToHost(&comm->devrState.shadows, ceWinDev, &ceWinDevHost), ret, fail);
+  // Get the ncclDevrWindow from the winHost field
+  comm->ceColl.ceSyncWin = (struct ncclDevrWindow*)ceWinDevHost->winHost;
+  comm->ceColl.baseUCSymReadyOffset = 0;
+  comm->ceColl.baseUCSymComplOffset = alignUp(comm->nRanks*sizeof(uint32_t), 16);
+  comm->ceColl.baseUCSymReadyPtr = (uint8_t*)comm->ceColl.ceSyncWin->userPtr + comm->ceColl.baseUCSymReadyOffset;
+  comm->ceColl.baseUCSymComplPtr = (uint8_t*)comm->ceColl.ceSyncWin->userPtr + comm->ceColl.baseUCSymComplOffset;
+  comm->ceColl.ceSeqNum = 0;
+  comm->ceColl.useCompletePtr = false;
+  comm->ceColl.intraBatchSyncFreq = CE_COLL_INTRA_BATCH_SYNC_FREQ;
+  comm->ceColl.intraBatchSyncMsgThreshold = CE_COLL_INTRA_BATCH_SYNC_MSG_THRESHOLD;
+  INFO(NCCL_INIT, "Init CE, rank %d baseUCSymReadyPtr %p, baseUCSymComplPtr %p, seq num %d", comm->rank, comm->ceColl.baseUCSymReadyPtr, comm->ceColl.baseUCSymComplPtr, comm->ceColl.ceSeqNum);
+
+exit:
+  return ret;
+fail:
+  if (ceWinDev != nullptr) (void)ncclCommWindowDeregister(comm, ceWinDev); // nothing was published to comm->ceColl on this path, so undo what this call created
+  if (ceDevBase != nullptr) (void)ncclMemFree(ceDevBase); // best-effort release; the original error in `ret` is what gets returned
+  goto exit;
+}
+
+ncclResult_t ncclCeFinalize(struct ncclComm* comm) {
+  ncclResult_t ret = ncclSuccess;
+  // Tears down CE state: drops queued init tasks, then releases the sync window and the memory behind the flag arrays.
+  // Drain every task still sitting in ceInitTaskQueue
+  for (;;) {
+    if (ncclIntruQueueEmpty(&comm->ceInitTaskQueue)) break;
+    free(ncclIntruQueueDequeue(&comm->ceInitTaskQueue));
+  }
+
+  // CE resources are present only when the ready-flag base pointer has been set
+  if (comm->ceColl.baseUCSymReadyPtr == NULL) goto exit;
+  if (comm->ceColl.ceSyncWin != NULL && comm->ceColl.ceSyncWin->vidmem) {
+    // Deregister the window first, then free the allocation the flag arrays live in
+    NCCLCHECKGOTO(ncclCommWindowDeregister(comm, comm->ceColl.ceSyncWin->vidmem), ret, fail);
+    NCCLCHECKGOTO(ncclMemFree(comm->ceColl.baseUCSymReadyPtr), ret, fail);
+  }
+  comm->ceColl.ceSyncWin = NULL;
+  comm->ceColl.baseUCSymComplPtr = NULL;
+  comm->ceColl.baseUCSymReadyPtr = NULL;
+
+exit:
+  return ret;
+fail:
+  goto exit;
+}
+
+bool ncclCeImplemented(ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty) {
+  // Reports whether the CE path handles `coll` with the installed driver.
+  // `red` and `ty` are accepted but not consulted.
+  int driverVersion;
+  if (ncclCudaDriverVersion(&driverVersion) != ncclSuccess) return false;
+
+  // CE is supported in CUDA 12.5 and later
+  if (driverVersion < 12050) return false;
+
+  // Only these four collectives have a CE implementation
+  if (coll == ncclFuncAllGather) return true;
+  if (coll == ncclFuncAlltoAll) return true;
+  if (coll == ncclFuncScatter) return true;
+  if (coll == ncclFuncGather) return true;
+
+  // Everything else is reported as not implemented
+  return false;
+}
+
+ncclResult_t ncclPrepMCSync(struct ncclComm* comm, bool isComplete, hipStreamBatchMemOpParams* batchParams, size_t* opIdx, cudaStream_t stream) {
+  ncclResult_t ret = ncclSuccess;
+  // Multicast sync: publishes this rank's flag with one copy to the multicast address, then appends a wait on every other rank's slot to batchParams.
+  uint32_t* readyPtrs    = (uint32_t*)comm->ceColl.baseUCSymReadyPtr;
+  uint32_t* completePtrs = (uint32_t*)comm->ceColl.baseUCSymComplPtr;
+  // isComplete selects the complete-flag array instead of the ready-flag array; *opIdx is advanced past every entry written here.
+  bool capturing = ncclCudaGraphValid(comm->planner.capturingGraph);
+  uint32_t currentSeq = ++comm->ceColl.ceSeqNum;
+
+  // Source pointer is either the constant graph sync value or the sequence number
+  void* srcPtr = capturing ? (void*)&GRAPH_SYNC_VALUE : (void*)&currentSeq;
+  // Wait value is either the constant graph sync value or the sequence number
+  uint32_t waitValue = capturing ? GRAPH_SYNC_VALUE : currentSeq;
+
+  // Use multi-cast address as destination pointer
+  void* mcDstPtr;
+  void* dstPtr = isComplete ? (void*)&completePtrs[comm->rank] : (void*)&readyPtrs[comm->rank];
+  size_t offset = (uint8_t*)dstPtr - (uint8_t*)comm->ceColl.ceSyncWin->userPtr;
+  NCCLCHECKGOTO(ncclDevrGetLsaTeamPtrMC(comm, comm->ceColl.ceSyncWin, offset, ncclTeamLsa(comm), &mcDstPtr), ret, fail);
+  // NOTE(review): outside graph capture srcPtr refers to the local `currentSeq`; confirm the async copy has consumed it before this function returns.
+  // Write our own ready/complete flag to the multi-cast address
+  CUDACHECKGOTO(cudaMemcpyAsync(
+    mcDstPtr,
+    srcPtr,
+    sizeof(uint32_t),
+    cudaMemcpyHostToDevice,
+    stream), ret, fail);
+
+  // Add local wait operations for every other rank
+  for (int r = 0; r < comm->nRanks; ++r) {
+    if (r == comm->rank) continue;
+    batchParams[*opIdx] = {};
+    batchParams[*opIdx].waitValue.operation = hipStreamMemOpWaitValue32; // required: value-initialisation alone leaves operation at 0, which selects no mem-op
+    batchParams[*opIdx].waitValue.address = (CUdeviceptr)(isComplete ? (void*)&completePtrs[r] : (void*)&readyPtrs[r]);
+    batchParams[*opIdx].waitValue.value = waitValue;
+    batchParams[*opIdx].waitValue.flags = CU_STREAM_WAIT_VALUE_EQ;
+    (*opIdx)++;
+  }
+
+exit:
+  return ret;
+fail:
+  goto exit;
+}
+
+ncclResult_t ncclPrepUCSync(struct ncclComm* comm, bool isComplete,
+                               hipStreamBatchMemOpParams* batchParams,
+                               size_t* opIdx) {
+  ncclResult_t ret = ncclSuccess;
+  // Unicast sync: appends one write of this rank's flag per peer, then one wait per peer on that peer's slot, to batchParams.
+  uint32_t* readyPtrs    = (uint32_t*)comm->ceColl.baseUCSymReadyPtr;
+  uint32_t* completePtrs = (uint32_t*)comm->ceColl.baseUCSymComplPtr;
+  // isComplete selects the complete-flag array instead of the ready-flag array; *opIdx is advanced past every entry written here.
+  bool capturing = ncclCudaGraphValid(comm->planner.capturingGraph);
+  uint32_t currentSeq = ++comm->ceColl.ceSeqNum;
+
+  // Write our own ready/complete flag to remote ranks
+  uint32_t waitValue = capturing ? GRAPH_SYNC_VALUE : currentSeq;
+  for (int r = 0; r < comm->nRanks; ++r) {
+    if (r == comm->rank) continue;
+    void * peerDstPtr;
+    void* dstPtr = isComplete ? (void*)&completePtrs[comm->rank] : (void*)&readyPtrs[comm->rank];
+    size_t offset = (uint8_t*)dstPtr - (uint8_t*)comm->ceColl.ceSyncWin->userPtr;
+    NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, comm->ceColl.ceSyncWin, offset, r, &peerDstPtr), ret, fail);
+    batchParams[*opIdx] = {};
+    batchParams[*opIdx].writeValue.operation = hipStreamMemOpWriteValue32; // required: value-initialisation alone leaves operation at 0, which selects no mem-op
+    batchParams[*opIdx].writeValue.address  = (CUdeviceptr)peerDstPtr;
+    batchParams[*opIdx].writeValue.value = waitValue;
+    // writeValue.flags keeps the 0 it received from the value-initialisation above
+    (*opIdx)++;
+  }
+
+  // Add local wait operations for every other rank
+  for (int r = 0; r < comm->nRanks; ++r) {
+    if (r == comm->rank) continue;
+    batchParams[*opIdx] = {};
+    batchParams[*opIdx].waitValue.operation = hipStreamMemOpWaitValue32; // same reason as for the writes above
+    batchParams[*opIdx].waitValue.address  = (CUdeviceptr)(isComplete ? (void*)&completePtrs[r] : (void*)&readyPtrs[r]);
+    batchParams[*opIdx].waitValue.value = waitValue;
+    batchParams[*opIdx].waitValue.flags = CU_STREAM_WAIT_VALUE_EQ;
+    (*opIdx)++;
+  }
+
+exit:
+  return ret;
+fail:
+  goto exit;
+}
+
+
+ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream) {
+  ncclResult_t ret = ncclSuccess;
+  // Enqueues one cross-rank synchronization on `stream`; successive calls alternate between the ready and the complete flag arrays.
+  // Get pointers to the ready and complete synchronization arrays
+  uint32_t* readyPtrs = (uint32_t*)comm->ceColl.baseUCSymReadyPtr;
+  uint32_t* completePtrs = (uint32_t*)comm->ceColl.baseUCSymComplPtr;
+
+  // Allocate enough slots for all possible ops
+  size_t batchSize = (comm->nvlsSupport ? NCCL_CE_SYNC_OPS_PER_RANK_MC : NCCL_CE_SYNC_OPS_PER_RANK_UC) * comm->nRanks;
+  size_t opIdx = 0;
+
+  // Prepare batch memory operations for synchronization
+  hipStreamBatchMemOpParams* batchParams = nullptr;
+  NCCLCHECKGOTO(ncclCalloc(&batchParams, batchSize), ret, fail);
+
+  if (comm->nvlsSupport) {
+    NCCLCHECKGOTO(ncclPrepMCSync(comm, comm->ceColl.useCompletePtr, batchParams, &opIdx, stream), ret, fail);
+  } else {
+    NCCLCHECKGOTO(ncclPrepUCSync(comm, comm->ceColl.useCompletePtr, batchParams, &opIdx), ret, fail);
+  }
+
+  // For CUDA graph capture, add reset operation
+  if (ncclCudaGraphValid(comm->planner.capturingGraph)) {
+    for (int i = 0; i < comm->nRanks; i++) {
+      batchParams[opIdx] = {};
+      batchParams[opIdx].writeValue.operation = hipStreamMemOpWriteValue32; // required: value-initialisation alone leaves operation at 0, which selects no mem-op
+      batchParams[opIdx].writeValue.address = (CUdeviceptr)(comm->ceColl.useCompletePtr ? (void*)&completePtrs[i] : (void*)&readyPtrs[i]);
+      batchParams[opIdx].writeValue.value = 0;
+      // writeValue.flags keeps the 0 it received from the value-initialisation above
+      opIdx++;
+    }
+  }
+
+  // Execute all memory operations in a single batch
+  CUCHECKGOTO(hipStreamBatchMemOp(stream, opIdx, batchParams, 0), ret, fail);
+
+  // Toggle the flag for next call
+  comm->ceColl.useCompletePtr = !comm->ceColl.useCompletePtr;
+
+exit:
+  if (batchParams) free(batchParams);
+  return ret;
+fail:
+  goto exit;
+}
+
+ncclResult_t ncclCeInitBatchOpsParams(struct ncclCeBatchOpsParams* params, int nRanks) {
+  ncclResult_t ret = ncclSuccess;
+  // Reset every field before allocating, so the descriptor is in a known state even if an allocation below fails
+  params->intraBatchSync = false;
+  params->numOps = 0;
+  params->sizes = nullptr;
+  params->dsts = nullptr;
+  params->srcs = nullptr;
+#if CUDART_VERSION >= 12080
+  params->numAttrs = 0;
+  params->attrIdxs = nullptr;
+  params->attrs = nullptr;
+#endif
+  // Give each per-copy array nRanks entries (the value is simply the capacity the caller asks for)
+  NCCLCHECKGOTO(ncclCalloc(&params->srcs, nRanks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&params->dsts, nRanks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&params->sizes, nRanks), ret, fail);
+#if CUDART_VERSION >= 12080
+  NCCLCHECKGOTO(ncclCalloc(&params->attrs, nRanks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&params->attrIdxs, nRanks), ret, fail);
+#endif
+exit:
+  return ret;
+fail:
+  goto exit;
+}
+
+void ncclCeFreeBatchOpsParams(struct ncclCeBatchOpsParams* params) {
+  free(params->sizes);  // free(NULL) is a no-op, so arrays that were never allocated need no guard
+  free(params->dsts);
+  free(params->srcs);
+#if CUDART_VERSION >= 12080
+  free(params->attrIdxs);
+  free(params->attrs);
+#endif
+}
+
+ncclResult_t ncclCeLaunchBatchOps(struct ncclComm* comm, struct ncclCeBatchOpsParams* params, cudaStream_t stream) {
+  ncclResult_t ret = ncclSuccess;
+  // Enqueues the params->numOps device-to-device copies on `stream`, interleaving ncclMemOpSync calls when params->intraBatchSync is set.
+  // Check if there are any operations to perform
+  if (params->numOps == 0) {
+    return ncclSuccess;
+  }
+
+  // Check if we are in a CUDA graph capture
+  bool capturing = ncclCudaGraphValid(comm->planner.capturingGraph);
+
+  int driverVersion;
+  NCCLCHECKGOTO(ncclCudaDriverVersion(&driverVersion), ret, fail);
+  // The batched-copy branch exists only when it is compiled in (CUDART_VERSION >= 12080); in every other case the per-copy loop must run.
+  //--------------Graph capture--------------
+  // cudaMemcpyBatchAsync is not supported during CUDA graph capture
+  if (capturing) {
+    for (int i =0; i < params->numOps; i++) {
+      CUDACHECKGOTO(cudaMemcpyAsync(
+        (void*)params->dsts[i],
+        (void*)params->srcs[i],
+        params->sizes[i],
+        cudaMemcpyDeviceToDevice,
+        stream), ret, fail);
+
+      if (params->intraBatchSync && ((i+1) % comm->ceColl.intraBatchSyncFreq == 0) && ((i+1) < params->numOps)) {
+        NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
+      }
+    }
+  }
+  //--------------No graph capture--------------
+  else {
+#if CUDART_VERSION >= 12080
+    if (driverVersion >= 12080) {
+    // For CUDA 12.8+, use batch memory copy for better performance
+    params->attrs[0] = {};
+    params->attrs[0].srcAccessOrder = cudaMemcpySrcAccessOrderStream;
+    params->attrs[0].flags = cudaMemcpyFlagPreferOverlapWithCompute;
+    params->attrIdxs[0] = 0;
+    params->numAttrs = 1;
+
+    if (params->intraBatchSync) {
+      // Break into multiple batches with sync between them
+      int batchSize = comm->ceColl.intraBatchSyncFreq;
+      for (int i = 0; i < params->numOps; i += batchSize) {
+        int currentBatchSize = (i + batchSize <= params->numOps) ? batchSize : params->numOps - i;
+
+        #if CUDART_VERSION >= 13000
+        CUDACHECKGOTO(cudaMemcpyBatchAsync(
+          &params->dsts[i], &params->srcs[i], &params->sizes[i], currentBatchSize,
+          params->attrs, params->attrIdxs, params->numAttrs, stream), ret, fail);
+        #else
+        CUDACHECKGOTO(cudaMemcpyBatchAsync(
+          &params->dsts[i], &params->srcs[i], &params->sizes[i], currentBatchSize,
+          params->attrs, params->attrIdxs, params->numAttrs, nullptr, stream), ret, fail);
+        #endif
+
+        // Sync after each batch
+        if (i + batchSize < params->numOps) {
+          NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
+        }
+      }
+    } else {
+      // Use single batch for all operations
+      #if CUDART_VERSION >= 13000
+      CUDACHECKGOTO(cudaMemcpyBatchAsync(
+        params->dsts, params->srcs, params->sizes, params->numOps,
+        params->attrs, params->attrIdxs, params->numAttrs, stream), ret, fail);
+      #else
+      CUDACHECKGOTO(cudaMemcpyBatchAsync(
+        params->dsts, params->srcs, params->sizes, params->numOps,
+        params->attrs, params->attrIdxs, params->numAttrs, nullptr, stream), ret, fail);
+      #endif
+    }
+    } else
+#endif
+    { // Fall back to individual transfers: older driver, or the batched-copy branch is not compiled in
+      for (int i = 0; i < params->numOps; i++) {
+        CUDACHECKGOTO(cudaMemcpyAsync(
+          (void*)params->dsts[i],
+          (void*)params->srcs[i],
+          params->sizes[i],
+          cudaMemcpyDeviceToDevice,
+          stream), ret, fail);
+
+        if (params->intraBatchSync && ((i+1) % comm->ceColl.intraBatchSyncFreq == 0) && ((i+1) < params->numOps)) {
+          NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
+        }
+      }
+    }
+  }
+
+exit:
+  return ret;
+fail:
+  goto exit;
+}
+
+
+ncclResult_t ncclCeAllGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) {
+  ncclResult_t ret = ncclSuccess;
+  // CE AllGather: copies this rank's send buffer into slot comm->rank of its own receive buffer and of each peer's receive window.
+  // Calculate the size of each rank's data chunk
+  const size_t chunkBytes = args->nElts * args->eltSize;
+  uint8_t* mySendBuff = (uint8_t*)args->sendBuff;
+  uint8_t* myRecvBuff = (uint8_t*)args->recvBuff + comm->rank * chunkBytes;
+  void* peerRecvBuff;
+  size_t offset;
+
+  struct ncclCeBatchOpsParams batchOpsParams = {};
+  NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&batchOpsParams, comm->nRanks), ret, fail);
+
+  // Ensure all ranks are ready before starting transfers
+  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
+
+  // Copy own data to receive buffer if operation is out-of-place
+  if (myRecvBuff != mySendBuff) {
+    batchOpsParams.srcs[batchOpsParams.numOps] = (void*)mySendBuff;
+    batchOpsParams.dsts[batchOpsParams.numOps] = (void*)myRecvBuff;
+    batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes;
+    batchOpsParams.numOps++;
+  }
+
+  // Copy data to other ranks, visiting peers in the order rank+1, rank+2, ... (mod nRanks)
+  for (int r = 1; r < comm->nRanks; r++) {
+    int targetRank = (comm->rank + r) % comm->nRanks;
+    offset = myRecvBuff - (uint8_t*)args->recvWin->userPtr;
+    NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, offset, targetRank, &peerRecvBuff), ret, fail);
+    batchOpsParams.srcs[batchOpsParams.numOps] = (void*)mySendBuff;
+    batchOpsParams.dsts[batchOpsParams.numOps] = (void*)peerRecvBuff;
+    batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes;
+    batchOpsParams.numOps++;
+  }
+
+  // Intra-batch sync is enabled only when there are more copies than the sync frequency AND the total bytes reach the threshold
+  batchOpsParams.intraBatchSync = (batchOpsParams.numOps > comm->ceColl.intraBatchSyncFreq && chunkBytes*batchOpsParams.numOps >= comm->ceColl.intraBatchSyncMsgThreshold);
+
+  // Launch the batch operations
+  NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &batchOpsParams, stream), ret, fail);
+
+  // Ensure all transfers are complete across all ranks
+  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
+
+exit:
+  ncclCeFreeBatchOpsParams(&batchOpsParams);
+  return ret;
+fail:
+  goto exit;
+}
+
+ncclResult_t ncclCeAlltoAll(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) {
+  ncclResult_t ret = ncclSuccess;
+  // CE AlltoAll: chunk d of the send buffer goes to slot comm->rank of rank d's receive buffer (a local copy when d is this rank).
+  // Calculate the size of data each rank sends to every other rank
+  const size_t chunkBytes = args->nElts * args->eltSize;
+  uint8_t* mySendBuff = (uint8_t*)args->sendBuff;
+  uint8_t* myRecvBuff = (uint8_t*)args->recvBuff;
+  void* peerRecvBuff;
+  size_t offset;
+
+  struct ncclCeBatchOpsParams batchOpsParams = {};
+  NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&batchOpsParams, comm->nRanks), ret, fail); // the loop below queues exactly one copy per rank, so nRanks entries suffice
+
+  // Ensure all ranks are ready before starting transfers
+  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
+
+  // Copy data to other ranks: send data chunk for each destination rank
+  for (int r = 0; r < comm->nRanks; r++) {
+    int dstRank = (comm->rank + r) % comm->nRanks;
+    uint8_t* srcPtr = mySendBuff + dstRank * chunkBytes;
+    uint8_t* dstPtr = myRecvBuff + comm->rank * chunkBytes;
+
+    if (dstRank == comm->rank) {
+      // Local copy for own data
+      batchOpsParams.srcs[batchOpsParams.numOps] = (void*)srcPtr;
+      batchOpsParams.dsts[batchOpsParams.numOps] = (void*)dstPtr;
+      batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes;
+      batchOpsParams.numOps++;
+    } else {
+      // Remote copy to other ranks: send to rank dstRank's receive buffer at position comm->rank
+      offset = dstPtr - (uint8_t*)args->recvWin->userPtr;
+      NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, offset, dstRank, &peerRecvBuff), ret, fail);
+      batchOpsParams.srcs[batchOpsParams.numOps] = (void*)srcPtr;
+      batchOpsParams.dsts[batchOpsParams.numOps] = (void*)peerRecvBuff;
+      batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes;
+      batchOpsParams.numOps++;
+    }
+  }
+
+  // Intra-batch sync is enabled only when there are more copies than the sync frequency AND the total bytes reach the threshold
+  batchOpsParams.intraBatchSync = (batchOpsParams.numOps > comm->ceColl.intraBatchSyncFreq && chunkBytes*batchOpsParams.numOps >= comm->ceColl.intraBatchSyncMsgThreshold);
+
+  // Launch the batch operations
+  NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &batchOpsParams, stream), ret, fail);
+
+  // Ensure all transfers are complete across all ranks
+  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
+
+exit:
+  ncclCeFreeBatchOpsParams(&batchOpsParams);
+  return ret;
+fail:
+  goto exit;
+}
+
+ncclResult_t ncclCeScatter(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) {
+  ncclResult_t ret = ncclSuccess;
+  // CE Scatter: the root queues one copy per rank out of its send buffer; every rank, root or not, takes part in both synchronizations.
+  // Calculate the size of data root sends to each rank
+  const size_t chunkBytes = args->nElts * args->eltSize;
+  uint8_t* mySendBuff = (uint8_t*)args->sendBuff;
+  uint8_t* myRecvBuff = (uint8_t*)args->recvBuff;
+  int rootRank = args->rootRank;
+  void* peerDstPtr;
+  size_t offset;
+
+  struct ncclCeBatchOpsParams batchOpsParams = {};
+  NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&batchOpsParams, comm->nRanks), ret, fail);
+
+  // Ensure all ranks are ready before starting transfers
+  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
+
+  if (comm->rank == rootRank) {
+    // In-place means the receive buffer is exactly this rank's own chunk inside the send buffer
+    bool isInPlace = (myRecvBuff == mySendBuff + comm->rank * chunkBytes);
+
+    // Copy root's own data first if not in-place
+    if (!isInPlace) {
+      uint8_t* srcPtr = mySendBuff + comm->rank * chunkBytes;
+      uint8_t* dstPtr = myRecvBuff;
+      batchOpsParams.srcs[batchOpsParams.numOps] = (void*)srcPtr;
+      batchOpsParams.dsts[batchOpsParams.numOps] = (void*)dstPtr;
+      batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes;
+      batchOpsParams.numOps++;
+    }
+
+    // Root rank distributes data to other ranks, visiting them in the order rank+1, rank+2, ... (mod nRanks)
+    for (int r = 1; r < comm->nRanks; r++) {
+      int dstRank = (comm->rank + r) % comm->nRanks;
+      uint8_t* srcPtr = mySendBuff + dstRank * chunkBytes;
+      uint8_t* dstPtr = isInPlace ? myRecvBuff + dstRank * chunkBytes : myRecvBuff;
+      // NOTE(review): in-place, myRecvBuff already equals mySendBuff + rank*chunkBytes, so dstPtr is offset by (rank + dstRank) chunks; confirm this is intended when the root is not rank 0.
+      offset = dstPtr - (uint8_t*)args->recvWin->userPtr;
+      NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, offset, dstRank, &peerDstPtr), ret, fail);
+      batchOpsParams.srcs[batchOpsParams.numOps] = (void*)srcPtr;
+      batchOpsParams.dsts[batchOpsParams.numOps] = (void*)peerDstPtr;
+      batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes;
+      batchOpsParams.numOps++;
+    }
+  }
+  // Non-root ranks queue no copies; the launch below returns immediately for them because numOps stays 0
+
+  // Launch the batch operations (intraBatchSync is left at false for scatter)
+  NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &batchOpsParams, stream), ret, fail);
+
+  // Ensure all transfers are complete across all ranks
+  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
+
+exit:
+  ncclCeFreeBatchOpsParams(&batchOpsParams);
+  return ret;
+fail:
+  goto exit;
+}
+
+ncclResult_t ncclCeGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) {
+  ncclResult_t ret = ncclSuccess;
+  // CE Gather: each rank queues at most one copy of its send buffer into slot comm->rank of the root's receive buffer.
+  // Calculate the size of data each rank sends to root
+  const size_t chunkBytes = args->nElts * args->eltSize;
+  uint8_t* mySendBuff = (uint8_t*)args->sendBuff;
+  uint8_t* myRecvBuff = (uint8_t*)args->recvBuff;
+  int rootRank = args->rootRank;
+  void* peerRecvBuff;
+  size_t offset;
+
+  struct ncclCeBatchOpsParams batchOpsParams = {};
+  NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&batchOpsParams, 1), ret, fail);
+
+  // Ensure all ranks are ready before starting transfers
+  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
+
+  if (comm->rank == rootRank) {
+    // Root rank copies its own data to the correct position in receive buffer (skipped when it is already there)
+    uint8_t* dstPtr = myRecvBuff + comm->rank * chunkBytes;
+    if (mySendBuff != dstPtr) {
+      batchOpsParams.srcs[batchOpsParams.numOps] = (void*)mySendBuff;
+      batchOpsParams.dsts[batchOpsParams.numOps] = (void*)dstPtr;
+      batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes;
+      batchOpsParams.numOps++;
+    }
+  } else {
+    // Non-root ranks send their data to root's receive buffer; the offset is derived from this rank's own recvBuff and recvWin
+    uint8_t* rootRecvPtr = (uint8_t*)args->recvBuff + comm->rank * chunkBytes;
+    offset = rootRecvPtr - (uint8_t*)args->recvWin->userPtr;
+    NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, offset, rootRank, &peerRecvBuff), ret, fail);
+    batchOpsParams.srcs[batchOpsParams.numOps] = (void*)mySendBuff;
+    batchOpsParams.dsts[batchOpsParams.numOps] = (void*)peerRecvBuff;
+    batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes;
+    batchOpsParams.numOps++;
+  }
+
+  // Launch the batch operations (intraBatchSync is left at false for gather)
+  NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &batchOpsParams, stream), ret, fail);
+
+  // Ensure all transfers are complete across all ranks
+  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
+
+exit:
+  ncclCeFreeBatchOpsParams(&batchOpsParams);
+  return ret;
+fail:
+  goto exit;
+}
+
+ncclResult_t ncclLaunchCeColl(struct ncclComm* comm, struct ncclKernelPlan* plan) {
+  // Runs the CE collective described by plan->ceCollArgs on comm->planner.streams->stream.
+  // Returns the implementation's result, or ncclInvalidUsage when the requested
+  // function is not one of the four handled below.
+  struct ncclCeCollArgs* ceArgs = plan->ceCollArgs;
+  cudaStream_t launchStream = comm->planner.streams->stream;
+  ncclResult_t ret = ncclSuccess;
+
+  // Dispatch on the collective recorded in the arguments
+  if (ceArgs->func == ncclFuncAllGather) {
+    NCCLCHECKGOTO(ncclCeAllGather(comm, ceArgs, launchStream), ret, fail);
+  } else if (ceArgs->func == ncclFuncAlltoAll) {
+    NCCLCHECKGOTO(ncclCeAlltoAll(comm, ceArgs, launchStream), ret, fail);
+  } else if (ceArgs->func == ncclFuncScatter) {
+    NCCLCHECKGOTO(ncclCeScatter(comm, ceArgs, launchStream), ret, fail);
+  } else if (ceArgs->func == ncclFuncGather) {
+    NCCLCHECKGOTO(ncclCeGather(comm, ceArgs, launchStream), ret, fail);
+  } else {
+    // No CE implementation for this function
+    ret = ncclInvalidUsage;
+  }
+
+exit:
+  return ret;
+fail:
+  goto exit;
+}
@@ -23,10 +23,13 @@ const char* ncclFuncToString(ncclFunc_t fn) {
  switch (fn) {
  case ncclFuncAllGather: return "AllGather";
  case ncclFuncAllReduce: return "AllReduce";
+  case ncclFuncAlltoAll: return "AlltoAll";
  case ncclFuncBroadcast: return "Broadcast";
+  case ncclFuncGather: return "Gather";
  case ncclFuncRecv: return "Recv";
  case ncclFuncReduce: return "Reduce";
  case ncclFuncReduceScatter: return "ReduceScatter";
+  case ncclFuncScatter: return "Scatter";
  case ncclFuncSendRecv: return "SendRecv";
  case ncclFuncSend: return "Send";
  default: return "Invalid";
@@ -85,7 +88,6 @@ const char* ncclProtoToString(int proto) {

 NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
-
 ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
  NVTX3_FUNC_WITH_PARAMS(AllGather, NcclNvtxParamsAllGather,
@@ -148,10 +150,101 @@ ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sen
  }
 }

+RCCL_PARAM(AlltoAllPivotEnable, "ALL_TO_ALL_PIVOT_ENABLE", 0); // read through rcclParamAlltoAllPivotEnable() below; the last argument is 0, so the pivot path is opt-in
+
+NCCL_API(ncclResult_t, ncclAlltoAll, const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclComm* comm, cudaStream_t stream);
+ncclResult_t ncclAlltoAll_impl(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclComm* comm, cudaStream_t stream) {
+  NVTX3_FUNC_WITH_PARAMS(AlltoAll, NcclNvtxParamsAlltoAll,
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), datatype));
+  // Path selection, in order: MSCCL (when available), pivot AlltoAll, ROCSHMEM GDA AlltoAll (when built in), then the generic AlltoAll.
+  if (!mscclIsCaller()) // when msccl falls back to
+  {
+    NCCLCHECK(Recorder::instance().record(rrAllToAll, sendbuff, recvbuff, count, datatype, comm, stream));
+  }
+
+  if (mscclAvailable(comm) && !mscclIsCaller()) {
+    return mscclEnqueueCheck(
+      sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
+      count, datatype, 0, 0, ncclSum, mscclFuncAllToAll, comm, stream);
+  }
+
+  // Bytes sent to each rank, and the lowest set bit of that size (its largest power-of-two divisor)
+  size_t rankOffset = count * ncclTypeSize(datatype);
+  size_t rankAlign = rankOffset & ((~rankOffset) + 1);
+
+  struct ncclInfo info;
+  if (comm->topo->pivotA2AEnabled && comm->nChannels >= comm->topo->pivotA2ANumBiRings * 2 &&
+      rankOffset >= 744 * 1024 && rankAlign != 4 && rcclParamAlltoAllPivotEnable()) {
+      info = { ncclFuncAlltoAllPivot, "AlltoAllPivot",
+        sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
+        ALLTOALL_PIVOT_CHUNKSTEPS, ALLTOALL_PIVOT_SLICESTEPS, nullptr };
+  } else {
+      #ifdef ENABLE_ROCSHMEM
+      size_t msgSize = count * ncclTypeSize(datatype) * comm->nRanks; // total bytes across all ranks; only this path compares it to the threshold
+      if (rcclUseAllToAllGda(comm) && msgSize <= comm->rocshmemThreshold) {
+        struct ncclInfo info = { ncclFuncAllToAllGda, "AllToAllGda",
+              sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream,
+              ALLTOALL_PIVOT_CHUNKSTEPS, ALLTOALL_PIVOT_SLICESTEPS, nullptr };
+        return ncclEnqueueCheck(&info);
+      }
+      #endif // ENABLE_ROCSHMEM
+    info = { ncclFuncAlltoAll, "AlltoAll",
+      sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
+      ALLTOALL_CHUNKSTEPS, ALLTOALL_SLICESTEPS };
+  }
+  return ncclEnqueueCheck(&info);
+}
+
+NCCL_API(ncclResult_t, ncclAlltoAllv, const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
+    void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
+    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
+ncclResult_t ncclAlltoAllv_impl(const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
+    void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
+    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
+  NVTX3_FUNC_WITH_PARAMS(AlltoAllv, NcclNvtxParamsAlltoAllv,
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcounts[comm->rank] * ncclTypeSize(datatype),
+      recvcounts[comm->rank] * ncclTypeSize(datatype), datatype));
+
+  if (!mscclIsCaller()) // when msccl falls back to
+  {
+    NCCLCHECK(Recorder::instance().record(rrAllToAllv, sendbuff, recvbuff, 0, datatype, comm, stream, -1, sendcounts, sdispls, recvcounts, rdispls));
+  }
+
+  if (mscclAvailable(comm) && !mscclIsCaller()) {
+    return mscclEnqueueCheck(
+      sendbuff, sendcounts, sdispls, recvbuff, recvcounts, rdispls,
+      0, datatype, 0, 0, ncclSum, mscclFuncAllToAllv, comm, stream);
+  }
+
+  // Generic path: one send/recv pair per rank inside a single group. Counts and displacements are in elements of `datatype`.
+  // Every failure after the recorder is muted goes through `done`, so the group is closed and the recorder un-muted.
+  int nRanks;
+  ncclResult_t ret = ncclSuccess;
+  bool groupOpen = false;
+  const bool muteRecorder = !mscclIsCaller();
+  const size_t typeSize = ncclTypeSize(datatype);
+  NCCLCHECK(ncclCommCount(comm, &nRanks));
+  if (muteRecorder) Recorder::instance().skip(true);
+  NCCLCHECKGOTO(ncclGroupStart(), ret, done);
+  groupOpen = true;
+  for (int r=0; r<nRanks; r++) {
+    NCCLCHECKGOTO(ncclSend(((char*)sendbuff) + sdispls[r]*typeSize, sendcounts[r], datatype, r, comm, stream), ret, done);
+    NCCLCHECKGOTO(ncclRecv(((char*)recvbuff) + rdispls[r]*typeSize, recvcounts[r], datatype, r, comm, stream), ret, done);
+  }
+  groupOpen = false;
+  NCCLCHECKGOTO(ncclGroupEnd(), ret, done);
+  ret = ncclSuccess;  // as before: once every call has passed its check, the function reports ncclSuccess
+done:
+  // A failed send/recv leaves the group open: close it, keeping the first error as the return value
+  if (groupOpen) (void)ncclGroupEnd();
+  if (muteRecorder) Recorder::instance().skip(false);
+  return ret;
+}
+
 NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
-
-
 ncclResult_t ncclAllReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
  NVTX3_FUNC_WITH_PARAMS(AllReduce, NcclNvtxParamsAllReduce,
@@ -202,116 +295,8 @@ ncclResult_t ncclAllReduceWithBias_impl(const void* sendbuff, void* recvbuff, si
  return ncclEnqueueCheck(&info);
 }

-RCCL_PARAM(AllToAllPivotEnable, "ALL_TO_ALL_PIVOT_ENABLE", 0);
-
-NCCL_API(ncclResult_t, ncclAllToAll, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
-  ncclComm_t comm, hipStream_t stream);
-
-
-ncclResult_t ncclAllToAll_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
-  ncclComm_t comm, hipStream_t stream) {
-  NVTX3_FUNC_WITH_PARAMS(AllToAll, NcclNvtxParamsAllToAll,
-    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), datatype));
-
-  if (!mscclIsCaller()) // when msccl falls back to
-  {
-    NCCLCHECK(Recorder::instance().record(rrAllToAll, sendbuff, recvbuff, count, datatype, comm, stream));
-  }
-
-  if (mscclAvailable(comm) && !mscclIsCaller()) {
-    return mscclEnqueueCheck(
-      sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
-      count, datatype, 0, 0, ncclSum, mscclFuncAllToAll, comm, stream);
-  }
-
-  size_t rankOffset = count * ncclTypeSize(datatype);
-  size_t rankAlign = rankOffset & ((~rankOffset) + 1);
-  size_t msgSize = count * ncclTypeSize(datatype) * comm->nRanks;
-
-  // Determine Pivot A2A support now that we know number of channels
-  if (comm->topo->pivotA2AEnabled && comm->nChannels >= comm->topo->pivotA2ANumBiRings * 2 &&
-      rankOffset >= 744 * 1024 && rankAlign != 4 && rcclParamAllToAllPivotEnable()) {
-    struct ncclInfo info = { ncclFuncAllToAllPivot, "AllToAllPivot",
-      sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
-      ALLTOALL_PIVOT_CHUNKSTEPS, ALLTOALL_PIVOT_SLICESTEPS, nullptr };
-    return ncclEnqueueCheck(&info);
-  } else {
-#ifdef ENABLE_ROCSHMEM
-    if (rcclUseAllToAllGda(comm) && msgSize <= comm->rocshmemThreshold) {	
-	    struct ncclInfo info = { ncclFuncAllToAllGda, "AllToAllGda",
-      	    sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream,
-      	    ALLTOALL_PIVOT_CHUNKSTEPS, ALLTOALL_PIVOT_SLICESTEPS, nullptr };
-    	    
-	    return ncclEnqueueCheck(&info);
-    }
-#endif	  
-    int nRanks;
-    //comm->isA2a = 0;
-    NCCLCHECK(ncclCommCount(comm, &nRanks));
-    if (count == 0) return ncclSuccess;
-    if (!mscclIsCaller()) Recorder::instance().skip(true);
-    NCCLCHECK(ncclGroupStart());
-    for (int r=0; r<nRanks; r++) {
-      NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, count, datatype, r, comm, stream));
-      NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, count, datatype, r, comm, stream));
-    }
-    NCCLCHECK(ncclGroupEnd());
-    if (!mscclIsCaller()) Recorder::instance().skip(false);
-    return ncclSuccess;
-  }
-}
-
-NCCL_API(ncclResult_t, ncclAllToAllv, const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
-    void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
-    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
-
-
-ncclResult_t ncclAllToAllv_impl(const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
-    void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
-    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
-  NVTX3_FUNC_WITH_PARAMS(AllToAllv, NcclNvtxParamsAllToAllv,
-    NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcounts[comm->rank] * ncclTypeSize(datatype),
-      recvcounts[comm->rank] * ncclTypeSize(datatype), datatype));
-
-  if (!mscclIsCaller()) // when msccl falls back to
-  {
-    NCCLCHECK(Recorder::instance().record(rrAllToAllv, sendbuff, recvbuff, 0, datatype, comm, stream, -1, sendcounts, sdispls, recvcounts, rdispls));
-  }
-
-  if (mscclAvailable(comm) && !mscclIsCaller()) {
-    return mscclEnqueueCheck(
-      sendbuff, sendcounts, sdispls, recvbuff, recvcounts, rdispls,
-      0, datatype, 0, 0, ncclSum, mscclFuncAllToAllv, comm, stream);
-  }
-
-  int nRanks;
-  NCCLCHECK(ncclCommCount(comm, &nRanks));
-  if (!mscclIsCaller()) Recorder::instance().skip(true);
-  NCCLCHECK(ncclGroupStart());
-  for (int r=0; r<nRanks; r++) {
-    NCCLCHECK(ncclSend(
-        ((char*)sendbuff) + sdispls[r]*ncclTypeSize(datatype),
-        sendcounts[r],
-        datatype,
-        r,
-        comm,
-        stream));
-    NCCLCHECK(ncclRecv(
-        ((char*)recvbuff) + rdispls[r]*ncclTypeSize(datatype),
-        recvcounts[r],
-        datatype,
-        r,
-        comm,
-        stream));
-  }
-  NCCLCHECK(ncclGroupEnd());
-  if (!mscclIsCaller()) Recorder::instance().skip(false);
-  return ncclSuccess;
-}
-
 NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);
-
 ncclResult_t ncclBroadcast_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream) {
  NVTX3_FUNC_WITH_PARAMS(Broadcast, NcclNvtxParamsBroadcast,
@@ -343,46 +328,32 @@ ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int ro
  return ncclBroadcast(buff, buff, count, datatype, root, comm, stream);
 }

-NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t sendcount,
-    ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
-
-ncclResult_t ncclGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount,
-    ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream) {
+NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm* comm, cudaStream_t stream);
+ncclResult_t ncclGather_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm* comm, cudaStream_t stream) { // Gather entry point: record the call, offer it to MSCCL, otherwise enqueue ncclFuncGather
  NVTX3_FUNC_WITH_PARAMS(Gather, NcclNvtxParamsGather,
-    NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcount * ncclTypeSize(datatype), root, datatype));
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), root)); // NOTE(review): ncclScatter_impl's payload also passes datatype -- confirm NcclNvtxParamsGather's field list

  if (!mscclIsCaller()) // when msccl falls back to
  {
-    NCCLCHECK(Recorder::instance().record(rrGather, sendbuff, recvbuff, sendcount, datatype, comm, stream, root));
+    NCCLCHECK(Recorder::instance().record(rrGather, sendbuff, recvbuff, count, datatype, comm, stream, root));
  }

  if (mscclAvailable(comm) && !mscclIsCaller()) {
    return mscclEnqueueCheck(
      sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
-      sendcount, datatype, root, 0, ncclSum, mscclFuncGather, comm, stream);
+      count, datatype, root, 0, ncclSum, mscclFuncGather, comm, stream);
  }

-  int nRanks;
-  NCCLCHECK(ncclCommCount(comm, &nRanks));
-  size_t rankOffset = sendcount * ncclTypeSize(datatype);
-  if (sendcount == 0) return ncclSuccess;
-  int rank;
-  NCCLCHECK(ncclCommUserRank(comm, &rank));
-  if (!mscclIsCaller()) Recorder::instance().skip(true);
-  NCCLCHECK(ncclGroupStart());
-  if (rank == root) {
-    for (int r=0; r<nRanks; r++)
-      NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, sendcount, datatype, r, comm, stream));
-  }
-  NCCLCHECK(ncclSend(sendbuff, sendcount, datatype, root, comm, stream));
-  NCCLCHECK(ncclGroupEnd());
-  if (!mscclIsCaller()) Recorder::instance().skip(false);
-  return ncclSuccess;
+  struct ncclInfo info = { ncclFuncGather, "Gather", // describe the collective for ncclEnqueueCheck; root is carried in the info
+    sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
+    GATHER_CHUNKSTEPS, GATHER_SLICESTEPS };
+  return ncclEnqueueCheck(&info); // the enqueue result is this function's return value
 }

 NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
-
 ncclResult_t ncclReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  NVTX3_FUNC_WITH_PARAMS(Reduce, NcclNvtxParamsReduce,
@@ -408,8 +379,6 @@ ncclResult_t ncclReduce_impl(const void* sendbuff, void* recvbuff, size_t count,

 NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
-
-
 ncclResult_t ncclReduceScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
  NVTX3_FUNC_WITH_PARAMS(ReduceScatter, NcclNvtxParamsReduceScatter,
@@ -433,48 +402,32 @@ ncclResult_t ncclReduceScatter_impl(const void* sendbuff, void* recvbuff, size_t
  return ncclEnqueueCheck(&info);
 }

-NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root,
-    ncclComm_t comm, hipStream_t stream);
-
-
-ncclResult_t ncclScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root,
-    ncclComm_t comm, hipStream_t stream) {
+NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, int root, ncclComm* comm, cudaStream_t stream);
+ncclResult_t ncclScatter_impl(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, int root, ncclComm* comm, cudaStream_t stream) { // Scatter entry point: record the call, offer it to MSCCL, otherwise enqueue ncclFuncScatter
  NVTX3_FUNC_WITH_PARAMS(Scatter, NcclNvtxParamsScatter,
-    NVTX3_PAYLOAD(comm ? comm->commHash : 0, recvcount * ncclTypeSize(datatype), root, datatype));
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), root, datatype)); // NOTE(review): ncclGather_impl's payload omits datatype -- confirm which form matches the NVTX schema

  if (!mscclIsCaller()) // when msccl falls back to
  {
-    NCCLCHECK(Recorder::instance().record(rrScatter, sendbuff, recvbuff, recvcount, datatype, comm, stream, root));
+    NCCLCHECK(Recorder::instance().record(rrScatter, sendbuff, recvbuff, count, datatype, comm, stream, root));
  }

  if (mscclAvailable(comm) && !mscclIsCaller()) {
    return mscclEnqueueCheck(
      sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
-      recvcount, datatype, root, 0, ncclSum, mscclFuncScatter, comm, stream);
+      count, datatype, root, 0, ncclSum, mscclFuncScatter, comm, stream);
  }

-  int nRanks;
-  NCCLCHECK(ncclCommCount(comm, &nRanks));
-  size_t rankOffset = recvcount * ncclTypeSize(datatype);
-  if (recvcount == 0) return ncclSuccess;
-  int rank;
-  NCCLCHECK(ncclCommUserRank(comm, &rank));
-  if (!mscclIsCaller()) Recorder::instance().skip(true);
-  NCCLCHECK(ncclGroupStart());
-  if (rank == root) {
-    for (int r=0; r<nRanks; r++)
-      NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, recvcount, datatype, r, comm, stream));
-  }
-  NCCLCHECK(ncclRecv(recvbuff, recvcount, datatype, root, comm, stream));
-  NCCLCHECK(ncclGroupEnd());
-  if (!mscclIsCaller()) Recorder::instance().skip(false);
-  return ncclSuccess;
+  struct ncclInfo info = { ncclFuncScatter, "Scatter", // describe the collective for ncclEnqueueCheck; root is carried in the info
+    sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
+    SCATTER_CHUNKSTEPS, SCATTER_SLICESTEPS };
+  return ncclEnqueueCheck(&info); // the enqueue result is this function's return value
 }

 NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);
-
-
 ncclResult_t ncclSend_impl(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream) {
  NVTX3_FUNC_WITH_PARAMS(Send, NcclNvtxParamsSendRecv,
@@ -500,7 +453,6 @@ ncclResult_t ncclSend_impl(const void* sendbuff, size_t count, ncclDataType_t da

 NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);
-
 ncclResult_t ncclRecv_impl(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream) {
  NVTX3_FUNC_WITH_PARAMS(Recv, NcclNvtxParamsSendRecv,
@@ -28,7 +28,7 @@ static int pid = -1;
 static char hostname[1024];
 thread_local int ncclDebugNoWarn = 0;
 char ncclLastError[1024] = ""; // Global string for the last error in human readable form
-static uint64_t ncclDebugMask = 0;
+uint64_t ncclDebugMask = 0;
 FILE *ncclDebugFile = stdout;
 static pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
 static std::chrono::steady_clock::time_point ncclEpoch;
@@ -419,4 +419,4 @@ void ncclSetThreadName(pthread_t thread, const char *fmt, ...) {
  va_end(vargs);
  pthread_setname_np(thread, threadName);
 #endif
-}
+}
@@ -0,0 +1,72 @@
+# Builds the nccl_device object library from the generated sources plus common.cu and onerank.cu.
+
+# Run the scripts once during configuration to get the file lists
+execute_process(
+    COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc "${ONLY_FUNCS}"
+    OUTPUT_VARIABLE files
+    RESULT_VARIABLE files_result
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+)
+# A failed run would otherwise leave an empty or partial list and only fail later in the build.
+if(NOT "${files_result}" STREQUAL "0")
+    message(FATAL_ERROR "generate.py failed with result '${files_result}'")
+endif()
+# NOTE(review): list(TRANSFORM) needs a ';'-separated list -- assumes the script prints one; confirm.
+string(STRIP "${files}" files)
+list(TRANSFORM files PREPEND ${CMAKE_CURRENT_BINARY_DIR}/gensrc/)
+
+execute_process(
+    COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/symmetric/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc/symmetric "${ONLY_FUNCS}"
+    OUTPUT_VARIABLE symmetric_files
+    RESULT_VARIABLE symmetric_files_result
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+)
+if(NOT "${symmetric_files_result}" STREQUAL "0")
+    message(FATAL_ERROR "symmetric/generate.py failed with result '${symmetric_files_result}'")
+endif()
+string(STRIP "${symmetric_files}" symmetric_files)
+list(TRANSFORM symmetric_files PREPEND ${CMAKE_CURRENT_BINARY_DIR}/gensrc/symmetric/)
+
+# Create custom commands to generate source files with proper dependencies
+add_custom_command(
+    OUTPUT  ${files}
+    BYPRODUCTS ${files}
+    COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc "${ONLY_FUNCS}"
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/generate.py
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+    COMMENT "Generating device source files"
+)
+
+add_custom_command(
+    OUTPUT  ${symmetric_files}
+    BYPRODUCTS ${symmetric_files}
+    COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/symmetric/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc/symmetric "${ONLY_FUNCS}"
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/symmetric/generate.py
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+    COMMENT "Generating symmetric device source files"
+)
+
+# Add library target
+add_library(nccl_device OBJECT
+            ${files}
+            ${symmetric_files}
+            ${CMAKE_CURRENT_SOURCE_DIR}/common.cu
+            ${CMAKE_CURRENT_SOURCE_DIR}/onerank.cu
+)
+
+set_target_properties(nccl_device PROPERTIES
+    CUDA_SEPARABLE_COMPILATION ON
+    CUDA_RESOLVE_DEVICE_SYMBOLS ON
+)
+
+# Set include directories for the target
+target_include_directories(nccl_device PUBLIC
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    ${CMAKE_SOURCE_DIR}/src/include
+    ${CMAKE_SOURCE_DIR}/src/include/plugin
+    ${CMAKE_BINARY_DIR}/include
+    ${CUDAToolkit_INCLUDE_DIRS}
+    ${CUDAToolkit_INCLUDE_DIRS}/cccl
+)
+
+add_dependencies(nccl_device nccl_header)
@@ -19,7 +19,7 @@ OBJDIR := $(BUILDDIR)/obj/device
 MANIFEST := $(OBJDIR)/manifest
 DEVGLUE_OBJ  := $(OBJDIR)/device_glue.o

-INCFLAGS  = -I. -I.. -I$(BUILDDIR)/include -I../include
+INCFLAGS  = -I. -I.. -I$(BUILDDIR)/include -I../include -I../include/plugin
 NVCUFLAGS += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden"
 CXXFLAGS  += $(INCFLAGS)

@@ -47,7 +47,11 @@ endif
 define COMPILE_SYM
@$(SAY) "Compiling" $2;\
 mkdir -p $(dir $1);\
- $(NVCC) $(NVCUFLAGS_SYM) $3 -dw $2 -o $1
+ if [[ -n "$3" ]]; then\
+ $(NVCC) $(NVCUFLAGS_SYM) $3 -dw $2 -o $1;\
+ else\
+ touch $2.empty.cu; $(NVCC) $(NVCUFLAGS_SYM) -dw $2.empty.cu -o $1; rm $2.empty.cu;\
+ fi
 endef

 DEPENDS.cu = $(NVCC) $(NVCUFLAGS) -M -dc $1
@@ -75,7 +75,7 @@ namespace {
 }

 template<typename T, typename RedOp>
-struct RunWorkColl<ncclFuncAllToAllPivot, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
+struct RunWorkColl<ncclFuncAlltoAllPivot, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
  __device__ __forceinline__ void run(int tid, int nThreads, struct ncclDevWorkColl* work) {
    using Proto = ProtoSimple<ALLTOALL_PIVOT_CHUNKSTEPS/ALLTOALL_PIVOT_SLICESTEPS, ALLTOALL_PIVOT_SLICESTEPS>;
    runRing<T, RedOp, Proto>(tid, nThreads, work);
@@ -150,7 +150,7 @@ struct ncclShmemData {
  struct ncclDevKernelArgs args;
  int channelId;
  int aborted;
-  alignas(16) struct ncclDevComm comm;
+  alignas(16) struct ncclKernelComm comm;
  alignas(16) struct ncclDevChannel channel;
 #ifdef ENABLE_WARP_SPEED
  int warpComm;
@@ -502,7 +502,7 @@ __device__ __forceinline__ void profiler(int action) {
        ncclShmem.comm.workCompleted[ncclShmem.channelId].data[wc%MAX_PROFILER_EVENTS_PER_CHANNEL].counter = wc;
      }
      ncclShmem.channel.workCounter += ncclShmem.nWorks;
-      if (action == FINI) ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter;
+      if (action == FINI) ((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter;
    }
  }
 }
@@ -579,7 +579,7 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
  /* set abort flag to 0 */
  if (tid == 0) {
    ncclShmem.aborted = 0;
-    ncclShmem.channel.workCounter = ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter;
+    ncclShmem.channel.workCounter = ((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter;
  }

  // Use first 2 warps to load comm and channel, and remaining load work batch.
@@ -587,14 +587,14 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
  case 0:
    { void* dst = &ncclShmem.comm;
      void* src = ncclShmem.args.comm;
-      int bytes = sizeof(ncclDevComm);
-      static_assert(sizeof(ncclDevComm) <= 16*WARP_SIZE, "ncclDevComm cannot be loaded by a single warp in one insn.");
+      int bytes = sizeof(ncclKernelComm);
+      static_assert(sizeof(ncclKernelComm) <= 16*WARP_SIZE, "ncclKernelComm cannot be loaded by a single warp in one insn.");
      copyToShmem16(tid, dst, src, bytes);
    } break;
  case 1:
-    { // Get address of channel without incurring indirect load from ncclDevComm::channels
+    { // Get address of channel without incurring indirect load from ncclKernelComm::channels
      void* dst = &ncclShmem.channel;
-      void* src = &((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId];
+      void* src = &((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId];
      int bytes = sizeof(ncclDevChannel);
      static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn.");
      copyToShmem16(tid-WARP_SIZE, dst, src, bytes);
@@ -641,7 +641,7 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
    __syncthreads();
    if(ncclShmem.warpChannelId[localWarpId] >= 0) {
      void* dst = &ncclShmem.warpChannel[localWarpId];
-      void* src = &((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.warpChannelId[localWarpId]];
+      void* src = &((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.warpChannelId[localWarpId]];
      int bytes = sizeof(ncclDevChannel);
      static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn.");
      // assert((tid-localWarpId*WARP_SIZE) >= 0 && (tid-localWarpId*WARP_SIZE) < WARP_SIZE);
@@ -3,9 +3,10 @@ import os
 import sys
 import subprocess
 from dataclasses import dataclass
+import shutil

 # Order of colls, redops, tys, protos, algos must match src/include/device.h
-all_colls     = ["Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv", "", "", "AllToAllPivot", "AllToAllGda"]
+all_colls     = ["Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv", "", "", "AlltoAllPivot", "AllToAllGda"]
 all_redops    = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"]
 all_tys       = ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16","f8e4m3","f8e5m2"]
 all_protos    = ["LL","LL128","SIMPLE"]
@@ -24,8 +25,11 @@ gensrc = sys.argv[1]

 if os.path.exists(gensrc):
  for name in os.listdir(gensrc):
-    os.remove(os.path.join(gensrc, name))
-    #os.truncate(os.path.join(gensrc, name), 0)
+    path = os.path.join(gensrc, name)
+    if os.path.isfile(path):
+      os.remove(path)
+    elif os.path.isdir(path):
+      shutil.rmtree(path)
 else:
  os.makedirs(gensrc)

@@ -64,7 +68,7 @@ else:
 # make ONLY_FUNCS="AllReduce RING SIMPLE * *|ReduceScatter RING LL * f32"
 #                         --- or ---
 # make ONLY_FUNCS="AllReduce RING SIMPLE|ReduceScatter RING LL * f32"
-# make ONLY_FUNCS="AllReduce RING/TREE LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|AllGather RING LL/SIMPLE Sum i8|AllToAllPivot RING SIMPLE Sum i8|Broadcast RING LL/SIMPLE Sum i8|Reduce RING LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|ReduceScatter RING LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|SendRecv RING SIMPLE Sum i8"
+# make ONLY_FUNCS="AllReduce RING/TREE LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|AllGather RING LL/SIMPLE Sum i8|AlltoAllPivot RING SIMPLE Sum i8|Broadcast RING LL/SIMPLE Sum i8|Reduce RING LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|ReduceScatter RING LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|SendRecv RING SIMPLE Sum i8"

 # Paste all non-None arguments together with `sep`.
 def paste(sep, *args):
@@ -79,14 +83,14 @@ func_pattern = sys.argv[6:7]
 if func_pattern and func_pattern[0]:
  func_pattern = func_pattern[0]
 else:
-  func_pattern = "AllGather|AllReduce|AllToAllPivot|AllToAllGda|Broadcast|Reduce|ReduceScatter|SendRecv"
+  func_pattern = "AllGather|AllReduce|AlltoAllPivot|AllToAllGda|Broadcast|Reduce|ReduceScatter|SendRecv"

 ################################################################################

 algos_of_coll = {
  "AllGather":             ["RING", "PAT"],
  "AllReduce":             ["RING", "TREE"],
-  "AllToAllPivot":         ["RING"],
+  "AlltoAllPivot":         ["RING"],
  "AllToAllGda":           ["RING"],
  "Broadcast":             ["RING"],
  "Reduce":                ["RING"],
@@ -97,7 +101,7 @@ algos_of_coll = {
 protos_of_coll = {
  "AllGather":              all_protos,
  "AllReduce":              all_protos,
-  "AllToAllPivot":          ["SIMPLE"],
+  "AlltoAllPivot":          ["SIMPLE"],
  "AllToAllGda":            ["SIMPLE"],
  "Broadcast":              all_protos,
  "Reduce":                 all_protos,
@@ -108,7 +112,7 @@ protos_of_coll = {
 redops_of_coll = {
  "AllGather":            ["Sum"],
  "AllReduce":            all_redops,
-  "AllToAllPivot":        ["Sum"],
+  "AlltoAllPivot":        ["Sum"],
  "AllToAllGda":          ["Sum"],
  "Broadcast":            ["Sum"],
  "Reduce":               all_redops,
@@ -119,7 +123,7 @@ redops_of_coll = {
 tys_of_coll = {
  "AllGather":             ["i8"],
  "AllReduce":             all_tys,
-  "AllToAllPivot":         ["i8"],
+  "AlltoAllPivot":         ["i8"],
  "AllToAllGda":           ["i8"],
  "Broadcast":             ["i8"],
  "Reduce":                all_tys,
@@ -130,7 +134,7 @@ tys_of_coll = {
 acc_of_coll = {
  "AllGather":             ["0"],
  "AllReduce":             all_accs,
-  "AllToAllPivot":         ["0"],
+  "AlltoAllPivot":         ["0"],
  "AllToAllGda":           ["0"],
  "Broadcast":             ["0"],
  "Reduce":                ["0"],
@@ -141,7 +145,7 @@ acc_of_coll = {
 pipelines_of_coll = {
  "AllGather":             ["0"],
  "AllReduce":             all_pipelines,
-  "AllToAllPivot":         ["0"],
+  "AlltoAllPivot":         ["0"],
  "AllToAllGda":           ["0"],
  "Broadcast":             ["0"],
  "Reduce":                all_pipelines,
@@ -153,7 +157,7 @@ pipelined_types = ["bf16"]
 coll_camel_to_lower = {
  "AllGather":             "all_gather",
  "AllReduce":             "all_reduce",
-  "AllToAllPivot":         "alltoall_pivot",
+  "AlltoAllPivot":         "alltoall_pivot",
  "AllToAllGda":           "alltoall_gda",
  "Broadcast":             "broadcast",
  "Reduce":                "reduce",
@@ -510,7 +514,7 @@ with open(os.path.join(gensrc, "host_table.cpp"), "w") as f:
      )
      if fn.coll == "Broadcast":
        key = ((coll_idx & 0x3F) | ((proto_idx & 0x3F) << 8))
-      if fn.coll in ["SendRecv", "AllToAllPivot", "AllToAllGda"]:
+      if fn.coll in ["SendRecv", "AlltoAllPivot", "AllToAllGda"]:
        key = ((coll_idx & 0x3F))
      
      out(f'  {{{key}, {fn_id}}}, {comment}\n')
@@ -93,7 +93,7 @@ __device__ __forceinline__ static void mscclReduce(int c, int numReductions, int

 template<typename T, typename RedOp, typename Proto, bool fullOps>
 __device__ __forceinline__ void mscclRunInterpreter(
-  struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work) {
+  struct ncclKernelComm* comm, struct mscclAlgo* algo, struct mscclWork* work) {
  const int tid = threadIdx.x;
  const int bid = blockIdx.x;
  const int nthreads = MSCCL_MAX_NTHREADS;
@@ -120,12 +120,12 @@ __device__ __forceinline__ void mscclRunInterpreter(
    case 0:
      dst = &ncclShmem.comm;
      src = comm;
-      bytes = sizeof(ncclDevComm);
+      bytes = sizeof(ncclKernelComm);
      break;
    case 1:
-      // Get address of channel without incurring indirect load from ncclDevComm::channels
+      // Get address of channel without incurring indirect load from ncclKernelComm::channels
      dst = &ncclShmem.channel;
-      src = &((ncclDevCommAndChannels*)comm)->channels[channelId];
+      src = &((ncclKernelCommAndChannels*)comm)->channels[channelId];
      bytes = sizeof(ncclDevChannel);
      break;
    case 2:
@@ -372,13 +372,13 @@ __device__ __forceinline__ void mscclRunInterpreter(
 }

 #define MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, type, fullOps) \
-__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, LL, fullOps)(struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
+__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, LL, fullOps)(struct ncclKernelComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
  mscclRunInterpreter<type, Func##devredop<type>, ProtoLL, fullOps>(comm, algo, work); \
 } \
-__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, LL128, fullOps)(struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
+__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, LL128, fullOps)(struct ncclKernelComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
  mscclRunInterpreter<type, Func##devredop<type>, ProtoLL128, fullOps>(comm, algo, work); \
 } \
-__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, Simple, fullOps)(struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
+__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, Simple, fullOps)(struct ncclKernelComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
  mscclRunInterpreter<type, Func##devredop<type>, ProtoSimple<MSCCL_CHUNKSTEPS/MSCCL_SLICESTEPS, MSCCL_SLICESTEPS, 0, 2>, fullOps>(comm, algo, work); \
 }

@@ -1,35 +1,36 @@
 // Modification Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT 

-#include "symmetric.h"
+#include "sym_kernels.h"
 #include "symmetric/kernel.h"
 #include "symmetric/primitives.h"

 template<int BytePerPack, int UnrollPacks, int UnrollPeers>
 static __device__ void bcastDeep(
-    ncclSymPrims& prim, int tn, int t, bool waitNeeded,
-    char* inputHere, char* outputRank0, bool inPlace, int nIters
+    ncclSymkArgsHandler const& handler, int tn, int t,
+    bool waitNeeded, ncclLsaBarrierSession<ncclCoopCta>& bar,
+    ncclSymPtr<char> input, ncclSymPtr<char> output, bool inPlace, int nIters
  ) {
  using Pack = BytePack<BytePerPack>;
  int wn = tn/WARP_SIZE;
  int w = t/WARP_SIZE;
  int lane = t%WARP_SIZE;
-  int const& rank = prim.rank;
-  int const& nRanks = prim.nRanks;
-  uint32_t const& stride4G = prim.stride4G;
-  Pack* inpHere = (Pack*)inputHere + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
-  Pack* outRank0 = (Pack*)outputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
+  int const& rank = handler.comm.rank;
+  int const& nRanks = handler.comm.nRanks;
+
+  Pack* inpPacks = (Pack*)input.localPtr() + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
+  ncclSymPtr<Pack> outPacks = (ncclSymPtr<Pack>)output + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
  Pack tmp[UnrollPacks];

  nIters -= w;
  if (0 < nIters) {
    #pragma unroll
    for (int u=0; u < UnrollPacks; u++) {
-      tmp[u] = inpHere[u*WARP_SIZE];
+      tmp[u] = inpPacks[u*WARP_SIZE];
    }
  }

-  if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed);

  if (0 < nIters) {
    while (true) {
@@ -47,21 +48,21 @@ static __device__ void bcastDeep(
            if (partial && dr == nRanks) break;
            #pragma unroll UnrollPacks
            for (int u=0; u < UnrollPacks; u++) {
-              add4G(outRank0, r*stride4G)[u*WARP_SIZE] = tmp[u];
+              outPacks.lsaPtr(r)[u*WARP_SIZE] = tmp[u];
            }
            if (++r == nRanks) r = 0;
          }
        }
      }
-      inpHere += intptr_t(wn)*UnrollPacks*WARP_SIZE;
-      outRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE;
+      inpPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE;
+      outPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE;
      nIters -= wn;
      if (nIters <= 0) break;

      // Load data for next iteration.
      #pragma unroll
      for (int u=0; u < UnrollPacks; u++) {
-        tmp[u] = inpHere[u*WARP_SIZE];
+        tmp[u] = inpPacks[u*WARP_SIZE];
      }
    }
  }
@@ -69,18 +70,17 @@ static __device__ void bcastDeep(

 template<int UnrollPeers, typename T>
 static __device__ void bcastEnds(
-    ncclSymPrims& prim, int tn, int t,
-    T* inputHere, T* outputRank0, bool inPlace, size_t nElts, uint32_t nPreElts, size_t nSufElts
+    ncclSymkArgsHandler const& handler, int tn, int t,
+    ncclSymPtr<T> input, ncclSymPtr<T> output, bool inPlace, size_t nElts, uint32_t nPreElts, size_t nSufElts
  ) {
-  int const& rank = prim.rank;
-  int const& nRanks = prim.nRanks;
-  uint32_t const& stride4G = prim.stride4G;
-  BytePack<sizeof(T)>* inpHere = (BytePack<sizeof(T)>*)inputHere;
-  BytePack<sizeof(T)>* outRank0 = (BytePack<sizeof(T)>*)outputRank0;
+  int const& rank = handler.comm.rank;
+  int const& nRanks = handler.comm.nRanks;
+  BytePack<sizeof(T)>* inpPacks = (BytePack<sizeof(T)>*)input.localPtr();
+  ncclSymPtr<BytePack<sizeof(T)>> outPacks = (ncclSymPtr<BytePack<sizeof(T)>>)output;
  #pragma unroll 1
  for (size_t i = t; i < nPreElts+nSufElts; i += tn) {
    size_t elt = i < nPreElts ? i : nElts-nPreElts-nSufElts+i;
-    BytePack<sizeof(T)> tmp = inpHere[elt];
+    BytePack<sizeof(T)> tmp = inpPacks[elt];
    int dr = inPlace ? 1 : 0;
    int r = rank + dr;
    if (r == nRanks) r = 0;
@@ -88,14 +88,14 @@ static __device__ void bcastEnds(
    for (; dr + UnrollPeers <= nRanks; dr += UnrollPeers) {
      #pragma unroll UnrollPeers
      for (int u=0; u < UnrollPeers; u++) {
-        *add4G(outRank0+elt, r*stride4G) = tmp;
+        outPacks.lsaPtr(r)[elt] = tmp;
        if (++r == nRanks) r = 0;
      }
    }
    #pragma unroll UnrollPeers
    for (int u=0; u < UnrollPeers; u++) {
      if (dr+u == nRanks) break;
-      *add4G(outRank0+elt, r*stride4G) = tmp;
+      outPacks.lsaPtr(r)[elt] = tmp;
      if (++r == nRanks) r = 0;
    }
  }
@@ -103,95 +103,103 @@ static __device__ void bcastEnds(

+// Copies nElts elements of `input` into `output` using bcastDeep and bcastEnds.
+// The first nPreBytes (up to the next 16-byte boundary of input.offset) and the
+// bytes the chunked passes leave over go element-wise through bcastEnds; the
+// middle goes through bcastDeep in 16-byte packs, then in 4-byte packs.
 template<typename T>
 static __device__ void bcast(
-    ncclSymPrims& prim, int tn, int t, bool waitNeeded, T* input, T* output, size_t nElts
+    ncclSymkArgsHandler const& handler, int tn, int t, int nBlocks,
+    bool waitNeeded, ncclLsaBarrierSession<ncclCoopCta>& bar,
+    ncclSymPtr<T> input, ncclSymPtr<T> output, size_t nElts
  ) {
  bool inPlace = (input == output);
-  // Mpve to rank=0
-  output = prim.peerPtr(0, output);
-
-  uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
-  uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
  size_t nBytes = nElts*sizeof(T);
+  uint32_t nBlocks_rcp32 = nccl::utility::idivRcp32_upto64(nBlocks);  // handed to imodFast32 below

-  uint32_t nPreBytes = (128u - inputUptr)%128u;
+  uint32_t nPreBytes = (16 - input.offset)%16;  // bytes up to the next 16-byte boundary of the input offset
  nPreBytes = min((size_t)nPreBytes, nBytes);
  uintptr_t cursor = nPreBytes;

  constexpr int MinWarpPerBlock = 4;

-  if ((inputUptr-outputUptr)%16 == 0) {
-    constexpr int BytePerPack = 16, UnrollPacks = 1, UnrollPeers = 1;
+  if ((input.offset - output.offset)%16 == 0) {  // 16-byte packs only when both offsets agree mod 16
+    constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2;
    constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
    uint32_t chunks = (nBytes-cursor)/BytePerChunk;
-    chunks -= imodFast32(chunks, prim.nBlocks, prim.nBlocks_rcp32);
+    chunks -= imodFast32(chunks, nBlocks, nBlocks_rcp32);  // presumably rounds chunks down to a multiple of nBlocks
    if (chunks != 0) {
      uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
      bcastDeep<BytePerPack, UnrollPacks, UnrollPeers>(
-        prim, tn, t, waitNeeded,
-        (char*)input + cursor, (char*)output + cursor, inPlace,
-        chunks*MinWarpPerBlock
+        handler, tn, t, waitNeeded, bar,
+        (ncclSymPtr<char>)input + cursor,
+        (ncclSymPtr<char>)output + cursor,
+        inPlace, chunks*MinWarpPerBlock
      );
      cursor = cursorAfter;
      waitNeeded = false;
    }
  }

-  if (sizeof(T) == 4 || (sizeof(T) < 4 && (inputUptr-outputUptr)%4 == 0)) {
+  if (sizeof(T) == 4 || (sizeof(T) < 4 && (input.offset - output.offset)%4 == 0)) {  // 4-byte packs for what the 16-byte pass left
    constexpr int BytePerPack = 4, UnrollPacks = 1, UnrollPeers = 1;
    constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
    uint32_t chunks = (nBytes-cursor)/BytePerChunk;
-    chunks -= imodFast32(chunks, prim.nBlocks, prim.nBlocks_rcp32);
+    chunks -= imodFast32(chunks, nBlocks, nBlocks_rcp32);
    if (chunks != 0) {
      uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
      bcastDeep<(sizeof(T) <= BytePerPack ? BytePerPack : 0), UnrollPacks, UnrollPeers>(
-        prim, tn, t, waitNeeded,
-        (char*)input + cursor, (char*)output + cursor, inPlace,
-        chunks*MinWarpPerBlock
+        handler, tn, t, waitNeeded, bar,
+        (ncclSymPtr<char>)input + cursor,
+        (ncclSymPtr<char>)output + cursor,
+        inPlace, chunks*MinWarpPerBlock
      );
      cursor = cursorAfter;
      waitNeeded = false;
    }
  }

-  if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed);  // neither bcastDeep call ran, so wait here before the tail copy

  constexpr int UnrollPeers = 8;
  size_t nSufElts = (nBytes-cursor)/sizeof(T);
-  bcastEnds<UnrollPeers>(prim, tn, t, input, output, inPlace, nElts, nPreBytes/sizeof(T), nSufElts);
+  bcastEnds<UnrollPeers>(handler, tn, t, input, output, inPlace, nElts, nPreBytes/sizeof(T), nSufElts);  // unaligned prefix + leftover suffix
 }

-__device__ __forceinline__ void ncclSymRun_AllGather_ST(ncclSymDevArgs const* args) {
-  ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier);
-  int const& rank = prim.rank;
+__device__ __forceinline__ void ncclSymkRun_AllGather_ST(ncclSymkDevWorkArgs const* args) {  // AllGather kernel body: each work item is copied out through bcast()
+  ncclSymkArgsHandler handler{args};
+  ncclLsaBarrierSession<ncclCoopCta> bar{
+    ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x
+  };
+  int const& rank = handler.comm.rank;

-  // Threads numbered over rank.
-  int bt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
-                     prim.block, prim.nBlocks,
-                     threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
-  int btn = prim.nBlocks*blockDim.x;
+  bar.arrive(ncclCoopCta(), cuda::memory_order_relaxed);  // arrive now; the matching wait is deferred into bcast() via waitNeeded

-  prim.barrierArrive(ncclCoopCta(), /*release=*/false);
-  //prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  bool waitNeeded = true;  // cleared after the first work item, so later items do not wait again
+  handler.forEachWork<char>(
+      [&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts,
+                    ncclSymPtr<char> input, ncclSymPtr<char> output) {
+        // Threads numbered over rank.
+        int bt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
+                           block, nBlocks,
+                           threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
+        int btn = nBlocks*blockDim.x;  // total threads across the nBlocks blocks of this work item

-  bcast(prim, btn, bt, /*waitNeeded=*/true, (char*)args->input, (char*)args->output + rank*args->nElts, args->nElts);
+        bcast(handler, btn, bt, nBlocks, waitNeeded, bar, input, output + rank*nAllElts, nElts);  // destination starts rank*nAllElts elements into output

-  prim.barrierArrive(ncclCoopCta(), /*release=*/true);
-  prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+        waitNeeded = false;
+      }
+    );
+
+  bar.sync(ncclCoopCta(), cuda::memory_order_release);  // final barrier once all work items have been issued
 }

-
 template<typename T>
 static __device__ void bcastMultimem(
-    ncclSymPrims& prim, int tn, int t, T* input, T* output, size_t nElts
+    ncclSymkArgsHandler& handler, int tn, int t, ncclSymPtr<T> input, ncclSymPtr<T> output, size_t nElts
  ) {
-  // Move output to multimem
-  output = prim.multimemPtr(output);
-
-  uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
-  uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
  size_t nBytes = nElts*sizeof(T);
-
-  uint32_t nPreBytes = (16-inputUptr)%16;
+  uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input.localPtr());
+  uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output.multimemPtr(handler.comm.lsaMultimem));
+  uint32_t nPreBytes = (16 - input.offset)%16;
  nPreBytes = min((size_t)nPreBytes, nBytes);
  uintptr_t nSufBytes;

@@ -230,51 +230,52 @@ static __device__ void bcastMultimem(
    uintptr_t cursor = i < nPreBytes ? i : nBytes-nSufBytes+(i-nPreBytes);
    BytePack<sizeof(T)> val = *reinterpret_cast<BytePack<sizeof(T)>*>(inputUptr + cursor);
    multimem_st_global(outputUptr + cursor, val);
-    cursor += tn*sizeof(T);
  }
 }

-__device__ __forceinline__ void ncclSymRun_AllGather_STMC(ncclSymDevArgs const* args) {
-  ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem);
-  int const& rank = prim.rank;
+__device__ __forceinline__ void ncclSymkRun_AllGather_STMC(ncclSymkDevWorkArgs const* args) {  // AllGather kernel body that stores through bcastMultimem()
+  ncclSymkArgsHandler handler{args};
+  ncclLsaBarrierSession<ncclCoopCta> bar(
+    ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x, /*multimem=*/true
+  );
+  int const& rank = handler.comm.rank;

-  char* input = args->input;
-  char* output = args->output;
-  size_t bytes = args->nElts;
-  // Round robin memory to blocks.
-  int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
-                    prim.block, prim.nBlocks,
-                    threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
-  int tn = prim.nBlocks*blockDim.x;
+  bar.sync(ncclCoopCta(), cuda::memory_order_relaxed);  // full barrier before any work item runs (replaces the old arrive+wait pair)

-  prim.barrierArrive(ncclCoopCta(), /*release=*/false);
-  prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  handler.forEachWork<char>(
+      [&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts,
+                    ncclSymPtr<char> input, ncclSymPtr<char> output) {
+        // Round robin memory to blocks.
+        int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
+                          block, nBlocks,
+                          threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
+        int tn = nBlocks*blockDim.x;  // total threads across the nBlocks blocks of this work item

-  bcastMultimem(prim, tn, t, input, output + rank*bytes, bytes);
+        bcastMultimem(handler, tn, t, input, output + rank*nAllElts, nElts);  // destination starts rank*nAllElts elements into output
+      }
+    );

-  prim.barrierArrive(ncclCoopCta(), /*release=*/true);
-  prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  bar.sync(ncclCoopCta(), cuda::memory_order_release);  // final barrier once all work items have been issued
 }

 template<typename EltType>
 static __device__ void allgather_LL_body(
-    ncclSymPrims &prim, EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts
+    ncclSymkArgsHandler& handler, ncclLLA2ASession<ncclCoopCta>& lla2a,
+    EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts
  ) {
  using Pack = BytePack<8>;
  constexpr int EltPerPack = 8/sizeof(EltType);
-
-  ncclCoopCta cta;
-  int rank = prim.rank;
-  int nRanks = prim.nRanks;
-  constexpr int tn = ncclSymMaxThreads;
+  int const& rank = handler.comm.rank;
+  int const& nRanks = handler.comm.nRanks;
  int t = threadIdx.x;
+  constexpr int tn = ncclSymkMaxThreads;

  #pragma unroll 1
  while (0 < nElts) {
    int nIterPacks = min(nPacks, tn);
    if (t < nIterPacks) {
      Pack x = loadPack<Pack>(input, t*EltPerPack, nElts);
-      prim.bcastLL(/*slot=*/nIterPacks*rank + t, x);
+      lla2a.bcast(/*slot=*/nIterPacks*rank + t, x);
    }

    int tn_div_nPacks = tn/nIterPacks;
@@ -287,7 +288,7 @@ static __device__ void allgather_LL_body(
      #pragma unroll 1
      for (int i = t; i < (nRanks*nIterPacks & -(Unroll*tn)); i += Unroll*tn) {
        Pack got[Unroll];
-        prim.template recvLL<Unroll, Unroll>(i, Unroll, tn, /*&*/got);
+        lla2a.template recvUnrolled<Unroll, Unroll>(i, Unroll, tn, /*&*/got);
        #pragma unroll
        for (int u=0; u < Unroll; u++) {
          storePack<Pack>(output + peer*nStrideElts, pack*EltPerPack, nElts, got[u]);
@@ -302,7 +303,7 @@ static __device__ void allgather_LL_body(
      if (i + n*tn < nRanks*nIterPacks) n += 1;
      if (n != 0) {
        Pack got[Unroll];
-        prim.template recvLL<1, Unroll>(i, n, tn, /*&*/got);
+        lla2a.template recvUnrolled<1, Unroll>(i, n, tn, /*&*/got);
        #pragma unroll
        for (int u=0; u < Unroll; u++) {
          if (u != 0 && u == n) break;
@@ -316,7 +317,7 @@ static __device__ void allgather_LL_body(
      // The non-unrolled but "obviously correct" implementation for reference.
      #pragma unroll 1
      for (int i = t; i < nRanks*nIterPacks; i += tn) {
-        Pack got = prim.template recvLL<Pack>(i);
+        Pack got = lla2a.template recv<Pack>(i);
        storePack(output + peer*nStrideElts, pack*EltPerPack, nElts, got);
        peer += tn_div_nPacks;
        pack += tn_mod_nPacks;
@@ -324,7 +325,7 @@ static __device__ void allgather_LL_body(
      }
    #endif

-    prim.endLL(cta);
+    lla2a.endEpoch(ncclCoopCta());

    input += tn*EltPerPack;
    output += tn*EltPerPack;
@@ -333,38 +334,41 @@ static __device__ void allgather_LL_body(
  }
 }

-static __device__ void ncclSymRun_AllGather_LL_impl(ncclSymDevArgs const* args, bool multimem) {
-  ncclSymPrims prim(args->comm, ncclSymPrims_UseLL | multimem*ncclSymPrims_UseMultimem);
+static __device__ void ncclSymkRun_AllGather_LL_impl(ncclSymkDevWorkArgs const* args, bool multimem) {  // shared body of the LL and LLMC AllGather kernels
+  ncclSymkArgsHandler handler{args};
+  ncclLLA2ASession<ncclCoopCta> lla2a(
+    ncclCoopCta(), handler.comm, ncclTeamLsa(handler.comm), handler.lsaLLA2A, blockIdx.x, /*maxElts=*/ncclSymkMaxThreads, multimem, handler.comm.lsaMultimem
+  );
+
  using Pack = BytePack<8>;
  constexpr int BytePerPack = 8;
-  int nElts = args->nElts;
-  int nPacks = divUp(nElts, BytePerPack);

-  uint32_t nPackPerBlock, nPackModBlock;
-  idivmodFast32(&nPackPerBlock, &nPackModBlock, nPacks, prim.nBlocks, prim.nBlocks_rcp32);
-  int blockPackBegin = prim.block*nPackPerBlock + minval<int>(prim.block, nPackModBlock);
-  int blockPackEnd = blockPackBegin + nPackPerBlock + (prim.block < nPackModBlock ? 1 : 0);
-  int nBlockPacks = blockPackEnd - blockPackBegin;
-  int nBlockElts = nElts - blockPackBegin*BytePerPack;
-  nBlockElts = min(nBlockElts, nBlockPacks*BytePerPack);
-  char* blockInput = args->input + blockPackBegin*BytePerPack;
-  char* blockOutput = args->output + blockPackBegin*BytePerPack;
+  handler.singleWork<char>(
+      [&]__device__(int nElts, int nAllElts,
+                    ncclSymPtr<char> input, ncclSymPtr<char> output) {
+        int nPacks = divUp(nElts, BytePerPack);  // number of 8-byte packs covering nElts bytes

-  uint32_t lowBits = args->nElts;
-  lowBits |= (uint32_t)reinterpret_cast<uintptr_t>(args->input);
-  lowBits |= (uint32_t)reinterpret_cast<uintptr_t>(args->output);
-  if (__builtin_expect(lowBits%8 == 0, true)) {
-    // NOTE: Specializing for 8-byte alignment in one case help at size=65K: 8.9us vs 5.6us
-    allgather_LL_body(prim, (BytePack<8>*)blockInput, (BytePack<8>*)blockOutput, nBlockElts/8, nBlockPacks, nElts/8);
-  } else {
-    allgather_LL_body(prim, blockInput, blockOutput, nBlockElts, nBlockPacks, nElts);
-  }
+        char* blockInput = input.localPtr();
+        char* blockOutput = output.localPtr();
+
+        uint32_t lowBits = nElts;  // OR of the size and both local addresses; low 3 bits clear means all are 8-byte aligned
+        lowBits |= (uintptr_t)blockInput;
+        lowBits |= (uintptr_t)blockOutput;
+        if (__builtin_expect(lowBits%8 == 0, true)) {
+          // NOTE: Specializing for 8-byte alignment in one case helps at size=65K: 8.9us vs 5.6us
+          allgather_LL_body(handler, lla2a, (BytePack<8>*)blockInput, (BytePack<8>*)blockOutput,
+                            nElts/8, nPacks, nAllElts/8);
+        } else {
+          allgather_LL_body(handler, lla2a, blockInput, blockOutput, nElts, nPacks, nAllElts);  // byte-granular fallback for unaligned data
+        }
+      }
+    );
 }

-__device__ __forceinline__ void ncclSymRun_AllGather_LL(ncclSymDevArgs const* args) {
-  ncclSymRun_AllGather_LL_impl(args, /*multimem=*/false);
+__device__ __forceinline__ void ncclSymkRun_AllGather_LL(ncclSymkDevWorkArgs const* args) {  // LL AllGather entry point: shared impl with multimem disabled
+  ncclSymkRun_AllGather_LL_impl(args, /*multimem=*/false);
 }

-__device__ __forceinline__ void ncclSymRun_AllGather_LLMC(ncclSymDevArgs const* args) {
-  ncclSymRun_AllGather_LL_impl(args, /*multimem=*/true);
+__device__ __forceinline__ void ncclSymkRun_AllGather_LLMC(ncclSymkDevWorkArgs const* args) {  // LLMC AllGather entry point: shared impl with multimem enabled
+  ncclSymkRun_AllGather_LL_impl(args, /*multimem=*/true);
 }
@@ -1,38 +1,41 @@
 // Modification Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT 
+// SPDX-License-Identifier: MIT

-#include "symmetric.h"
+#include "sym_kernels.h"
+#include "nccl_device.h"
 #include "symmetric/kernel.h"
 #include "symmetric/primitives.h"

 template<int BytePerPack, int UnrollPacks, int UnrollPeers, typename T, typename Red>
 static __device__ __forceinline__ void allreduceDeep(
-    ncclSymPrims& prim, int tn, int t, bool waitNeeded,
-    Red red, char* inputRank0, char* outputRank0, int32_t nIters
+    ncclSymkArgsHandler const& handler, int tn, int t,
+    bool waitNeeded, ncclLsaBarrierSession<ncclCoopCta>& bar,
+    Red red, ncclSymPtr<char> input, ncclSymPtr<char> output, int32_t nIters
  ) {
  using Pack = BytePack<BytePerPack>;
  using Acc = typename Red::EltType;
  using AccPack = BytePack<BytePerPack*sizeof(Acc)/sizeof(T)>;

+  ncclTeam world = ncclTeamWorld(handler.comm);
  int wn = tn/WARP_SIZE;
  int w = t/WARP_SIZE;
  int lane = t%WARP_SIZE;
-  int const& rank = prim.rank;
-  int const& nRanks = prim.nRanks;
-  uint32_t const& stride4G = prim.stride4G;
-  Pack* inpRank0 = (Pack*)inputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
-  Pack* outRank0 = (Pack*)outputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
+  int const& rank = handler.comm.rank;
+  int const& nRanks = handler.comm.nRanks;
+
+  ncclSymPtr<Pack> inpPacks = (ncclSymPtr<Pack>)input + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
+  ncclSymPtr<Pack> outPacks = (ncclSymPtr<Pack>)output + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
  Pack acc0[UnrollPacks];

  nIters -= w;
  if (0 < nIters) {
    #pragma unroll
    for (int u=0; u < UnrollPacks; u++) {
-      acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE];
+      acc0[u] = inpPacks.peerPtr(world, rank)[u*WARP_SIZE];
    }
  }

-  if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed);

  if (0 < nIters) {
    while (true) {
@@ -42,7 +45,7 @@ static __device__ __forceinline__ void allreduceDeep(
      { Pack tmp1[UnrollPacks];
        #pragma unroll
        for (int u=0; u < UnrollPacks; u++) {
-          tmp1[u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE];
+          tmp1[u] = inpPacks.peerPtr(world, r)[u*WARP_SIZE];
        }
        #pragma unroll
        for (int u=0; u < UnrollPacks; u++) {
@@ -67,7 +70,7 @@ static __device__ __forceinline__ void allreduceDeep(
            if (partial && ur!=0 && dr+ur == nRanks) break;
            #pragma unroll UnrollPacks
            for (int u=0; u < UnrollPacks; u++) {
-              tmp1[ur][u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE];
+              tmp1[ur][u] = inpPacks.peerPtr(world, r)[u*WARP_SIZE];
            }
            if (++r == nRanks) r = 0;
          }
@@ -98,22 +101,22 @@ static __device__ __forceinline__ void allreduceDeep(
            if (partial && dr == nRanks) break;
            #pragma unroll UnrollPacks
            for (int u=0; u < UnrollPacks; u++) {
-              add4G(outRank0, r*stride4G)[u*WARP_SIZE] = acc0[u];
+              outPacks.peerPtr(world, r)[u*WARP_SIZE] = acc0[u];
            }
            if (++r == nRanks) r = 0;
          }
        }
      }

-      inpRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE;
-      outRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE;
+      inpPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE;
+      outPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE;
      nIters -= wn;
      if (nIters <= 0) break;

      // Load data for next iteration.
      #pragma unroll
      for (int u=0; u < UnrollPacks; u++) {
-        acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE];
+        acc0[u] = inpPacks.peerPtr(world, rank)[u*WARP_SIZE];
      }
    }
  }
@@ -121,21 +124,23 @@ static __device__ __forceinline__ void allreduceDeep(

 template<int UnrollPeers, typename Red, typename T>
 static __device__ __forceinline__ void allreduceEnds(
-    ncclSymPrims& prim, int tn, int t, Red red,
-    T* inputRank0, T* outputRank0, size_t nElts, uint32_t nPreElts, size_t nSufElts
+    ncclSymkArgsHandler const& handler, int tn, int t, Red red,
+    ncclSymPtr<T> input, ncclSymPtr<T> output,
+    size_t nElts, uint32_t nPreElts, size_t nSufElts
  ) {
  using Acc = typename Red::EltType;

-  int const& rank = prim.rank;
-  int const& nRanks = prim.nRanks;
-  uint32_t const& stride4G = prim.stride4G;
-  BytePack<sizeof(T)>* inpRank0 = (BytePack<sizeof(T)>*)inputRank0;
-  BytePack<sizeof(T)>* outRank0 = (BytePack<sizeof(T)>*)outputRank0;
+  ncclTeam world = ncclTeamWorld(handler.comm);
+  int const& rank = handler.comm.rank;
+  int const& nRanks = handler.comm.nRanks;
+
+  ncclSymPtr<BytePack<sizeof(T)>> inpPacks = (ncclSymPtr<BytePack<sizeof(T)>>)input;
+  ncclSymPtr<BytePack<sizeof(T)>> outPacks = (ncclSymPtr<BytePack<sizeof(T)>>)output;

  #pragma unroll 1
  for (size_t i = t; i < nPreElts+nSufElts; i += tn) {
    size_t elt = i < nPreElts ? i : nElts-nSufElts-nPreElts+i;
-    BytePack<sizeof(T)> acc0 = *add4G(inpRank0+elt, rank*stride4G);
+    BytePack<sizeof(T)> acc0 = inpPacks.peerPtr(world, rank)[elt];
    BytePack<sizeof(Acc)> acc1;
    BytePack<sizeof(T)> tmp[UnrollPeers];
    int dr = 1;
@@ -154,7 +159,7 @@ static __device__ __forceinline__ void allreduceEnds(
        #pragma unroll
        for (int u=0; u < UnrollPeers-partial; u++) {
          if (partial && u!=0 && dr+u == nRanks) break;
-          tmp[u] = *add4G(inpRank0+elt, r*stride4G);
+          tmp[u] = inpPacks.peerPtr(world, r)[elt];
          r += 1;
          if (r == nRanks) r = 0;
        }
@@ -182,7 +187,7 @@ static __device__ __forceinline__ void allreduceEnds(
        #pragma unroll
        for (int u=0; u < UnrollPeers-partial; u++) {
          if (partial && dr+u == nRanks) break;
-          *add4G(outRank0+elt, r*stride4G) = acc0;
+          outPacks.peerPtr(world, r)[elt] = acc0;
          r += 1;
          if (r == nRanks) r = 0;
        }
@@ -193,35 +198,33 @@ static __device__ __forceinline__ void allreduceEnds(

 template<typename Red, typename T>
 static __device__ void allreduce(
-    ncclSymPrims& prim, int tn, int t, bool waitNeeded,
-    Red red, T* input, T* output, size_t nElts
+    ncclSymkArgsHandler const& handler, int tn, int t, int nBlocks,
+    bool waitNeeded, ncclLsaBarrierSession<ncclCoopCta>& bar,
+    Red red, ncclSymPtr<T> input, ncclSymPtr<T> output, size_t nElts
  ) {
-  int nRanks = prim.nRanks;
-  int nBlocks = prim.nBlocks;
-  // Mpve to rank=0
-  input = prim.peerPtr(0, input);
-  output = prim.peerPtr(0, output);
-
-  uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
-  uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
+  int const& nRanks = handler.comm.nRanks;
+  int const& nRanks_rcp32 = handler.nRanks_rcp32;
  size_t nBytes = nElts*sizeof(T);
+  uint32_t nBlocks_rcp32 = nccl::utility::idivRcp32_upto64(nBlocks);
+  uint32_t nRanks_nBlocks_rcp32 = nccl::utility::imulRcp32(nRanks, nRanks_rcp32, nBlocks, nBlocks_rcp32);

-  uint32_t nPreBytes = (16u - inputUptr)%16u;
+  uint32_t nPreBytes = (16u - input.offset)%16u;
  nPreBytes = min((size_t)nPreBytes, nBytes);
  uintptr_t cursor = nPreBytes;

  constexpr int MinWarpPerBlock = 4;

-  if ((inputUptr-outputUptr)%16 == 0) {
+  if ((input.offset - output.offset)%16 == 0) {
    constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2;
    constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
    uint32_t chunks = (nBytes-cursor)/BytePerChunk;
-    chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32);
+    chunks -= imodFast32(chunks, nRanks*nBlocks, nRanks_nBlocks_rcp32);
    if (chunks != 0) {
      uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
      allreduceDeep<BytePerPack, UnrollPacks, UnrollPeers, T>(
-        prim, tn, t, waitNeeded, red,
-        (char*)input + cursor, (char*)output + cursor,
+        handler, tn, t, waitNeeded, bar, red,
+        (ncclSymPtr<char>)input + cursor,
+        (ncclSymPtr<char>)output + cursor,
        chunks*MinWarpPerBlock
      );
      cursor = cursorAfter;
@@ -229,16 +232,17 @@ static __device__ void allreduce(
    }
  }

-  if (sizeof(T) == 4 || (sizeof(T) < 4 && (inputUptr-outputUptr)%4 == 0)) {
+  if (sizeof(T) == 4 || (sizeof(T) < 4 && (input.offset - output.offset)%4 == 0)) {
    constexpr int BytePerPack = 4, UnrollPacks = 4, UnrollPeers = 4;
    constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
    uint32_t chunks = (nBytes-cursor)/BytePerChunk;
-    chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32);
+    chunks -= imodFast32(chunks, nRanks*nBlocks, nRanks_nBlocks_rcp32);
    if (chunks != 0) {
      uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
      allreduceDeep<(sizeof(T) <= BytePerPack ? BytePerPack : 0), UnrollPacks, UnrollPeers, T>(
-        prim, tn, t, waitNeeded, red,
-        (char*)input + cursor, (char*)output + cursor,
+        handler, tn, t, waitNeeded, bar, red,
+        (ncclSymPtr<char>)input + cursor,
+        (ncclSymPtr<char>)output + cursor,
        chunks*MinWarpPerBlock
      );
      cursor = cursorAfter;
@@ -246,46 +250,51 @@ static __device__ void allreduce(
    }
  }

-  if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed);

  constexpr int UnrollPeers = 8;
  size_t nSufElts = (nBytes-cursor)/sizeof(T);
-  allreduceEnds<UnrollPeers>(prim, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts);
+  allreduceEnds<UnrollPeers>(handler, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts);
 }

-
 template<template<typename> typename Red, typename T>
-__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLD_AGxST(ncclSymDevArgs const* args) {
-  ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier);
-  int /*const&*/ rank = prim.rank;
-  int /*const&*/ nRanks = prim.nRanks;
-  Red<typename ncclSymAccumType<Red, T, /*nvls=*/false>::Type> red(args->redOpArg);
+__device__ __forceinline__ void ncclSymkRun_AllReduce_RSxLD_AGxST(ncclSymkDevWorkArgs const* args) {  // AllReduce kernel body: each work item goes through allreduce()
+  ncclSymkArgsHandler handler{args};
+  ncclLsaBarrierSession<ncclCoopCta> bar{
+    ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x
+  };

-  // Threads numbered globally such that we round robin warps by rank then block.
-  int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
-                     rank, nRanks,
-                     prim.block, prim.nBlocks,
-                     threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
-  int gtn = nRanks*prim.nBlocks*blockDim.x;
+  Red<typename ncclSymkAccumType<Red, T, /*nvls=*/false>::Type> red(handler.devWork->redOpArg);  // reduction functor built from the work's redOpArg

-  prim.barrierArrive(ncclCoopCta(), /*release=*/false);
-  //prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  int const& rank = handler.comm.rank;
+  int const& nRanks = handler.comm.nRanks;

-  allreduce(prim, gtn, gt, /*waitNeeded=*/true, red, (T*)args->input, (T*)args->output, args->nElts);
+  bar.arrive(ncclCoopCta(), cuda::memory_order_relaxed);  // arrive now; the matching wait is deferred into allreduce() via waitNeeded

-  prim.barrierArrive(ncclCoopCta(), /*release=*/true);
-  prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  bool waitNeeded = true;  // cleared after the first work item, so later items do not wait again
+  handler.forEachWork<T>(
+      [&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts,
+                    ncclSymPtr<T> input, ncclSymPtr<T> output) {
+        // Threads numbered globally such that we round robin warps by rank then block.
+        int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
+                           rank, nRanks,
+                           block, nBlocks,
+                           threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
+        int gtn = nRanks*nBlocks*blockDim.x;  // total threads across all ranks and the nBlocks blocks of this work item
+
+        allreduce(handler, gtn, gt, nBlocks, waitNeeded, bar, red, input, output, nElts);
+
+        waitNeeded = false;
+      }
+    );
+
+  bar.sync(ncclCoopCta(), cuda::memory_order_release);  // final barrier once all work items have been issued
 }

-
 template<typename Red, typename T>
 static __device__ void allreduceMultimem(
-    ncclSymPrims& prim, int tn, int t, Red red, T* input, T* output, size_t nElts
+    int tn, int t, Red red, T* input, T* output, size_t nElts
  ) {
-  // Mpve to multimem
-  input = prim.multimemPtr(input);
-  output = prim.multimemPtr(output);
-
  uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
  uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
  size_t nBytes = nElts*sizeof(T);
@@ -330,106 +339,132 @@ static __device__ void allreduceMultimem(
    uintptr_t cursor = i < nPreBytes ? i : nBytes-nSufBytes+(i-nPreBytes);
    BytePack<sizeof(T)> val = applyLoadMultimem<Red, sizeof(T)>(red, inputUptr + cursor);
    multimem_st_global(outputUptr + cursor, val);
-    cursor += tn*sizeof(T);
  }
 }

 template<template<typename> typename Red, typename T>
-__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLDMC_AGxSTMC(ncclSymDevArgs const* args) {
-  ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem);
-  Red<typename ncclSymAccumType<Red, T, /*nvls=*/true>::Type> red(args->redOpArg);
+__device__ __forceinline__ void ncclSymkRun_AllReduce_RSxLDMC_AGxSTMC(ncclSymkDevWorkArgs const* args) {  // AllReduce kernel body: each work item goes through allreduceMultimem()
+  ncclSymkArgsHandler handler{args};
+  ncclLsaBarrierSession<ncclCoopCta> bar{
+    ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x, /*multimem=*/true
+  };

-  // Threads numbered globally such that we round robin warps by rank then block.
-  int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
-                     prim.rank, prim.nRanks,
-                     prim.block, prim.nBlocks,
-                     threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
-  int gtn = prim.nRanks*prim.nBlocks*blockDim.x;
+  Red<typename ncclSymkAccumType<Red, T, /*nvls=*/true>::Type> red(handler.devWork->redOpArg);  // reduction functor built from the work's redOpArg

-  prim.barrierArrive(ncclCoopCta(), /*release=*/false);
-  prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  int const& rank = handler.comm.rank;
+  int const& nRanks = handler.comm.nRanks;
+  auto const& multimem = handler.comm.lsaMultimem;  // used below to form the multimem input/output pointers

-  allreduceMultimem(prim, gtn, gt, red, (T*)args->input, (T*)args->output, args->nElts);
+  bar.sync(ncclCoopCta(), cuda::memory_order_relaxed);  // full barrier before any work item runs (replaces the old arrive+wait pair)

-  prim.barrierArrive(ncclCoopCta(), /*release=*/true);
-  prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  handler.forEachWork<T>(
+      [&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts,
+                    ncclSymPtr<T> input, ncclSymPtr<T> output) {
+        // Threads numbered globally such that we round robin warps by rank then block.
+        int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
+                           rank, nRanks,
+                           block, nBlocks,
+                           threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
+        int gtn = nRanks*nBlocks*blockDim.x;  // total threads across all ranks and the nBlocks blocks of this work item
+
+        allreduceMultimem(gtn, gt, red, input.multimemPtr(multimem), output.multimemPtr(multimem), nElts);  // pointers are resolved here, so the callee no longer takes a prim/handler
+      }
+    );
+
+  bar.sync(ncclCoopCta(), cuda::memory_order_release);  // final barrier once all work items have been issued
 }

 template<template<typename> typename Red, typename T>
-__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R_impl(ncclSymDevArgs const* args, bool multimem) {
-  ncclSymPrims prim(args->comm, ncclSymPrims_UseLL | multimem*ncclSymPrims_UseMultimem);
-  int /*const&*/ rank = prim.rank;
-  using Acc = typename ncclSymAccumType<Red, T, /*nvls=*/false>::Type;
-  Red<Acc> red(args->redOpArg);
+__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLL_R_impl(ncclSymkDevWorkArgs const* args, bool multimem) {  // shared body of the AGxLL_R and AGxLLMC_R AllReduce kernels
+  ncclSymkArgsHandler handler{args};
+  ncclLLA2ASession<ncclCoopCta> lla2a(
+    ncclCoopCta(), handler.comm, ncclTeamLsa(handler.comm), handler.lsaLLA2A,
+    blockIdx.x, ncclSymkMaxThreads, multimem, handler.comm.lsaMultimem
+  );
+
+  int const& rank = handler.comm.rank;
+  int const& nRanks = handler.comm.nRanks;
+  using Acc = typename ncclSymkAccumType<Red, T, /*nvls=*/false>::Type;
+  Red<Acc> red(handler.devWork->redOpArg);  // reduction functor over the accumulator type

  using Pack = BytePack<8>;
  using AccPack = BytePack<8*sizeof(Acc)/sizeof(T)>;
  constexpr int EltPerPack = 8/sizeof(T);
-  int nElts = args->nElts;
-  int nPacks = divUp(nElts, EltPerPack);

-  bool packAligned = 8 <= alignof(T) || (
-      args->nElts*sizeof(T) |
-      (uint32_t)reinterpret_cast<uintptr_t>(args->input) |
-      (uint32_t)reinterpret_cast<uintptr_t>(args->output)
-    )%8 == 0;
+  handler.singleWork<T>(
+      [&]__device__(int nElts, int nAllElts,
+                    ncclSymPtr<T> inputPtr, ncclSymPtr<T> outputPtr) {
+        int nPacks = divUp(nElts, EltPerPack);  // number of 8-byte packs covering nElts elements

-  uint32_t nPackPerBlock, nPackModBlock;
-  idivmodFast32(&nPackPerBlock, &nPackModBlock, nPacks, prim.nBlocks, prim.nBlocks_rcp32);
-  int begin = prim.block*nPackPerBlock + minval<int>(prim.block, nPackModBlock);
-  int end = begin + nPackPerBlock + (prim.block < nPackModBlock ? 1 : 0);
+        T* input = (T*)inputPtr.localPtr();
+        T* output = (T*)outputPtr.localPtr();

-  nPacks = end - begin;
-  nElts -= begin*EltPerPack;
-  nElts = min(nElts, nPacks*EltPerPack);
-  T* input = (T*)args->input + begin*EltPerPack;
-  T* output = (T*)args->output + begin*EltPerPack;
+        bool packAligned = 8 <= alignof(T) || (nElts*sizeof(T) | (uintptr_t)input | (uintptr_t)output)%8 == 0;  // byte size and both local addresses are multiples of 8

-  ncclCoopCta cta;
-  int t = threadIdx.x;
-  int tn = ncclSymMaxThreads;
+        ncclCoopCta cta;
+        int t = threadIdx.x;
+        int tn = ncclSymkMaxThreads;  // packs handled per loop iteration (one per thread)

-  if (__builtin_expect(packAligned, true)) {
-    #pragma unroll 1
-    while (0 < nPacks) {
-      if (t < nPacks) {
-        int nIterPacks = min(nPacks, tn);
-        Pack inp = loadPack<Pack>((Pack*)input, t, nPacks);
-        prim.bcastLL(/*slot=*/nIterPacks*rank + t, inp);
-        Pack out = prim.template recvReduceLL<Pack, T>(t, nIterPacks, red);
-        storePack((Pack*)output, t, nPacks, out);
+        if (__builtin_expect(packAligned, true)) {
+          #pragma unroll 1
+          while (0 < nPacks) {
+            if (t < nPacks) {
+              int nIterPacks = min(nPacks, tn);
+              Pack inp = loadPack<Pack>((Pack*)input, t, nPacks);
+              lla2a.bcast(/*slot=*/nIterPacks*rank + t, inp);  // publish this rank's pack in its own slot
+              AccPack out = lla2a.template recvReduce</*Unroll=*/8, Pack>(
+                /*slotStart=*/t, /*slotCount=*/nRanks, /*slotStride=*/nIterPacks,
+                /*eltToAcc=*/[&] __device__ (Pack x)->AccPack {
+                  return applyCast<T, Acc>(x);
+                },
+                /*reduce=*/[&] __device__ (AccPack a, AccPack b)->AccPack {
+                  return applyReduce(red, a, b);
+                }
+              );
+              storePack((Pack*)output, t, nPacks, applyCast<Acc, T>(out));  // cast the accumulator back to T before storing
+            }
+            lla2a.endEpoch(cta);
+
+            input += tn*EltPerPack;
+            output += tn*EltPerPack;
+            nPacks -= tn;
+          }
+        } else {
+          #pragma unroll 1
+          while (0 < nElts) {
+            if (t*EltPerPack < nElts) {
+              int nIterPacks = min(nPacks, tn);
+              Pack inp = loadPack<Pack>(input, t*EltPerPack, nElts);  // element-indexed load, bounded by nElts
+              lla2a.bcast(/*slot=*/nIterPacks*rank + t, inp);
+              AccPack out = lla2a.template recvReduce</*Unroll=*/8, Pack>(
+                /*slotStart=*/t, /*slotCount=*/nRanks, /*slotStride=*/nIterPacks,
+                /*eltToAcc=*/[&] __device__ (Pack x)->AccPack {
+                  return applyCast<T, Acc>(x);
+                },
+                /*reduce=*/[&] __device__ (AccPack a, AccPack b)->AccPack {
+                  return applyReduce(red, a, b);
+                }
+              );
+              storePack(output, t*EltPerPack, nElts, applyCast<Acc, T>(out));
+            }
+            lla2a.endEpoch(cta);
+
+            input += tn*EltPerPack;
+            output += tn*EltPerPack;
+            nElts -= tn*EltPerPack;
+            nPacks -= tn;
+          }
+        }
      }
-      prim.endLL(cta);
-
-      input += tn*EltPerPack;
-      output += tn*EltPerPack;
-      nPacks -= tn;
-    }
-  } else {
-    #pragma unroll 1
-    while (0 < nElts) {
-      if (t*EltPerPack < nElts) {
-        int nIterPacks = min(nPacks, tn);
-        Pack inp = loadPack<Pack>(input, t*EltPerPack, nElts);
-        prim.bcastLL(/*slot=*/nIterPacks*rank + t, inp);
-        Pack out = prim.template recvReduceLL<Pack, T>(t, nIterPacks, red);
-        storePack(output, t*EltPerPack, nElts, out);
-      }
-      prim.endLL(cta);
-
-      input += tn*EltPerPack;
-      output += tn*EltPerPack;
-      nElts -= tn*EltPerPack;
-      nPacks -= tn;
-    }
-  }
+    );
 }

 template<template<typename> typename Red, typename T>
-__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R(ncclSymDevArgs const* args) {
-  ncclSymRun_AllReduce_AGxLL_R_impl<Red, T>(args, /*multimem=*/false);
+__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLL_R(ncclSymkDevWorkArgs const* args) {  // AGxLL_R entry point: shared impl with multimem disabled
+  ncclSymkRun_AllReduce_AGxLL_R_impl<Red, T>(args, /*multimem=*/false);
 }
+
 template<template<typename> typename Red, typename T>
-__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLLMC_R(ncclSymDevArgs const* args) {
-  ncclSymRun_AllReduce_AGxLL_R_impl<Red, T>(args, /*multimem=*/true);
+__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLLMC_R(ncclSymkDevWorkArgs const* args) {  // AGxLLMC_R entry point: shared impl with multimem enabled
+  ncclSymkRun_AllReduce_AGxLL_R_impl<Red, T>(args, /*multimem=*/true);
 }
@@ -4,6 +4,7 @@
 #!/usr/bin/env python3
 import os
 import sys
+import shutil

 ################################################################################
 # The first command line argument is the path to the directory to generate and
@@ -13,8 +14,11 @@ gensrc = sys.argv[1]

 if os.path.exists(gensrc):
  for name in os.listdir(gensrc):
-    os.remove(os.path.join(gensrc, name))
-    #os.truncate(os.path.join(gensrc, name), 0)
+    path = os.path.join(gensrc, name)
+    if os.path.isdir(path) and not os.path.islink(path):
+      shutil.rmtree(path)  # real directory: remove recursively
+    else:
+      os.remove(path)  # files, symlinks (even broken / to dirs), other entries
 else:
  os.mkdir(gensrc)

@@ -97,7 +101,7 @@ def enumerate_kernels():
        yield Rec(coll="ReduceScatter", algo=algo, red=red, ty=ty)

 def required_cuda(k):
-  cudart, arch, specific_sms  = 0, 0, None
+  cudart, arch, specific_sms  = 0, 600, None
  is_nvls = k.algo in nvls_algos_by_coll.get(k.coll, [])
  if is_nvls:
    cudart = max(cudart, 12010)
@@ -136,13 +140,13 @@ def kernel_gencode(k):

 def kernel_cname(k):
  if k.coll in reductions:
-    return paste("_", "ncclSymDevKernel", k.coll, k.algo, k.red, k.ty)
+    return paste("_", "ncclSymkDevKernel", k.coll, k.algo, k.red, k.ty)
  else:
-    return paste("_", "ncclSymDevKernel", k.coll, k.algo)
+    return paste("_", "ncclSymkDevKernel", k.coll, k.algo)

 def kernel_conds(k):
  cudart, arch, specific_sms = required_cuda(k)
-  if cudart == 0: return (None, None)
+  if cudart == 0 and arch == 0: return (None, None)

  cudart_cond = "CUDART_VERSION >= %d"%cudart
  if not specific_sms:
@@ -153,13 +157,13 @@ def kernel_conds(k):

 def instantiate(k):
  form_red_ty = (
-    "__global__ void {cname}(ncclSymDevArgs NCCL_GRID_CONSTANT const *args) {{\n"
-    "  ncclSymRun_{id}<{red}, {ty}>(args);\n"
+    "__global__ void {cname}(ncclSymkDevWorkArgs4K NCCL_GRID_CONSTANT const *args4K) {{\n"
+    "  ncclSymkRun_{id}<{red}, {ty}>(args4K->args);\n"
    "}}"
  )
  form = (
-    "__global__ void {cname}(ncclSymDevArgs NCCL_GRID_CONSTANT const *args) {{\n"
-    "  ncclSymRun_{id}(args);\n"
+    "__global__ void {cname}(ncclSymkDevWorkArgs4K NCCL_GRID_CONSTANT const *args4K) {{\n"
+    "  ncclSymkRun_{id}(args4K->args);\n"
    "}}"
  )

@@ -172,7 +176,7 @@ def instantiate(k):
  return inst

 def prototype(k):
-  return "__global__ void {cname}(ncclSymDevArgs const *args);".format(cname=kernel_cname(k))
+  return "__global__ void {cname}(ncclSymkDevWorkArgs4K const *args4K);".format(cname=kernel_cname(k))

 ################################################################################

@@ -194,20 +198,22 @@ for coll in set(k.coll for k in enumerate_kernels()):
  if (fname, coll) not in kernels_by_file:
    kernels_by_file[fname, coll] = []

+files_to_print = ""
 # Generate each kernel instantiation file
 for (fname, coll), ks in kernels_by_file.items():
+  files_to_print += fname + ";"
  with open(os.path.join(gensrc, fname), "w") as f:
    print("-- Generating %s" % os.path.join(gensrc, fname))
-    emitln(f, '#include "symmetric.h"')
+    emitln(f, '#include "sym_kernels.h"')
    emitln(f, '#include "symmetric/kernel.h"')
    emitln(f, '#include "symmetric/{coll}.h"'.format(coll=coll_to_lower[coll]))
    for k in ks:
      emitln(f, instantiate(k))

-# Generate <gensrc>/symmetric_host.cc
-with open(os.path.join(gensrc, "symmetric_kernels.cc"), "w") as f:
+# Generate <gensrc>/sym_kernels_host.cc
+with open(os.path.join(gensrc, "sym_kernels_host.cc"), "w") as f:
  print("-- Generating %s" % os.path.join(gensrc, "symmetric_kernels.cc"))
-  emitln(f, '#include "symmetric.h"')
+  emitln(f, '#include "sym_kernels.h"')
  emitln(f, '#include "device.h"')
  emitln(f, '')

@@ -215,19 +221,19 @@ with open(os.path.join(gensrc, "symmetric_kernels.cc"), "w") as f:
    emitln(f, prototype(k))
  emitln(f, '')

-  emitln(f, 'extern int const ncclSymKernelCount = %d;' % len(list(enumerate_kernels())))
-  emitln(f, 'extern void* const ncclSymKernelList[] = {')
+  emitln(f, 'extern int const ncclSymkKernelCount = %d;' % len(list(enumerate_kernels())))
+  emitln(f, 'extern void* const ncclSymkKernelList[] = {')
  for k in enumerate_kernels():
    emitln(f, '(void*){cname},'.format(cname=kernel_cname(k)))
  emitln(f, 'nullptr};')
  emitln(f, '')

-  emitln(f, 'void* ncclSymGetKernelPtr(ncclSymKernelId id, int red, ncclDataType_t ty) {')
+  emitln(f, 'void* ncclSymkGetKernelPtr(ncclSymkKernelId id, int red, ncclDataType_t ty) {')
  indents += 1
  emitln(f, 'switch (id) {')
  emitln(f, 'default: return nullptr;')
  for (coll, algo), coll_algo_ks in partition(enumerate_kernels(), lambda k: (k.coll, k.algo)).items():
-    emitln(f, 'case ncclSymKernelId_'+coll+'_'+algo+':')
+    emitln(f, 'case ncclSymkKernelId_'+coll+'_'+algo+':')
    indents += 1
    if len(coll_algo_ks) == 1:
      emitln(f, 'return (void*)&'+kernel_cname(coll_algo_ks[0])+';')
@@ -4,27 +4,27 @@
 #ifndef NCCL_DEVICE_SYMMETRIC_KERNEL_H_
 #define NCCL_DEVICE_SYMMETRIC_KERNEL_H_

-#include "symmetric.h"
+#include "sym_kernels.h"

 template<template<typename> typename Red, typename T>
-__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R(struct ncclSymDevArgs const* args);
+__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLL_R(struct ncclSymkDevWorkArgs const* args);
 template<template<typename> typename Red, typename T>
-__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLLMC_R(struct ncclSymDevArgs const* args);
+__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLLMC_R(struct ncclSymkDevWorkArgs const* args);

 template<template<typename> typename Red, typename T>
-__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLD_AGxST(struct ncclSymDevArgs const* args);
+__device__ __forceinline__ void ncclSymkRun_AllReduce_RSxLD_AGxST(struct ncclSymkDevWorkArgs const* args);
 template<template<typename> typename Red, typename T>
-__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLDMC_AGxSTMC(struct ncclSymDevArgs const* args);
+__device__ __forceinline__ void ncclSymkRun_AllReduce_RSxLDMC_AGxSTMC(struct ncclSymkDevWorkArgs const* args);

-__device__ __forceinline__ void ncclSymRun_AllGather_LL(struct ncclSymDevArgs const* args);
-__device__ __forceinline__ void ncclSymRun_AllGather_LLMC(struct ncclSymDevArgs const* args);
-__device__ __forceinline__ void ncclSymRun_AllGather_ST(struct ncclSymDevArgs const* args);
-__device__ __forceinline__ void ncclSymRun_AllGather_STMC(struct ncclSymDevArgs const* args);
+__device__ __forceinline__ void ncclSymkRun_AllGather_LL(struct ncclSymkDevWorkArgs const* args);
+__device__ __forceinline__ void ncclSymkRun_AllGather_LLMC(struct ncclSymkDevWorkArgs const* args);
+__device__ __forceinline__ void ncclSymkRun_AllGather_ST(struct ncclSymkDevWorkArgs const* args);
+__device__ __forceinline__ void ncclSymkRun_AllGather_STMC(struct ncclSymkDevWorkArgs const* args);

 template<template<typename> typename Red, typename T>
-__device__ __forceinline__ void ncclSymRun_ReduceScatter_LL(struct ncclSymDevArgs const* args);
+__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LL(struct ncclSymkDevWorkArgs const* args);
 template<template<typename> typename Red, typename T>
-__device__ __forceinline__ void ncclSymRun_ReduceScatter_LD(struct ncclSymDevArgs const* args);
+__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LD(struct ncclSymkDevWorkArgs const* args);
 template<template<typename> typename Red, typename T>
-__device__ __forceinline__ void ncclSymRun_ReduceScatter_LDMC(struct ncclSymDevArgs const* args);
+__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LDMC(struct ncclSymkDevWorkArgs const* args);
 #endif
@@ -4,7 +4,7 @@
 #ifndef NCCL_DEVICE_SYMMETRIC_PRIMITIVES_H_
 #define NCCL_DEVICE_SYMMETRIC_PRIMITIVES_H_

-#include "symmetric.h"
+#include "sym_kernels.h"
 #include "bitops.h"
 #include "collectives.h"
 #include "op128.h"
@@ -28,453 +28,124 @@ static __device__ Int0 flattenIx(Int0 pos, Int1 size, Ints ...more) {
  return pos + size*flattenIx(more...);
 }

-// Precomputed integer reciprocoals for denominator values 1..64 inclusive.
-// Pass these to idivFast64() for fast division on the GPU.
-static __device__ uint64_t idivRcp64_upto64(int x) {
-  static constexpr uint64_t table[65] = {
-    idivRcp64(0x01), idivRcp64(0x01), idivRcp64(0x02), idivRcp64(0x03),
-    idivRcp64(0x04), idivRcp64(0x05), idivRcp64(0x06), idivRcp64(0x07),
-    idivRcp64(0x08), idivRcp64(0x09), idivRcp64(0x0a), idivRcp64(0x0b),
-    idivRcp64(0x0c), idivRcp64(0x0d), idivRcp64(0x0e), idivRcp64(0x0f),
-    idivRcp64(0x10), idivRcp64(0x11), idivRcp64(0x12), idivRcp64(0x13),
-    idivRcp64(0x14), idivRcp64(0x15), idivRcp64(0x16), idivRcp64(0x17),
-    idivRcp64(0x18), idivRcp64(0x19), idivRcp64(0x1a), idivRcp64(0x1b),
-    idivRcp64(0x1c), idivRcp64(0x1d), idivRcp64(0x1e), idivRcp64(0x1f),
-    idivRcp64(0x20), idivRcp64(0x21), idivRcp64(0x22), idivRcp64(0x23),
-    idivRcp64(0x24), idivRcp64(0x25), idivRcp64(0x26), idivRcp64(0x27),
-    idivRcp64(0x28), idivRcp64(0x29), idivRcp64(0x2a), idivRcp64(0x2b),
-    idivRcp64(0x2c), idivRcp64(0x2d), idivRcp64(0x2e), idivRcp64(0x2f),
-    idivRcp64(0x30), idivRcp64(0x31), idivRcp64(0x32), idivRcp64(0x33),
-    idivRcp64(0x34), idivRcp64(0x35), idivRcp64(0x36), idivRcp64(0x37),
-    idivRcp64(0x38), idivRcp64(0x39), idivRcp64(0x3a), idivRcp64(0x3b),
-    idivRcp64(0x3c), idivRcp64(0x3d), idivRcp64(0x3e), idivRcp64(0x3f),
-    idivRcp64(0x40)
-  };
-  return table[x];
-}
-
-static __device__ uint32_t idivRcp32_upto64(int x) {
-  return idivRcp64_upto64(x)>>32;
-}
-
 namespace {
-struct ncclCoopCta {
-  __device__ void sync() { __syncthreads(); }
-  __device__ int self() { return threadIdx.x; }
-  __device__ int count() { return blockDim.x; }
-};
-struct ncclCoopWarps {
-  int log2_nWarps;
-  __device__ void sync() {
-    asm volatile("barrier.sync %0, %1;" :: "r"(1 + (threadIdx.x>>(5+log2_nWarps))), "r"(32<<log2_nWarps) : "memory");
-  }
-  __device__ int self() { return threadIdx.x & ((32<<log2_nWarps)-1); }
-  __device__ int count() { return 32<<log2_nWarps; }
-};
-struct ncclCoopWarp {
-  __device__ void sync() { __syncwarp(); }
-  __device__ int self() { return threadIdx.x%32; }
-  __device__ int count() { return 32; }
-};
-}
+struct ncclSymkArgsHandler {
+  ncclDevComm const& comm;
+  ncclLLA2AHandle const& lsaLLA2A;
+  struct ncclSymkChannelWorkRange* channelWorkRange;
+  struct ncclSymkDevWork* devWork;
+  uint32_t nRanks_rcp32;

-namespace {
-static constexpr int ncclSymPrims_UseBarrier = 1;
-static constexpr int ncclSymPrims_UseLL = 2;
-static constexpr int ncclSymPrims_UseMultimem = 4;
-struct ncclSymPrims {
-  int flags;
-  int const &rank;
-  int const &nRanks;
-  uint32_t const &nRanks_rcp32;
-  int block, nBlocks;
-  uint32_t nBlocks_rcp32;
-  uint32_t nBlocks_nWarps_rcp32;
-  uint32_t nRanks_nBlocks_rcp32;
-  uint32_t nWarpPerRank, nWarpPerRank_rcp32;
-  struct ncclSymDevBase* const &base;
-  uintptr_t offsetMc;
+  __device__ ncclSymkArgsHandler(ncclSymkDevWorkArgs const* args):
+    comm(args->kcomm.devComm),
+    lsaLLA2A(args->kcomm.lsaLLA2A) {
+    channelWorkRange = args->getWorkRange();

-  uint32_t const &stride4G;
-  uint32_t barEpoch;
-  uint32_t llEpoch;
-
-  __device__ ncclSymPrims(ncclSymDevComm const &comm, int flags):
-    flags(flags),
-    rank(comm.rank),
-    nRanks(comm.nRanks),
-    nRanks_rcp32(comm.nRanks_rcp32),
-    block(blockIdx.x),
-    nBlocks(gridDim.x),
-    nBlocks_rcp32(idivRcp32_upto64(nBlocks)),
-    nBlocks_nWarps_rcp32(imulRcp32(nBlocks, nBlocks_rcp32, blockDim.x/32, idivRcp32_upto64(blockDim.x/32))),
-    nRanks_nBlocks_rcp32(imulRcp32(nRanks, nRanks_rcp32, gridDim.x, nBlocks_rcp32)),
-    nWarpPerRank(idivFast32(nBlocks*blockDim.x/32, nRanks, nRanks_rcp32)),
-    nWarpPerRank_rcp32(idivRcp32_upto64(nWarpPerRank)),
-    base(comm.base),
-    offsetMc((flags & ncclSymPrims_UseMultimem) ? (char*)comm.baseMc - (char*)base : 0x0),
-    stride4G(comm.stride4G) {
-
-    #if CUDART_VERSION >= 12030 && __CUDA_ARCH__ >= 900
-      cudaGridDependencySynchronize();
-    #endif
-
-    if ((flags & ncclSymPrims_UseBarrier) && threadIdx.x < nRanks) {
-      barEpoch = (flags & ncclSymPrims_UseMultimem) ? base->barEpochMc[block] : base->barEpochUc[block];
-    }
-    if (flags & ncclSymPrims_UseLL) llEpoch = base->llEpoch[block] + 2;
-  }
-  __device__  ~ncclSymPrims() {
-    if (threadIdx.x == 0) {
-      if (flags & ncclSymPrims_UseBarrier) {
-        ((flags & ncclSymPrims_UseMultimem) ? base->barEpochMc : base->barEpochUc)[block] = barEpoch;
-      }
-      if (flags & ncclSymPrims_UseLL) base->llEpoch[block] = llEpoch - 2;
-    }
+    devWork = args->getWorks(args->nMaxChannels);
+    nRanks_rcp32 = comm.nRanks_rcp32;
  }

  template<typename T>
-  __device__ T* peerPtr(int peer, T* selfPtr) {
-    return add4G(selfPtr, (peer-rank)*stride4G);
+    __device__ void getWorkRange(int block,
+                                 uint16_t& workLo, size_t& indexLo, uint16_t& workHi, size_t& indexHi) {
+    constexpr int EltPerCell = NCCL_SYM_KERNEL_CELL_SIZE / sizeof(T);
+    uint32_t fracLo, fracHi;
+    // fracLo/fracHi are fractions of a work's cell count in units of 1/0x10000 (0x10000 == whole work).
+    // Where the work begins
+    workLo = (block==0) ? 0 : channelWorkRange[block-1].workHi; // start where predecessor ends
+    fracLo = (block==0) ? 0 : channelWorkRange[block-1].fracHi + 1;
+    // If the predecessor ended on the work boundary, then we step to the beginning of the next work.
+    // This ensures we never have empty parts.
+    if (fracLo == 0x10000) {
+      workLo++;
+      fracLo = 0;
+    }
+    struct ncclSymkDevWork const& dw = devWork[workLo];
+    indexLo = ((fracLo * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell;
+    // Where the work ends: indexHi indexes into work `workHi`, so scale/clamp by that work's size.
+    workHi = channelWorkRange[block].workHi;
+    fracHi = channelWorkRange[block].fracHi + 1;
+    struct ncclSymkDevWork const& dwHi = devWork[workHi];
+    indexHi = min(((fracHi * divUp(dwHi.nElts, EltPerCell)) >> 16) * EltPerCell, dwHi.nElts);
  }

  template<typename T>
-  __device__ T* multimemPtr(T* selfPtr) {
-    return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(selfPtr) + offsetMc);
+    __device__ void getWorkRangeFused(int blockIdx, int w,
+                                      int& block, int& nBlocks, size_t& indexLo, size_t& indexHi) {
+    constexpr int EltPerCell = NCCL_SYM_KERNEL_CELL_SIZE / sizeof(T);
+    struct ncclSymkDevWork const& dw = devWork[w];
+    uint32_t fracLo, fracHi;
+    int lastBlock;
+
+    block = blockIdx - dw.sChannelId;
+    nBlocks = dw.nChannels;
+    lastBlock = dw.sChannelId+dw.nChannels-1;
+
+    // Where the work begins
+    fracLo = (dw.sChannelId==0) ? 0 : ((channelWorkRange[dw.sChannelId-1].fracHi + 1) & 0xFFFF);
+    indexLo = ((fracLo * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell;
+    fracHi = (channelWorkRange[lastBlock].workHi == w) ? channelWorkRange[lastBlock].fracHi + 1 : 0x10000;
+    indexHi = min(((fracHi * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell, dw.nElts);
  }

-  __device__  void barrierArrive(ncclCoopCta cta, bool release) {
-    cta.sync();
-    #if __CUDA_ARCH__ < 700
-      if (release) {
-        if (cta.self() == 0) __threadfence_system();
-        cta.sync();
-      }
-    #endif
-    if (flags & ncclSymPrims_UseMultimem) {
-    #if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
-      if (cta.self() == 0) {
-        uint32_t* inbox = &multimemPtr(base)->barInboxMc[block];
-        if (release) {
-          asm volatile("multimem.red.release.sys.add.u32 [%0],1;" :: "l"(inbox));
+  template<typename T, typename Fn>
+    __device__ void forEachWork(Fn const& fn) {
+      uint16_t workLo, workHi;
+      size_t indexLo, indexHi;
+
+      getWorkRange<T>(blockIdx.x, workLo, indexLo, workHi, indexHi);
+
+      size_t currentIndexLo = indexLo;
+      #pragma unroll 1
+      for (int w = workLo; w <= workHi; w++) {
+        struct ncclSymkDevWork const& dw = devWork[w];
+        size_t const& nAllElts = dw.nElts;
+        size_t currentIndexHi;
+        int block, nBlocks;
+        if (blockIdx.x >= dw.sChannelId && blockIdx.x < dw.sChannelId + dw.nChannels) {
+          getWorkRangeFused<T>(blockIdx.x, w, block, nBlocks, currentIndexLo, currentIndexHi);
        } else {
-          asm volatile("multimem.red.relaxed.sys.add.u32 [%0],1;" :: "l"(inbox));
+          currentIndexHi = (w < workHi) ? nAllElts : indexHi;
+          block = 0;
+          nBlocks = 1;
        }
+
+        fn(block, nBlocks, currentIndexHi - currentIndexLo, nAllElts,
+           ncclSymPtr<T>(dw.inputWin, dw.inputOff) + currentIndexLo,
+           ncclSymPtr<T>(dw.outputWin, dw.outputOff) + currentIndexLo);
+
+        currentIndexLo = 0;
      }
-    #endif
-    } else {
-      int r = cta.self();
-      if (r != rank && r < nRanks) {
-        uint32_t* inbox = &peerPtr(r, base)->barInboxPerPeer[block*nRanks + rank];
-        #if __CUDA_ARCH__ >= 700
-          if (release) {
-            asm volatile("st.release.sys.u32 [%0],%1;" :: "l"(inbox), "r"(barEpoch+1));
-          } else {
-            asm volatile("st.relaxed.sys.u32 [%0],%1;" :: "l"(inbox), "r"(barEpoch+1));
-          }
-        #else
-          if (release) {
-            __atomic_store_n(inbox, barEpoch + 1, __ATOMIC_RELEASE);
-          } else {
-            __atomic_store_n(inbox, barEpoch + 1, __ATOMIC_RELAXED);
-          }
-          // asm volatile("st.volatile.u32 [%0],%1;" :: "l"(inbox), "r"(barEpoch+1));
-        #endif
-      }
-    }
  }

-  __device__  void barrierWait(ncclCoopCta cta, bool acquire) {
-    if (flags & ncclSymPrims_UseMultimem) {
-    #if __CUDA_ARCH__ >= 900
-      if (cta.self() == 0) {
-        uint32_t* inbox = &base->barInboxMc[block];
-        while (true) {
-          uint32_t got;
-          if (acquire) {
-            asm volatile("ld.acquire.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox));
-          } else {
-            asm volatile("ld.relaxed.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox));
-          }
-          if (got-(barEpoch+nRanks) <= uint32_t(-1)>>1) break;
-        }
-        barEpoch += nRanks;
-      }
-    #endif
-    } else {
-      int r = cta.self();
-      if (r != rank && r < nRanks) {
-        uint32_t* inbox = &base->barInboxPerPeer[block*nRanks + r];
-        while (true) {
-          uint32_t got;
-          #if __CUDA_ARCH__ >= 700
-            if (acquire) {
-              asm volatile("ld.acquire.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox));
-            } else {
-              asm volatile("ld.relaxed.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox));
-            }
-          #else
-            if (acquire) {
-              got = __atomic_load_n(inbox, __ATOMIC_ACQUIRE);
-            } else {
-              got = __atomic_load_n(inbox, __ATOMIC_RELAXED);
-            }
-            // asm volatile("ld.volatile.u32 %0,[%1];" : "=r"(got) : "l"(inbox));
-          #endif
-          if (got-(barEpoch+1) <= uint32_t(-1)>>1) break;
-        }
-      }
-      #if __CUDA_ARCH__ < 700
-        if (acquire) {
-          cta.sync();
-          if (cta.self() == 0) __threadfence();
-        }
-      #endif
-      barEpoch += 1;
-    }
-    cta.sync();
-  }
+  template<typename T, typename Fn>
+    __device__ void singleWork(Fn const& fn) {
+      uint16_t w;
+      size_t indexLo, indexHi;

-  __device__ void endLL(ncclCoopCta cta) {
-    if (__builtin_expect(llEpoch >= -2u, false)) {
-      cta.sync();
-      uint4* buf = ncclSymDevBase_getLLBuf(base, nRanks, block, llEpoch);
-      int epochSize = ncclSymLLEpochSize(nRanks);
-      #pragma unroll 4
-      for (int i=cta.self(); i*16 < epochSize; i += cta.count()) {
-        buf[i] = uint4{0, 0, 0, 0};
-      }
-    }
-    cta.sync();
-    llEpoch += (llEpoch == -1u) ? 3 : 1;
-  }
+      getWorkRange<T>(blockIdx.x, w, indexLo, w, indexHi);

-  template<typename T>
-  __device__ void sendLL(int peer, int slot, T val) {
-    union { T tmp; uint32_t u32[divUp(sizeof(T),8)][2]; };
-    tmp = val;
-    uint4* buf = ncclSymDevBase_getLLBuf(peerPtr(peer, base), nRanks, block, llEpoch) + slot;
-    #pragma unroll
-    for (int u=0; u < divUp(sizeof(T),8); u++) {
-      using Vec = uint32_t __attribute__((ext_vector_type(4)));
-      Vec i4;
-      i4[0] = u32[u][0];
-      i4[1] = llEpoch;
-      i4[2] = u32[u][1];
-      i4[3] = llEpoch;
-#if defined(__gfx950__)
-      asm volatile ("flat_store_dwordx4 %0, %1 sc0 sc1 nt" :: "v"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "v"(i4));
-#else
-      __builtin_nontemporal_store(i4, (Vec*)(buf + ncclSymLLMaxSlots(sizeof(T))*u));
-#endif
-      // asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch));
-    }
-  }
+      struct ncclSymkDevWork const& dw = devWork[w];

-  template<typename T>
-  __device__ void bcastLL(int slot, T val) {
-    if (flags & ncclSymPrims_UseMultimem) {
-      union { T tmp; uint32_t u32[divUp(sizeof(T),8)][2]; };
-      tmp = val;
-      uint4* bufmc = ncclSymDevBase_getLLBuf(multimemPtr(base), nRanks, block, llEpoch) + slot;
-      #pragma unroll
-      for (int u=0; u < divUp(sizeof(T),8); u++) {
-        using Vec = uint32_t __attribute__((ext_vector_type(4)));
-        Vec i4;
-        i4[0] = u32[u][0];
-        i4[1] = llEpoch;
-        i4[2] = u32[u][1];
-        i4[3] = llEpoch;
-#if defined(__gfx950__)
-        asm volatile ("flat_store_dwordx4 %0, %1 sc0 sc1 nt" :: "v"(bufmc + ncclSymLLMaxSlots(sizeof(T))*u), "v"(i4));
-#else
-        __builtin_nontemporal_store(i4, (Vec*)(bufmc + ncclSymLLMaxSlots(sizeof(T))*u));
-#endif
-        // asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(bufmc + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch));
-      }
-    } else {
-      union { T tmp; uint32_t u32[divUp(sizeof(T),8)][2]; };
-      tmp = val;
-      uint4* buf0 = ncclSymDevBase_getLLBuf(peerPtr(0, base), nRanks, block, llEpoch) + slot;
-      int dr = 0;
-      int r = rank;
-      #pragma unroll 1
-      for (; dr+8 <= nRanks; dr += 8) {
-        #pragma unroll
-        for (int ur=0; ur < 8; ur++) {
-          uint4* buf = add4G(buf0, r*stride4G);
-          #pragma unroll
-          for (int u=0; u < divUp(sizeof(T),8); u++) {
-            using Vec = uint32_t __attribute__((ext_vector_type(4)));
-            Vec i4;
-            i4[0] = u32[u][0];
-            i4[1] = llEpoch;
-            i4[2] = u32[u][1];
-            i4[3] = llEpoch;
-#if defined(__gfx950__)
-            asm volatile ("flat_store_dwordx4 %0, %1 sc0 sc1 nt" :: "v"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "v"(i4));
-#else
-            __builtin_nontemporal_store(i4, (Vec*)((buf + ncclSymLLMaxSlots(sizeof(T))*u)));
-#endif
-            // asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch));
-          }
-          r += 1;
-          if (r == nRanks) r = 0;
-        }
-      }
-      #pragma unroll
-      for (int ur=0; ur < 8; ur++, dr++) {
-        if (dr == nRanks) break;
-        uint4* buf = add4G(buf0, r*stride4G);
-        #pragma unroll
-        for (int u=0; u < divUp(sizeof(T),8); u++) {
-          using Vec = uint32_t __attribute__((ext_vector_type(4)));
-          Vec i4;
-          i4[0] = u32[u][0];
-          i4[1] = llEpoch;
-          i4[2] = u32[u][1];
-          i4[3] = llEpoch;
-#if defined(__gfx950__)
-          asm volatile ("flat_store_dwordx4 %0, %1 sc0 sc1 nt" :: "v"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "v"(i4));
-#else
-          __builtin_nontemporal_store(i4, (Vec*)(buf + ncclSymLLMaxSlots(sizeof(T))*u));
-#endif
-          // asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch));
-        }
-        r += 1;
-        if (r == nRanks) r = 0;
-      }
-    }
-  }
-
-  template<int nSlotsMin, int nSlotsMax, typename T>
-  __device__ void recvLL(int slot0, int nSlots, int stride, T(&elts)[nSlotsMax]) {
-    uint4* buf = ncclSymDevBase_getLLBuf(base, nRanks, block, llEpoch) + slot0;
-    uint4 tmp[nSlotsMax][divUp(sizeof(T),8)];
-    //int spins=0;
-    while (true) {
-      #pragma unroll
-      for (int u=0; u < nSlotsMax; u++) {
-        if (u < nSlotsMin || u < nSlots) {
-          #pragma unroll
-          for (int v=0; v < divUp(sizeof(T),8); v++) {
-            tmp[u][v] = *(buf + u * stride + v * ncclSymLLMaxSlots(sizeof(T)));
-            // asm volatile("ld.volatile.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(tmp[u][v].x), "=r"(tmp[u][v].y), "=r"(tmp[u][v].z), "=r"(tmp[u][v].w) : "l"(buf + u*stride + v*ncclSymLLMaxSlots(sizeof(T))));
-          }
-        }
-      }
-      bool okAll = true;
-      #pragma unroll
-      for (int u=0; u < nSlotsMax; u++) {
-        #pragma unroll
-        for (int v=0; v < divUp(sizeof(T),8); v++) {
-          if (u < nSlotsMin || u < nSlots) {
-            bool ok = tmp[u][v].y == llEpoch &&
-                      tmp[u][v].w == llEpoch;
-            okAll &= ok;
-          }
-        }
-      }
-      if (__builtin_expect(okAll, true)) break;
-      //if (spins++ == 10<<20) spins=0;
-    }
-    #pragma unroll
-    for (int u=0; u < nSlotsMax; u++) {
-      if (nSlotsMin <= u && u == nSlots) break;
-      union { T val; uint32_t u32[divUp(sizeof(T),8)][2]; };
-      #pragma unroll
-      for (int v=0; v < divUp(sizeof(T),8); v++) {
-        u32[v][0] = tmp[u][v].x;
-        u32[v][1] = tmp[u][v].z;
-      }
-      elts[u] = val;
-    }
-  }
-
-  template<typename Pack, typename T, typename Red, int Unroll=8>
-  __device__ Pack recvReduceLL(int slot, int stride, Red red) {
-    using Acc = typename Red::EltType;
-    using AccPack = BytePack<sizeof(Pack)*sizeof(Acc)/sizeof(T)>;
-    AccPack acc;
-    bool first = true;
-    int r = 0;
-    #pragma unroll 1
-    for (; r+Unroll <= nRanks; r += Unroll) {
-      Pack got[Unroll];
-      this->template recvLL</*Min=*/Unroll>(slot + r*stride, Unroll, stride, got);
-      AccPack acc0 = applyCast<T, Acc>(got[0]);
-      acc = first ? acc0 : applyReduce(red, acc, acc0);
-      first = false;
-      #pragma unroll
-      for (int i=1; i < Unroll; i++) acc = applyReduce(red, acc, applyCast<T, Acc>(got[i]));
-    }
-    if (r < nRanks) {
-      Pack got[Unroll];
-      this->template recvLL</*Min=*/1>(slot + r*stride, nRanks-r, stride, got);
-      AccPack acc0 = applyCast<T, Acc>(got[0]);
-      acc = first ? acc0 : applyReduce(red, acc, acc0);
-      #pragma unroll
-      for (int i=1; i < Unroll-1; i++) {
-        if (r+i < nRanks) acc = applyReduce(red, acc, applyCast<T, Acc>(got[i]));
-      }
-    }
-    return applyCast<Acc, T>(acc);
-  }
-
-  template<typename T>
-  __device__ T recvLL(int slot) {
-    T one[1];
-    this->template recvLL<1, 1, T>(slot, 1, 0, one);
-    return one[0];
-  }
-
-  template<typename Coop, typename T>
-  __device__ void coopRecvLL(Coop coop, int slot0, int nSlots, T* dst) {
-    int me = coop.self();
-    if (me < nSlots) {
-      uint4* buf = ncclSymDevBase_getLLBuf(base, nRanks, block, llEpoch) + slot0 + me;
-      uint4 got[divUp(sizeof(T), 8)];
-      //int spins=0;
-      #pragma unroll 1
-      while (true) {
-        #pragma unroll
-        for (int u=0; u < divUp(sizeof(T), 8); u++) {
-          got[u] = *((buf + u * ncclSymLLMaxSlots(sizeof(T))));
-          // asm volatile("ld.volatile.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(got[u].x), "=r"(got[u].y), "=r"(got[u].z), "=r"(got[u].w) : "l"(buf + u*ncclSymLLMaxSlots(sizeof(T))));
-        }
-        bool ok = true;
-        #pragma unroll
-        for (int u=0; u < divUp(sizeof(T), 8); u++) {
-          ok &= got[u].y == llEpoch;
-          ok &= got[u].w == llEpoch;
-        }
-        if (__builtin_expect(ok, true)) break;
-        //if (++spins == 10<<20) { spins=0; printf("r=%d LL spin @ ix=%d got=%d want=%d\n", rank, slot0+me, got[0].y, llEpoch); }
-      }
-      union { T val; uint32_t u32[divUp(sizeof(T), 8)][2]; };
-      #pragma unroll
-      for (int u=0; u < divUp(sizeof(T), 8); u++) {
-        u32[u][0] = got[u].x;
-        u32[u][1] = got[u].z;
-      }
-      dst[slot0 + me] = val;
-    }
+      fn(indexHi - indexLo, dw.nElts,
+         ncclSymPtr<T>(dw.inputWin, dw.inputOff) + indexLo,
+         ncclSymPtr<T>(dw.outputWin, dw.outputOff) + indexLo);
  }
 };
 }

 template<template<typename> typename Red, typename T, bool nvls>
-struct ncclSymAccumType { using Type = T; };
+struct ncclSymkAccumType { using Type = T; };

 // Only Red's whose opArg is invariant w.r.t. the datatype can have a different
 // accumulator type. At the moment this excludes integer min/max, sumpostdiv,
 // and premulsum.
-template<> struct ncclSymAccumType<FuncSum, __half, false> { using Type = float; };
+template<> struct ncclSymkAccumType<FuncSum, __half, false> { using Type = float; };
 #if defined(__CUDA_BF16_TYPES_EXIST__)
-template<> struct ncclSymAccumType<FuncSum, __nv_bfloat16, false> { using Type = float; };
+template<> struct ncclSymkAccumType<FuncSum, __nv_bfloat16, false> { using Type = float; };
 #endif
 #if defined(__CUDA_FP8_TYPES_EXIST__)
-template<> struct ncclSymAccumType<FuncSum, __nv_fp8_e4m3, false> { using Type = float; };
-template<> struct ncclSymAccumType<FuncSum, __nv_fp8_e5m2, false> { using Type = float; };
+template<> struct ncclSymkAccumType<FuncSum, __nv_fp8_e4m3, false> { using Type = float; };
+template<> struct ncclSymkAccumType<FuncSum, __nv_fp8_e5m2, false> { using Type = float; };
 #endif
 #endif
@@ -1,38 +1,39 @@
 // Modification Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT 

-#include "symmetric.h"
+#include "sym_kernels.h"
 #include "symmetric/kernel.h"
 #include "symmetric/primitives.h"

 template<int BytePerPack, int UnrollPacks, int UnrollPeers, typename T, typename Red>
 static __device__ void reduceDeep(
-    ncclSymPrims& prim, int tn, int t, bool waitNeeded,
-    Red red, char* inputRank0, char* outputHere, int32_t nIters
+    ncclSymkArgsHandler const& handler, int tn, int t,
+    bool waitNeeded, ncclLsaBarrierSession<ncclCoopCta>& bar,
+    Red red, ncclSymPtr<char> input, ncclSymPtr<char> output, int32_t nIters
  ) {
  using Pack = BytePack<BytePerPack>;
  using Acc = typename Red::EltType;
  using AccPack = BytePack<BytePerPack*sizeof(Acc)/sizeof(T)>;

+  ncclTeam world = ncclTeamWorld(handler.comm);
  int wn = tn/WARP_SIZE;
  int w = t/WARP_SIZE;
  int lane = t%WARP_SIZE;
-  int const& rank = prim.rank;
-  int const& nRanks = prim.nRanks;
-  uint32_t const& stride4G = prim.stride4G;
-  Pack* inpRank0 = (Pack*)inputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
-  Pack* outHere = (Pack*)outputHere + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
+  int const& rank = handler.comm.rank;
+  int const& nRanks = handler.comm.nRanks;
+  ncclSymPtr<Pack> inpPacks = (ncclSymPtr<Pack>)input + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
+  ncclSymPtr<Pack> outPacks = (ncclSymPtr<Pack>)output + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
  Pack acc0[UnrollPacks];

  nIters -= w;
  if (0 < nIters) {
    #pragma unroll
    for (int u=0; u < UnrollPacks; u++) {
-      acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE];
+      acc0[u] = inpPacks.peerPtr(world, rank)[u*WARP_SIZE];
    }
  }

-  if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed);

  if (0 < nIters) {
    while (true) {
@@ -42,7 +43,7 @@ static __device__ void reduceDeep(
      { Pack tmp1[UnrollPacks];
        #pragma unroll
        for (int u=0; u < UnrollPacks; u++) {
-          tmp1[u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE];
+          tmp1[u] = inpPacks.peerPtr(world, r)[u*WARP_SIZE];
        }
        #pragma unroll
        for (int u=0; u < UnrollPacks; u++) {
@@ -68,7 +69,7 @@ static __device__ void reduceDeep(
            if (partial && ur!=0 && dr+ur == nRanks) break;
            #pragma unroll UnrollPacks
            for (int u=0; u < UnrollPacks; u++) {
-              tmp1[ur][u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE];
+              tmp1[ur][u] = inpPacks.peerPtr(world, r)[u*WARP_SIZE];
            }
            r += 1;
            if (r == nRanks) r = 0;
@@ -88,17 +89,17 @@ static __device__ void reduceDeep(
      for (int u=0; u < UnrollPacks; u++) acc0[u] = applyCast<Acc, T>(acc1[u]);

      #pragma unroll UnrollPacks
-      for (int u=0; u < UnrollPacks; u++) outHere[u*WARP_SIZE] = acc0[u];
+      for (int u=0; u < UnrollPacks; u++) outPacks.localPtr()[u*WARP_SIZE] = acc0[u];

-      inpRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE;
-      outHere += intptr_t(wn)*UnrollPacks*WARP_SIZE;
+      inpPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE;
+      outPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE;
      nIters -= wn;
      if (nIters <= 0) break;

      // Load data for next iteration.
      #pragma unroll
      for (int u=0; u < UnrollPacks; u++) {
-        acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE];
+        acc0[u] = inpPacks.peerPtr(world, rank)[u*WARP_SIZE];
      }
    }
  }
@@ -106,20 +107,22 @@ static __device__ void reduceDeep(

 template<int UnrollPeers, typename Red, typename T>
 static __device__ void reduceEnds(
-    ncclSymPrims& prim, int tn, int t, Red red,
-    T* inputRank0, T* outputHere, size_t nElts, uint32_t nPreElts, size_t nSufElts
+    ncclSymkArgsHandler const& handler, int tn, int t, Red red,
+    ncclSymPtr<T> input, ncclSymPtr<T> output,
+    size_t nElts, uint32_t nPreElts, size_t nSufElts
  ) {
  using Acc = typename Red::EltType;

-  int const& rank = prim.rank;
-  int const& nRanks = prim.nRanks;
-  uint32_t const& stride4G = prim.stride4G;
-  BytePack<sizeof(T)>* inpRank0 = (BytePack<sizeof(T)>*)inputRank0;
-  BytePack<sizeof(T)>* outHere = (BytePack<sizeof(T)>*)outputHere;
+  ncclTeam world = ncclTeamWorld(handler.comm);
+  int const& rank = handler.comm.rank;
+  int const& nRanks = handler.comm.nRanks;
+
+  ncclSymPtr<BytePack<sizeof(T)>> inpPacks = (ncclSymPtr<BytePack<sizeof(T)>>)input;
+  ncclSymPtr<BytePack<sizeof(T)>> outPacks = (ncclSymPtr<BytePack<sizeof(T)>>)output;
  #pragma unroll 1
  for (size_t i = t; i < nPreElts+nSufElts; i += tn) {
    size_t elt = i < nPreElts ? i : nElts-nSufElts-nPreElts+i;
-    BytePack<sizeof(T)> acc0 = *add4G(inpRank0+elt, rank*stride4G);
+    BytePack<sizeof(T)> acc0 = inpPacks.peerPtr(world, rank)[elt];
    BytePack<sizeof(Acc)> acc1;
    BytePack<sizeof(T)> tmp[UnrollPeers];
    int dr = 1;
@@ -138,7 +141,7 @@ static __device__ void reduceEnds(
        #pragma unroll
        for (int u=0; u < UnrollPeers-partial; u++) {
          if (partial && u!=0 && dr+u == nRanks) break;
-          tmp[u] = *add4G(inpRank0+elt, r*stride4G);
+          tmp[u] = inpPacks.peerPtr(world, r)[elt];
          r += 1;
          if (r == nRanks) r = 0;
        }
@@ -155,26 +158,25 @@ static __device__ void reduceEnds(
    }

    acc0 = applyCast<Acc, T>(acc1);
-    outHere[elt] = acc0;
+    outPacks.localPtr()[elt] = acc0;
  }
 }

 template<typename Red, typename T>
 static __device__ void reduce(
-    ncclSymPrims& prim, int tn, int t, bool waitNeeded,
-    Red red, T* input, T* output, size_t nElts
+    ncclSymkArgsHandler const& handler, int tn, int t, int nBlocks,
+    bool waitNeeded, ncclLsaBarrierSession<ncclCoopCta>& bar,
+    Red red, ncclSymPtr<T> input, ncclSymPtr<T> output, size_t nElts
  ) {
-  int nRanks = prim.nRanks;
-  int nBlocks = prim.nBlocks;
-  // Mpve input to rank=0
-  input = prim.peerPtr(0, input);
+  int const& nRanks = handler.comm.nRanks;
+  int const& nRanks_rcp32 = handler.nRanks_rcp32;
+  uint32_t nBlocks_rcp32 = nccl::utility::idivRcp32_upto64(nBlocks);
+  uint32_t nRanks_nBlocks_rcp32 = nccl::utility::imulRcp32(nRanks, nRanks_rcp32, nBlocks, nBlocks_rcp32);

-  uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
-  uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
-  uint32_t alignment = uint32_t(inputUptr - outputUptr);
+  uint32_t alignment = uint32_t(input.offset - output.offset);
  size_t nBytes = nElts*sizeof(T);

-  uint32_t nPreBytes = (16u - inputUptr)%16u;
+  uint32_t nPreBytes = (16u - input.offset)%16u;
  nPreBytes = min((size_t)nPreBytes, nBytes);
  uintptr_t cursor = nPreBytes;

@@ -184,12 +186,12 @@ static __device__ void reduce(
    constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2;
    constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
    uint32_t chunks = (nBytes-cursor)/BytePerChunk;
-    chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32);
+    chunks -= imodFast32(chunks, nRanks*nBlocks, nRanks_nBlocks_rcp32);
    if (chunks != 0) {
      uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
      reduceDeep<BytePerPack, UnrollPacks, UnrollPeers, T>(
-        prim, tn, t, waitNeeded, red,
-        (char*)input + cursor, (char*)output + cursor,
+        handler, tn, t, waitNeeded, bar, red,
+        (ncclSymPtr<char>)input + cursor, (ncclSymPtr<char>)output + cursor,
        chunks*MinWarpPerBlock
      );
      cursor = cursorAfter;
@@ -201,12 +203,12 @@ static __device__ void reduce(
    constexpr int BytePerPack = 4, UnrollPacks = 4, UnrollPeers = 4;
    constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
    uint32_t chunks = (nBytes-cursor)/BytePerChunk;
-    chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32);
+    chunks -= imodFast32(chunks, nRanks*nBlocks, nRanks_nBlocks_rcp32);
    if (chunks != 0) {
      uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
      reduceDeep<(sizeof(T) <= BytePerPack ? BytePerPack : 0), UnrollPacks, UnrollPeers, T>(
-        prim, tn, t, waitNeeded, red,
-        (char*)input + cursor, (char*)output + cursor,
+        handler, tn, t, waitNeeded, bar, red,
+        (ncclSymPtr<char>)input + cursor, (ncclSymPtr<char>)output + cursor,
        chunks*MinWarpPerBlock
      );
      cursor = cursorAfter;
@@ -214,42 +216,47 @@ static __device__ void reduce(
    }
  }

-  if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed);

  constexpr int UnrollPeers = 8;
  size_t nSufElts = (nBytes-cursor)/sizeof(T);
-  reduceEnds<UnrollPeers>(prim, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts);
+  reduceEnds<UnrollPeers>(handler, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts);
 }

-
 template<template<typename> typename Red, typename T>
-__device__ __forceinline__ void ncclSymRun_ReduceScatter_LD(ncclSymDevArgs const* args) {
-  ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier);
-  Red<typename ncclSymAccumType<Red, T, /*nvls=*/false>::Type> red(args->redOpArg);
+__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LD(ncclSymkDevWorkArgs const* args) { // Entry point of the LD ReduceScatter variant; everything is derived from args.
+  ncclSymkArgsHandler handler{args}; // Wrapper over args exposing comm, devWork and forEachWork (all used below).
+  ncclLsaBarrierSession<ncclCoopCta> bar{ // Barrier session for this CTA, built from the comm, the LSA team tag and blockIdx.x.
+    ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x
+  };
+  Red<typename ncclSymkAccumType<Red, T, /*nvls=*/false>::Type> red(handler.devWork->redOpArg); // Reduction functor built from the work's redOpArg; accumulator type chosen with nvls=false.
+  int const& rank = handler.comm.rank; // This rank's index; selects the input slice passed to reduce().

-  // Round robin warps over blocks.
-  int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
-                    prim.block, prim.nBlocks,
-                    threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
-  int tn = prim.nBlocks*blockDim.x;
+  bar.arrive(ncclCoopCta(), cuda::memory_order_relaxed); // Arrive only (relaxed); the matching wait is deferred into reduce() through waitNeeded.

-  prim.barrierArrive(ncclCoopCta(), /*release=*/false);
-  //prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  bool waitNeeded = true; // True until the first work item has been reduced.
+  handler.forEachWork<T>( // Presumably invokes the lambda once per work item - confirm in ncclSymkArgsHandler.
+      [&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts,
+                    ncclSymPtr<T> input, ncclSymPtr<T> output) {
+        // Round robin warps over blocks.
+        int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
+                          block, nBlocks,
+                          threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
+        int tn = nBlocks*blockDim.x; // Thread count across the nBlocks blocks (blockDim.x threads each).

-  reduce(prim, tn, t, /*waitNeeded=*/true, red, (T*)args->input + prim.rank*args->nElts, (T*)args->output, args->nElts);
+        reduce(handler, tn, t, nBlocks, waitNeeded, bar, red, input + rank*nElts, output, nElts); // Reduce the input slice starting at rank*nElts into output.

-  prim.barrierArrive(ncclCoopCta(), /*release=*/false);
-  prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+        waitNeeded = false; // Later invocations skip the barrier wait.
+      }
+    );
+
+  bar.sync(ncclCoopCta(), cuda::memory_order_relaxed); // Closing barrier; replaces the former barrierArrive + barrierWait pair.
 }

-
 template<typename Red, typename T>
 static __device__ void reduceMultimem(
-    ncclSymPrims& prim, int tn, int t, Red red, T* input, T* output, size_t nElts
+    int tn, int t, Red red, T* input, T* output, size_t nElts
  ) {
-  // Mpve input to multimem
-  input = prim.multimemPtr(input);
-
  uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
  uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
  size_t nBytes = nElts*sizeof(T);
@@ -294,41 +301,52 @@ static __device__ void reduceMultimem(
    uintptr_t cursor = i < nPreBytes ? i : nBytes-nSufBytes+(i-nPreBytes);
    BytePack<sizeof(T)> val = applyLoadMultimem<Red, sizeof(T)>(red, inputUptr + cursor);
    *reinterpret_cast<BytePack<sizeof(T)>*>(outputUptr + cursor) = val;
-    cursor += tn*sizeof(T);
  }
 }

 template<template<typename> typename Red, typename T>
-__device__ __forceinline__ void ncclSymRun_ReduceScatter_LDMC(ncclSymDevArgs const* args) {
-  ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem);
-  Red<typename ncclSymAccumType<Red, T, /*nvls=*/true>::Type> red(args->redOpArg);
+__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LDMC(ncclSymkDevWorkArgs const* args) { // Entry point of the LDMC ReduceScatter variant; input is read through a multimem pointer (see below).
+  ncclSymkArgsHandler handler{args}; // Wrapper over args exposing comm, devWork and forEachWork (all used below).
+  ncclLsaBarrierSession<ncclCoopCta> bar{ // Barrier session for this CTA, constructed with the multimem flag set.
+    ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x, /*multimem=*/true
+  };
+  Red<typename ncclSymkAccumType<Red, T, /*nvls=*/true>::Type> red(handler.devWork->redOpArg); // Reduction functor built from the work's redOpArg; accumulator type chosen with nvls=true.

-  // Round robin warps over blocks.
-  int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
-                    prim.block, prim.nBlocks,
-                    threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
-  int tn = prim.nBlocks*blockDim.x;
+  int const& rank = handler.comm.rank; // This rank's index; selects the input slice below.
+  auto const& multimem = handler.comm.lsaMultimem; // Multimem handle of the comm, consumed by input.multimemPtr() below.

-  prim.barrierArrive(ncclCoopCta(), /*release=*/false);
-  prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  bar.sync(ncclCoopCta(), cuda::memory_order_relaxed); // Opening barrier; replaces the former barrierArrive + barrierWait pair.

-  reduceMultimem(prim, tn, t, red, (T*)args->input + prim.rank*args->nElts, (T*)args->output, args->nElts);
+  handler.forEachWork<T>( // Presumably invokes the lambda once per work item - confirm in ncclSymkArgsHandler.
+      [&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts,
+                    ncclSymPtr<T> input, ncclSymPtr<T> output) {
+        // Round robin warps over blocks.
+        int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
+                          block, nBlocks,
+                          threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
+        int tn = nBlocks*blockDim.x; // Thread count across the nBlocks blocks (blockDim.x threads each).

-  prim.barrierArrive(ncclCoopCta(), /*release=*/false);
-  prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+        reduceMultimem(tn, t, red, input.multimemPtr(multimem) + rank*nElts, output.localPtr(), nElts); // Input: multimem pointer offset to this rank's slice (rank*nElts); output: local pointer.
+      }
+    );
+
+  bar.sync(ncclCoopCta(), cuda::memory_order_relaxed); // Closing barrier; replaces the former barrierArrive + barrierWait pair.
 }

 // T is user type, EltType is the most aligned type
 template<typename T, typename Red, typename EltType>
-__device__ __forceinline__ void ncclSymRun_ReduceScatter_LL_body(
-    ncclSymPrims &prim, Red red, EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts) {
+__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LL_body(
+    ncclSymkArgsHandler& handler, ncclLLA2ASession<ncclCoopCta>& lla2a,
+    Red red, EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts) {
  using Pack = BytePack<8>;
+  using Acc = typename Red::EltType;
+  using AccPack = BytePack<8*sizeof(Acc)/sizeof(T)>;
  constexpr int EltPerPack = 8/sizeof(EltType);

-  int nRanks = prim.nRanks;
-  int rank = prim.rank;
+  int const& nRanks = handler.comm.nRanks;
+  int const& rank = handler.comm.rank;
  int t = threadIdx.x;
-  int tn = ncclSymMaxThreads;
+  constexpr int tn = ncclSymkMaxThreads;
  ncclCoopCta cta;

  #pragma unroll 1
@@ -342,17 +360,25 @@ __device__ __forceinline__ void ncclSymRun_ReduceScatter_LL_body(
    #pragma unroll 1
    for (int i = t; i < nRanks*nIterPacks; i += tn) {
      Pack got = loadPack<Pack>(input + peer*nStrideElts, pack*EltPerPack, nElts);
-      prim.sendLL(peer, rank*nIterPacks + pack, got);
+      lla2a.send(peer, rank*nIterPacks + pack, got);
      peer += tn_div_nPacks;
      pack += tn_mod_nPacks;
      if (nIterPacks <= pack) { peer += 1; pack -= nIterPacks; }
    }

    if (t < nIterPacks) {
-      Pack got = prim.template recvReduceLL<Pack, T>(t, nIterPacks, red);
-      storePack(output, t*EltPerPack, nElts, got);
+      AccPack got = lla2a.template recvReduce</*Unroll=*/8, Pack>(
+        /*slotStart=*/t, /*slotCount=*/nRanks, /*slotStride=*/nIterPacks,
+        /*eltToAcc=*/[&] __device__ (Pack x)->AccPack {
+          return applyCast<T, Acc>(x);
+        },
+        /*reduce=*/[&] __device__ (AccPack a, AccPack b)->AccPack {
+          return applyReduce(red, a, b);
+        }
+      );
+      storePack(output, t*EltPerPack, nElts, applyCast<Acc, T>(got));
    }
-    prim.endLL(cta);
+    lla2a.endEpoch(cta);

    input += tn*EltPerPack;
    output += tn*EltPerPack;
@@ -360,31 +386,34 @@ __device__ __forceinline__ void ncclSymRun_ReduceScatter_LL_body(
    nPacks -= tn;
  }
 }
-template<template<typename> typename Red, typename T>
-__device__ __forceinline__ void ncclSymRun_ReduceScatter_LL(ncclSymDevArgs const* args) {
-  ncclSymPrims prim(args->comm, ncclSymPrims_UseLL);
-  Red<typename ncclSymAccumType<Red, T, /*nvls=*/false>::Type> red(args->redOpArg);

+template<template<typename> typename Red, typename T>
+__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LL(ncclSymkDevWorkArgs const* args) { // Entry point of the LL ReduceScatter variant; delegates to ncclSymkRun_ReduceScatter_LL_body.
+  ncclSymkArgsHandler handler{args}; // Wrapper over args exposing comm, lsaLLA2A, devWork and singleWork (all used below).
+  ncclLLA2ASession<ncclCoopCta> lla2a( // Session object handed to the LL body for its send/recvReduce/endEpoch calls.
+    ncclCoopCta(), handler.comm, ncclTeamLsa(handler.comm), handler.lsaLLA2A, blockIdx.x, ncclSymkMaxThreads
+  );
+  Red<typename ncclSymkAccumType<Red, T, /*nvls=*/false>::Type> red(handler.devWork->redOpArg); // Reduction functor built from the work's redOpArg; accumulator type chosen with nvls=false.
  using Pack = BytePack<8>;
  constexpr int EltPerPack = 8/sizeof(T);
-  int nAllElts = args->nElts;
-  int nAllPacks = divUp(nAllElts, EltPerPack);
-  uint32_t nPackPerBlock, nPackModBlock;
-  idivmodFast32(&nPackPerBlock, &nPackModBlock, nAllPacks, prim.nBlocks, prim.nBlocks_rcp32);
-  int blockPackBegin = prim.block*nPackPerBlock + minval<int>(prim.block, nPackModBlock);
-  int blockPackEnd = blockPackBegin + nPackPerBlock + (prim.block < nPackModBlock ? 1 : 0);
-  int nPacks = blockPackEnd - blockPackBegin;
-  int nElts = nAllElts - blockPackBegin*EltPerPack;
-  nElts = min(nElts, nPacks*EltPerPack);
-  T* input = (T*)args->input + blockPackBegin*EltPerPack;
-  T* output = (T*)args->output + blockPackBegin*EltPerPack;

-  uint32_t lowBits = args->nElts*sizeof(T);
-  lowBits |= (uint32_t)reinterpret_cast<uintptr_t>(args->input);
-  lowBits |= (uint32_t)reinterpret_cast<uintptr_t>(args->output);
-  if (__builtin_expect(lowBits%8 == 0, true)) {
-    ncclSymRun_ReduceScatter_LL_body<T>(prim, red, (Pack*)input, (Pack*)output, nPacks, nPacks, nAllElts/EltPerPack);
-  } else {
-    ncclSymRun_ReduceScatter_LL_body<T>(prim, red, input, output, nElts, nPacks, nAllElts);
-  }
+  handler.singleWork<T>( // Runs the lambda on the work described by the handler (presumably exactly one item - confirm).
+      [&]__device__(int nElts, int nAllElts,
+                    ncclSymPtr<T> inputPtr, ncclSymPtr<T> outputPtr) {
+        int nPacks = divUp(nElts, EltPerPack); // Packs needed to cover nElts, rounded up (EltPerPack elements per 8-byte pack).
+
+        T* input = (T*)inputPtr.localPtr(); // Local pointer of the input ncclSymPtr.
+        T* output = (T*)outputPtr.localPtr(); // Local pointer of the output ncclSymPtr.
+
+        uint32_t lowBits = nElts*sizeof(T); // OR together the byte count and both addresses; low 3 bits clear means all are multiples of 8.
+        lowBits |= (uintptr_t)input;
+        lowBits |= (uintptr_t)output;
+        if (__builtin_expect(lowBits%8 == 0, true)) { // Expected common case: size and both addresses are multiples of 8 bytes.
+          ncclSymkRun_ReduceScatter_LL_body<T>(handler, lla2a, red, (Pack*)input, (Pack*)output, // Aligned path: buffers viewed as Pack, so the counts are given in packs.
+                                               nPacks, nPacks, divUp(nAllElts, EltPerPack));
+        } else { // Unaligned path: element-typed pointers with element counts.
+          ncclSymkRun_ReduceScatter_LL_body<T>(handler, lla2a, red, input, output, nElts, nPacks, nAllElts);
+        }
+      }
+    );
 }
@@ -22,6 +22,9 @@
 #include "profiler.h"
 #include "transport.h"
 #include "register_inline.h"
+#include "ce_coll.h"
+#include "nvtx.h"
+#include "scheduler.h"
 #include "common.h"
 #include "api_trace.h"

@@ -248,6 +251,7 @@ static void finishPlan(struct ncclComm* comm, struct ncclKernelPlan* plan) {
  size_t workBytes = plan->workBytes;
  size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch);

+  if (plan->isSymColl) return;
 #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
 #else
  plan->threadPerBlock = std::max(plan->threadPerBlock, 256 /*NCCL_MIN_NTHREADS*/);
@@ -364,7 +368,6 @@ bool gfx9CheapFenceOff(const ncclDevWorkColl& devWork, bool disabledByPrecheck){

 ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) {
  struct ncclKernelPlanner* planner = &comm->planner;
-  if (planner->isSymColl) return ncclSuccess;
  struct ncclTaskColl *task;
  task = ncclIntruQueueHead(&planner->collTaskQueue);
  while (task != nullptr) {
@@ -448,6 +451,7 @@ next:
 ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo) {
  struct ncclKernelPlanner* planner = &comm->planner;
  planner->persistent = ncclCudaGraphValid(planner->capturingGraph);
+
  // Tasks from the sorter come out ordered size descending.
  struct ncclTaskColl* task = ncclTaskCollSorterDequeueAll(&planner->collSorter);
  // Tasks are assembled by (fn,op,ty) size ascending.
@@ -456,36 +460,8 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
  int fnOpTyIndices[ncclNumFuncs*ncclNumDevRedOps*ncclNumTypes];
  int fnOpTyCount = 0;

-  if (comm->nNodes == 1 && planner->nTasksColl == 1 && planner->nTasksP2p == 0) {
-    void* sendSymPtr;
-    void* recvSymPtr;
-    struct ncclReg* sendReg;
-    struct ncclReg* recvReg;
-    size_t size = task->count*ncclTypeSize(task->datatype);
-    NCCLCHECK(ncclRegFindSymmetric(comm, task->sendbuff, size, &sendSymPtr, &sendReg));
-    NCCLCHECK(ncclRegFindSymmetric(comm, task->recvbuff, size, &recvSymPtr, &recvReg));
-    bool implemented = ncclSymImplemented(task->func, task->opDev.op, task->datatype);
-
-    if (sendReg && recvReg && (sendReg->winFlags & recvReg->winFlags & NCCL_WIN_COLL_SYMMETRIC) && implemented) {
-      enum ncclSymKernelId kernel;
-      int nChannels, nWarps;
-      float estTimeUs = 1.e18;
-      NCCLCHECK(ncclSymPickKernel(comm, task->func, task->opDev.op, task->datatype, task->count, &estTimeUs, &kernel, &nChannels, &nWarps));
-
-      // We should only use symmetric kernel if it beats the asymmetric kernel. But the
-      // perf model accuracy from asymmetric kernels is too inaccurate and reports too high
-      // of a bandwidth. For now just always use symmetric if available.
-      if (kernel != ncclSymKernelId_Count) {
-        task->sendbuff = sendSymPtr;
-        task->recvbuff = recvSymPtr;
-        task->devFuncId = (int)kernel;
-        task->nMaxChannels = nChannels;
-        task->nWarps = nWarps;
-        ncclIntruQueueEnqueue(&planner->collTaskQueue, task);
-        planner->isSymColl = true;
-        return ncclSuccess;
-      }
-    }
+  if (comm->symmetricSupport) {
+    NCCLCHECK(ncclMakeSymmetricTaskList(comm, task, &planner->collSymTaskQueue, &task));
  }

  // Walk the size sorted tasks, binning them by (fn,op,ty).
@@ -677,7 +653,7 @@ static ncclResult_t scheduleCollTasksToPlan(
  size_t trafficBytes[2*2] = {0, 0, 0, 0}; // [collnet][nvls]
  int nChannels[2*2] = {0, 0, 0, 0}; // [collnet][nvls]
  int const nMaxChannels[2*2] = {comm->nChannels, comm->nvlsChannels, // [collnet][nvls]
-                                 comm->nChannels, comm->nvlsChannels};
+                                 comm->nChannels, std::min(comm->nChannels, comm->nvlsChannels)};
  constexpr size_t MinTrafficPerChannel = 16 << 10; // 16K traffic as minimal
  do {
    size_t workBytes = 0;
@@ -888,6 +864,7 @@ static ncclResult_t scheduleCollTasksToPlan(
        }
        proxyOp->eActivationMask = task->eActivationMask;
        proxyOp->incWorkCounter = true;
+        proxyOp->nChannels = nChannels;
        proxyOp->connIndex = 0;
        if (task->protocol == NCCL_PROTO_SIMPLE && task->algorithm == NCCL_ALGO_RING) {
          if (comm->useIntraNet && nBytes > rcclParamIntraNetThreshold()) {
@@ -920,6 +897,8 @@ static ncclResult_t scheduleCollTasksToPlan(
      plan->kernelFn = ncclKerns[ncclGetKernelIndex(comm)].kernelFn;
      plan->kernelSpecialized = ncclKerns[ncclGetKernelIndex(comm)].specialized;
    }
+    // Profiler
+    plan->groupApiEventHandle = task->groupApiEventHandle;

    if (comm->rank == 0) {
      INFO(NCCL_TUNING, "%s: %ld Bytes -> Algo %s proto %s channel{Lo..Hi}={%d..%d}",
@@ -993,8 +972,9 @@ static ncclResult_t addP2pToPlan(
    int sendRank, void* sendAddr, ssize_t sendBytes,
    int recvRank, void* recvAddr, ssize_t recvBytes,
    uint64_t sendOpCount, uint64_t recvOpCount,
-    struct ncclTaskP2p** p2pTasks
+    const int planTotalTasks[], struct ncclTaskP2p** p2pTasks
  ) {
+  ncclResult_t ret = ncclSuccess;
  int connIndex[2] = {1, 1};
  bool selfSend = (sendRank == comm->rank);
  // recv: dir=0, send: dir=1
@@ -1012,6 +992,8 @@ static ncclResult_t addP2pToPlan(
  //replacing line below with ncclP2pChannelBaseForRound(comm, p2pRound, batchP2P) can cause issues due to ncclP2pChannelBaseForRound calling the same routine
  //channel base computed in taskAppend and here must be the same, but in taskAppend the call happens once and is cached for later usage, which is why it wouldn't be consistent with the call below
  uint8_t base = ncclP2pChannelBaseForRound(comm, p2pRound, batchP2PEnableEnv);
+  struct ncclProxyOp proxyOps[2] = {};
+  int nProxyOps = selfSend ? 0 : 2;
  if (comm->p2pNet) {
    for (int dir = 0; dir <= 1; dir++) {
      if (bytes[dir] > rcclParamP2pNetThreshold())
@@ -1072,7 +1054,7 @@ static ncclResult_t addP2pToPlan(
      bool pxnUsed = !ncclPxnDisable(comm) && comm->isAllNvlink && comm->maxLocalRanks > 1;
      if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && (!pxnUsed)) {
        int regFlag = 0;
-        NCCLCHECK(ncclCalloc(&handles[dir], nChannelsMax));
+        NCCLCHECKGOTO(ncclCalloc(&handles[dir], nChannelsMax), ret, cleanup);
        for (int part = 0; part < nChannelsMax; part++) {
          int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, part, nChannelsMax, comm->nNodes);
          struct ncclChannelPeer** channelPeers = comm->channels[channelId].peers;
@@ -1095,7 +1077,7 @@ static ncclResult_t addP2pToPlan(
      void* regAddr = NULL;
      if (conn->conn.flags & (NCCL_P2P_WRITE | NCCL_P2P_READ)) {
        // We require users registering buffers on both sides
-        NCCLCHECK(ncclRegisterP2pIpcBuffer(comm, addrs[dir], bytes[dir], peerRank, &regFlag, &regAddr, &plan->cleanupQueue));
+        NCCLCHECKGOTO(ncclRegisterP2pIpcBuffer(comm, addrs[dir], bytes[dir], peerRank, &regFlag, &regAddr, &plan->cleanupQueue), ret, cleanup);
        if (regFlag) {
          if (dir == 0 && (conn->conn.flags & NCCL_P2P_WRITE)) recvAddr = regAddr;
          else if (dir == 1 && (conn->conn.flags & NCCL_P2P_READ)) sendAddr = regAddr;
@@ -1120,14 +1102,17 @@ static ncclResult_t addP2pToPlan(
    if (p2pTasks[dir]) p2pTasks[dir]->nChannels = nChannels[dir];
  }

-  struct ncclWorkList* workNode = ncclMemoryStackAllocInlineArray<ncclWorkList, ncclDevWorkP2p>(&comm->memScoped, 1);
+  struct ncclWorkList* workNode;
+  workNode = ncclMemoryStackAllocInlineArray<ncclWorkList, ncclDevWorkP2p>(&comm->memScoped, 1);
  workNode->workType = ncclDevWorkTypeP2p;
  workNode->size = sizeof(struct ncclDevWorkP2p);
  ncclIntruQueueEnqueue(&plan->workQueue, workNode);
-  uint32_t workOffset = plan->workBytes;
+  uint32_t workOffset;
+  workOffset = plan->workBytes;
  plan->workBytes += sizeof(struct ncclDevWorkP2p);

-  struct ncclDevWorkP2p* work = (struct ncclDevWorkP2p*)(workNode+1);
+  struct ncclDevWorkP2p* work;
+  work = (struct ncclDevWorkP2p*)(workNode+1);
  work->nP2pChannels = comm->p2pnChannels;
  work->channelBase = base;
  work->nSendChannels = nChannels[1];
@@ -1152,8 +1137,6 @@ static ncclResult_t addP2pToPlan(
  work->recvConnIndex = connIndex[0];
  work->recvOpCount = recvOpCount;

-  struct ncclProxyOp proxyOps[2] = {};
-  int nProxyOps = selfSend ? 0 : 2;
  for (int dir=0; dir < nProxyOps; dir++) {
    struct ncclProxyOp* op = &proxyOps[dir];
    op->root = dir ? sendRank : recvRank;
@@ -1166,6 +1149,7 @@ static ncclResult_t addP2pToPlan(
    op->chunkSize = chunkSize[dir];
    op->reg = netRegistered[dir];
    op->coll = p2pTasks[dir] ? p2pTasks[dir]->func : 0;
+    op->collAPI = p2pTasks[dir] ? p2pTasks[dir]->collAPI : 0;
    op->task.p2p = p2pTasks[dir];
    op->rank = comm->rank;
    op->eActivationMask = p2pTasks[dir] ? p2pTasks[dir]->eActivationMask : 0;
@@ -1178,6 +1162,15 @@ static ncclResult_t addP2pToPlan(
  }

  nChannelsMax = std::max(nChannels[0], nChannels[1]);
+  // Determine how many peers this plan will target concurrently. Make a
+  // simplifying assumption that each task targets a different peer.
+  // Each task is striped across 'nChannelsMax' of 'p2pnChannels' channels.
+  // Each channel runs up to NCCL_MAX_DEV_WORK_P2P_PER_BATCH tasks concurrently.
+  int maxConcurrent;
+  int concurrentTasks[2];
+  maxConcurrent = comm->p2pnChannels / nChannelsMax * NCCL_MAX_DEV_WORK_P2P_PER_BATCH;
+  concurrentTasks[0] = std::min(planTotalTasks[0], maxConcurrent);
+  concurrentTasks[1] = std::min(planTotalTasks[1], maxConcurrent);
  for (int part=0; part < nChannelsMax; part++) {
    int incWorkCounter = -1;
    int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, part, comm->p2pnChannelsPerPeer, comm->nNodes);
@@ -1234,13 +1227,17 @@ static ncclResult_t addP2pToPlan(
        // equal one plus the batch index this p2p settled in.
        proxyOps[dir].channelId = channelId;
        proxyOps[dir].opCount = uint64_t(comm->planner.wipPlan.channels[channelId].nWorkBatchesP2p)<<1 | 1;
-        NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOps[dir]));
-        NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, &proxyOps[dir]));
+        proxyOps[dir].nChannels = nChannels[dir];
+        proxyOps[dir].nPeers = concurrentTasks[dir];
+        NCCLCHECKGOTO(addProxyOpIfNeeded(comm, plan, &proxyOps[dir]), ret, cleanup);
+        NCCLCHECKGOTO(addProfilerProxyOpIfNeeded(comm, plan, &proxyOps[dir]), ret, cleanup);
      }
    }
  }
-
-  return ncclSuccess;
+cleanup:
+  free(handles[0]);
+  free(handles[1]);
+  return ret;
 }

 static int calcP2pChannelCount(size_t totalSize, int minChannels, int maxChannels, size_t minSize, size_t maxSize) {
@@ -1275,6 +1272,8 @@ static ncclResult_t scheduleP2pTasksToPlan(
  // Try to use all channels, but one channel per operation.
  while (nChannelsMin*nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2;

+  // Save the total count of send/recv tasks in the plan
+  int planTotalTasks[2] = {comm->planner.nTasksP2pRecv, comm->planner.nTasksP2pSend};
  while (comm->planner.nTasksP2p != 0) {
    for (int round=0; round < nRanks; round++) {
      int sendRank = comm->p2pSchedule[round].sendRank;
@@ -1306,22 +1305,30 @@ static ncclResult_t scheduleP2pTasksToPlan(
        ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, send);
        ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, recv);
        comm->planner.nTasksP2p -= 2;
+        comm->planner.nTasksP2pSend -= 1;
+        comm->planner.nTasksP2pRecv -= 1;
      } else {
        // Ensure room for worst case of one new batch per channel.
        if (!testBudget(budget, plan->nWorkBatches+nChannelsMax, plan->workBytes + sizeof(struct ncclDevWorkP2p))) {
          return ncclSuccess;
        }
        struct ncclTaskP2p* p2pTasks[2] = { recv, send };
-        NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes, send ? send->opCount : 0, recv ? recv->opCount : 0, p2pTasks));
+        NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes, send ? send->opCount : 0, recv ? recv->opCount : 0, planTotalTasks, p2pTasks));
        if (send != nullptr) {
          ncclIntruQueueDequeue(&peers[sendRank].sendQueue);
+          // Profiler - We can overwrite groupAPI event handles here since all operations here belong to the same group
+          plan->groupApiEventHandle = send->groupApiEventHandle;
          ncclIntruQueueEnqueue(&plan->p2pTaskQueue, send);
          comm->planner.nTasksP2p -= 1;
+          comm->planner.nTasksP2pSend -= 1;
        }
        if (recv != nullptr) {
          ncclIntruQueueDequeue(&peers[recvRank].recvQueue);
+          // Profiler - We can overwrite groupAPI event handles here since all operations here belong to the same group
+          plan->groupApiEventHandle = recv->groupApiEventHandle;
          ncclIntruQueueEnqueue(&plan->p2pTaskQueue, recv);
          comm->planner.nTasksP2p -= 1;
+          comm->planner.nTasksP2pRecv -= 1;
        }
      }
    }
@@ -1372,7 +1379,7 @@ namespace {
 }

 static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* plan) {
-  if (plan->isSymColl) return ncclSuccess;
+  if (plan->isSymColl || plan->isCeColl) return ncclSuccess;

  size_t workBytes = plan->workBytes;
  size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch);
@@ -1544,7 +1551,7 @@ static ncclResult_t hostStreamPlanTask(struct ncclComm* comm, struct ncclKernelP
 }

 static void HIPRT_CB hostStreamPlanCallback(void *plan_) {
-  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  NCCL_NVTX3_FUNC_RANGE;
  struct ncclKernelPlan* plan = (struct ncclKernelPlan*)plan_;
  ncclResult_t result = hostStreamPlanTask(plan->comm, plan);
  if (result != ncclSuccess) {
@@ -1565,6 +1572,9 @@ static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback*
      CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
    }
  }
+  if (plan->isSymColl) {
+    free(plan->kernelSymArgs);
+  }
  // Free coll tasks
  struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue);
  while (ct != nullptr) {
@@ -1645,7 +1655,9 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
  planner->persistent = persistent;
  int nPlans = 0;

-  if (planner->nTasksColl + planner->nTasksP2p != 0) {
+  if (planner->nTasksColl + planner->nTasksP2p != 0 ||
+      !ncclIntruQueueEmpty(&planner->collSymTaskQueue) ||
+      !ncclIntruQueueEmpty(&planner->collCeTaskQueue)) {
    do {
      memset(&planner->wipPlan, 0, sizeof(planner->wipPlan));

@@ -1657,55 +1669,55 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
      plan->workStorageType = persistent ? ncclDevWorkStorageTypePersistent
                                         : ncclDevWorkStorageTypeFifo;

-      if (planner->isSymColl) {
-        plan->workStorageType = ncclDevWorkStorageTypeArgs;
+      if (!ncclIntruQueueEmpty(&planner->collCeTaskQueue)) {
+        struct ncclTaskColl* task = ncclIntruQueueHead(&planner->collCeTaskQueue);
+        plan->isCeColl = true;
+        plan->ceCollArgs = ncclMemoryStackAlloc<struct ncclCeCollArgs>(&comm->memScoped);
+        plan->ceCollArgs->rootRank = task->root;
+        plan->ceCollArgs->nElts = task->count;
+        plan->ceCollArgs->eltSize = ncclTypeSize(task->datatype);
+        plan->ceCollArgs->sendBuff = (uint8_t*)task->sendbuff;
+        plan->ceCollArgs->recvBuff = (uint8_t*)task->recvbuff;
+        plan->ceCollArgs->func = task->func;
+        plan->ceCollArgs->sendWin = task->sendWin;
+        plan->ceCollArgs->recvWin = task->recvWin;

-        struct ncclTaskColl* task = ncclIntruQueueHead(&planner->collTaskQueue);
-        plan->isSymColl = true;
-        plan->kernelFn = ncclSymGetKernelPtr((ncclSymKernelId)task->devFuncId, task->opDev.op, task->datatype);
-        plan->threadPerBlock = task->nWarps*WARP_SIZE;
-        for (int i = 0; i < MAXCHANNELS/64; i++)
-          plan->channelMask.masks[i] = uint64_t(-1) >> (64-task->nMaxChannels);
-        // plan->channelMask = uint64_t(-1) >> (64-task->nMaxChannels);
-
-        plan->kernelArgsSize = sizeof(struct ncclSymDevArgs);
-        plan->kernelSymArgs = ncclMemoryStackAlloc<struct ncclSymDevArgs>(&comm->memScoped);
-        plan->kernelSymArgs->comm = comm->symDevComm;
-        plan->kernelSymArgs->rootRank = task->root;
-        plan->kernelSymArgs->redOpArg = task->opDev.scalarArg;
-        plan->kernelSymArgs->nElts = task->count;
-        plan->kernelSymArgs->input = (char*)task->sendbuff;
-        plan->kernelSymArgs->output = (char*)task->recvbuff;
-
-        planner->nTasksColl -= 1;
        ncclIntruQueueEnqueue(&planner->planQueue, plan);
-        INFO(NCCL_TUNING, "%s [Symmetric]: %ld Bytes -> Kernel %s nchannels %d nthreads %d",
-        ncclFuncToString(task->func), task->count * ncclTypeSize(task->datatype), ncclSymKernelIdToString(task->devFuncId), task->nMaxChannels, plan->threadPerBlock);
+        ncclIntruQueueDequeue(&planner->collCeTaskQueue);
+        ncclMemoryPoolFree(&comm->memPool_ncclTaskColl, task);
        nPlans += 1;
      } else {
-        struct ncclKernelPlanBudget budget;
-        budget.inArgsBytes = comm->workArgsBytes - sizeof(struct ncclDevKernelArgs);
-        // Non-persistent kernels fill up at most half of our fifo per kernel.
-        budget.outArgsBytes = plan->persistent ? (1<<30) : comm->workFifoBytes/2;
+	if (!ncclIntruQueueEmpty(&planner->collSymTaskQueue)) {
+          NCCLCHECKGOTO(ncclSymmetricTaskScheduler(comm, &planner->collSymTaskQueue, plan), result, failure);
+        }
+        else {
+          struct ncclKernelPlanBudget budget;
+          budget.inArgsBytes = comm->workArgsBytes - sizeof(struct ncclDevKernelArgs);
+          // Non-persistent kernels fill up at most half of our fifo per kernel.
+          budget.outArgsBytes = plan->persistent ? (1<<30) : comm->workFifoBytes/2;

-        // Drain coll tasks first. This is essential since we partition tasks based
-        // on the work budget and p2p work isn't collective. If we were to drain p2p
-        // first, the place where we cut the kernel could vary by rank which would
-        // cause the "shortest channel first" channel picker to have divergent results.
-        if (planner->nTasksColl != 0) {
-          NCCLCHECKGOTO(scheduleCollTasksToPlan(comm, plan, &budget), result, failure);
-        }
-        // And only drain p2p tasks once colls are depleted.
-        if (planner->nTasksColl == 0 && planner->nTasksP2p != 0) {
-          NCCLCHECKGOTO(scheduleP2pTasksToPlan(comm, plan, &budget), result, failure);
+          // Drain coll tasks first. This is essential since we partition tasks based
+          // on the work budget and p2p work isn't collective. If we were to drain p2p
+          // first, the place where we cut the kernel could vary by rank which would
+          // cause the "shortest channel first" channel picker to have divergent results.
+          if (planner->nTasksColl != 0) {
+            NCCLCHECKGOTO(scheduleCollTasksToPlan(comm, plan, &budget), result, failure);
+          }
+          // And only drain p2p tasks once colls are depleted.
+          if (planner->nTasksColl == 0 && planner->nTasksP2p != 0) {
+            NCCLCHECKGOTO(scheduleP2pTasksToPlan(comm, plan, &budget), result, failure);
+          }
        }
+
        finishPlan(comm, plan);
        if (plan->workBytes != 0) {
          ncclIntruQueueEnqueue(&planner->planQueue, plan);
          nPlans += 1;
        }
      }
-    } while (planner->nTasksColl + planner->nTasksP2p != 0);
+    } while (planner->nTasksColl + planner->nTasksP2p != 0 ||
+             !ncclIntruQueueEmpty(&planner->collSymTaskQueue) ||
+             !ncclIntruQueueEmpty(&planner->collCeTaskQueue));

    struct ncclKernelPlan* planHead = ncclIntruQueueHead(&planner->planQueue);
    planner->unlaunchedPlansHead = planHead;
@@ -1789,7 +1801,6 @@ ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, stru
 NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote);
 #endif

-NCCL_PARAM(NvlinkUtilCentricSchedEnable, "NVLINK_UTIL_CENTRIC_SCHED_ENABLE", 0);
 ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) {
  ncclResult_t ret = ncclSuccess;
  struct ncclKernelPlanner* planner = &comm->planner;
@@ -1804,6 +1815,9 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
  dim3 block = {(unsigned)plan->threadPerBlock, 1, 1};
  int smem = rcclShmemDynamicSize(comm->cudaArch, comm->WarpSize);
  cudaStream_t launchStream = planner->streams->stream;
+
+  NCCLCHECK(ncclProfilerStartKernelLaunchEvent(plan, launchStream));
+  
  void* extra[] = {plan->kernelArgs, &plan->kernelArgsSize};

  auto event = latency_profiler::collTraceAquireEventBaseline(plan, launchStream);
@@ -1860,25 +1874,24 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
    }
    #endif
    #if CUDART_VERSION >= 12030
-    bool capturing = ncclCudaGraphValid(planner->capturingGraph);
    enum ncclImplicitOrder implicitOrder;
-    NCCLCHECKGOTO(getImplicitOrder(&implicitOrder, capturing, driverVersion), ret, do_return);
+    NCCLCHECKGOTO(getImplicitOrder(&implicitOrder, plan->persistent, driverVersion), ret, do_return);
    if (implicitOrder == ncclImplicitOrderLaunch) {
      launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT;
      launchAttrs[attrs].value.launchCompletionEvent.event = comm->sharedRes->launchEvent;
      launchAttrs[attrs].value.launchCompletionEvent.flags = 0;
      attrs++;
    }
-    if (comm->planner.isSymColl && compCap >= 90 && driverVersion >= 12030) {
+    if (plan->isSymColl && compCap >= 90 && driverVersion >= 12030) {
      launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION;
      launchAttrs[attrs].value.programmaticStreamSerializationAllowed = 1;
      attrs++;
    }
    #endif
    #if CUDART_VERSION >= 13000
-    if (compCap >= 90 && driverVersion >= 13000) {
+    if (compCap >= 100 && driverVersion >= 13000) {
      launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_NVLINK_UTIL_CENTRIC_SCHEDULING;
-      launchAttrs[attrs].value.nvlinkUtilCentricScheduling = ncclParamNvlinkUtilCentricSchedEnable();
+      launchAttrs[attrs].value.nvlinkUtilCentricScheduling = comm->config.nvlinkCentricSched;
      attrs++;
    }
    #endif
@@ -1911,6 +1924,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
  latency_profiler::collTraceRecordEndEvent(comm, plan, launchStream, std::move(event));

 do_return:
+  NCCLCHECK(ncclProfilerStopKernelLaunchEvent(plan));
  return ret;
 }

@@ -2047,7 +2061,7 @@ static ncclResult_t updateCollCostTable(
    float** collCostTable) {
  float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;

-  if (comm->nRanks == 1 || info->func == ncclFuncAllToAllPivot || info->func == ncclFuncAllToAllGda) {
+  if (comm->nRanks == 1 || info->func == ncclFuncAlltoAllPivot || info->func == ncclFuncAllToAllGda) {
    table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0;
    return ncclSuccess;
  }
@@ -2056,6 +2070,8 @@ static ncclResult_t updateCollCostTable(
    if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetSupport != 1) continue;
    // CollNetDirect is only supported for up to 8 local GPUs
    if (a == NCCL_ALGO_COLLNET_DIRECT && comm->maxLocalRanks > NCCL_MAX_DIRECT_ARITY+1) continue;
+    // Disable CollNet Chain for more than 8 local GPUs
+    if (a == NCCL_ALGO_COLLNET_CHAIN && comm->maxLocalRanks > NCCL_MAX_DIRECT_ARITY+1) continue;
    if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && (!nvlsSupport || (info->func != ncclFuncAllReduce && comm->localRanks > NCCL_MAX_NVLS_ARITY))) continue;
    if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue;
    /* Tree reduceScatter doesn't support scaling yet */
@@ -2160,7 +2176,11 @@ static ncclResult_t topoGetAlgoInfo(
    }
  } else if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) {
    // NVLS should not need more than 16 channels to get peak BW.
-    nc = comm->nvlsChannels;
+    if (comm->nNodes > 1 && info->algorithm == NCCL_ALGO_NVLS) {
+      nc = std::min(comm->nvlsChannels, comm->nChannels);
+    } else {
+      nc = comm->nvlsChannels;
+    }
  } else {
    rcclUpdateThreadThreshold(comm, nBytes, info, threadThreshold);
    INFO(NCCL_INIT, "pre-adjustment threadThreshold:%i nBytes:%lu nc:%i", threadThreshold, nBytes, nc);
@@ -2348,7 +2368,7 @@ static ncclResult_t calcCollChunking(
      info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect :
      ncclPatternRing;
    break;
-  case ncclFuncAllToAllPivot:
+  case ncclFuncAlltoAllPivot:
    pattern = ncclPatternRing;
    break;
  case ncclFuncAllToAllGda:
@@ -2510,6 +2530,7 @@ static ncclResult_t calcCollChunking(
  }
  proxyOp->pattern = pattern;
  proxyOp->coll = info->func;
+  proxyOp->collAPI = info->func;
  proxyOp->root = info->root;
  proxyOp->isOneRPN = comm->isOneRPN;
  // This is used by P2P to reduce the receive buffer size. We don't use it in collectives
@@ -2573,6 +2594,35 @@ static ncclResult_t calcCollChunking(
    proxyOp->nbytes = DIVUP(nBytes, nChannels);
  }

+  // Set peer count hints used by network plugin
+  switch (proxyOp->pattern) {
+  case ncclPatternRing:
+  case ncclPatternRingTwice:
+  case ncclPatternPipelineFrom:
+  case ncclPatternPipelineTo:
+  case ncclPatternPatUp:
+  case ncclPatternPatDown:
+    proxyOp->nPeers = 1;
+    break;
+  case ncclPatternTreeUp:
+  case ncclPatternTreeDown:
+  case ncclPatternTreeUpDown:
+  case ncclPatternNvlsTree:
+    proxyOp->nPeers = (NCCL_MAX_TREE_ARITY - 1) * 2;
+    break;
+  case ncclPatternCollnetChain:
+  case ncclPatternCollnetDirect:
+  case ncclPatternNvls:
+  case ncclPatternProfiler:
+    // Peer count hints unused
+    break;
+  case ncclPatternSend:
+  case ncclPatternRecv:
+  default:
+    WARN("Unknown pattern %d", pattern);
+    return ncclInternalError;
+  }
+
  *outChunkSize = proxyOp->chunkSize;
  return ncclSuccess;
 }
@@ -2673,128 +2723,8 @@ static ncclResult_t hostToDevRedOp(
  return ncclSuccess;
 }

-// Converts `info` to a task and adds it to `comm->planner`. The exception is with
-// single rank communicators, collectives are issued as `ncclMemcpyAsync`s and
-// thus don't need a task.
-static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
+static ncclResult_t ncclPlannerSetCapturingGraph(struct ncclComm* comm, struct ncclInfo* info) {
  struct ncclKernelPlanner *planner = &comm->planner;
-
-  if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv) {
-    int peer = info->root;
-    ssize_t nBytes = info->count*ncclTypeSize(info->datatype);
-    bool isSendNotRecv = info->coll == ncclFuncSend;
-
-    // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
-    ncclGroupCommJoin(info->comm, ncclGroupTaskTypeCollective);
-    struct ncclTaskP2p* p2p = ncclMemoryPoolAlloc<struct ncclTaskP2p>(&comm->memPool_ncclTaskP2p, &comm->memPermanent);
-    p2p->func = info->coll;
-    p2p->buff = (void*)info->recvbuff;
-    p2p->count = info->count;
-    p2p->datatype = info->datatype;
-    p2p->root = info->root;
-    p2p->bytes = nBytes;
-    p2p->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED);
-    p2p->opCount = comm->opCount;
-    ncclIntruQueueEnqueue(
-      isSendNotRecv ? &planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue,
-      p2p);
-    planner->nTasksP2p += 1;
-
-    // Mark channels that need pre-connect
-    if (comm->rank != peer) {
-      if (!(isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen)) {
-        // planner->peers[peer].send/recvSeen is private to each comm, so we need to set it anyway.
-        (isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen) = true;
-        int round = 0;
-        while (peer != (isSendNotRecv ? comm->p2pSchedule[round].sendRank
-                                      : comm->p2pSchedule[round].recvRank)) {
-          round += 1;
-        }
-        uint8_t base = ncclP2pChannelBaseForRound(comm, round, rcclParamP2pBatchEnable());
-        for (int c=0; c < comm->p2pnChannelsPerPeer; c++) {
-          int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, c, comm->p2pnChannelsPerPeer, comm->nNodes);
-          if (isSendNotRecv) {
-            if (comm->channels[channelId].peers[peer]->send[1].hasSeen == 0) { // P2P uses only 1 connector
-              // the send/recv connector is shared among split shared comms. We need to set hasSeen to
-              // 1 in order to avoid duplicate connection setup if user group sendrecv ops with split
-              // shared comms together.
-              comm->channels[channelId].peers[peer]->send[1].hasSeen = 1;
-              //comm->connectSend[peer] |= (1UL<<channelId);
-	            comm->connectSend[peer].masks[channelId/64] |= (1UL<<(channelId%64));
-              ncclGroupCommPreconnect(comm);
-            }
-            if (comm->p2pNet && comm->channels[channelId].peers[peer]->send[NCCL_CONN_IDX_P2P_NET].hasSeen == 0) {
-              comm->channels[channelId].peers[peer]->send[1].hasSeen = 1;
-              //comm->connectSend[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1UL<<channelId);
-	            comm->connectSend[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET].masks[channelId/64] |= (1UL<<(channelId%64));
-              ncclGroupCommPreconnect(comm);
-            }
-          } else {
-            if (comm->channels[channelId].peers[peer]->recv[1].hasSeen == 0) { // P2P uses only 1 connector
-              comm->channels[channelId].peers[peer]->recv[1].hasSeen = 1;
-              //comm->connectRecv[peer] |= (1UL<<channelId);
-	            comm->connectRecv[peer].masks[channelId/64] |= (1UL<<(channelId%64));
-              ncclGroupCommPreconnect(comm);
-            }
-            if (comm->p2pNet && comm->channels[channelId].peers[peer]->recv[NCCL_CONN_IDX_P2P_NET].hasSeen == 0) {
-              comm->channels[channelId].peers[peer]->recv[1].hasSeen = 1;
-              //comm->connectRecv[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1UL<<channelId);
-	            comm->connectRecv[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET].masks[channelId/64] |= (1UL<<(channelId%64));
-              ncclGroupCommPreconnect(comm);
-            }
-          }
-        }
-      }
-    }
-  } else {
-    // Empty collectives can be discarded.
-    if (info->count == 0) return ncclSuccess;
-
-    if (info->datatype == ncclFloat8e4m3 || info->datatype == ncclFloat8e5m2) {
-      if (comm->minCompCap < 90) {
-        WARN("FP8 reduction support begins with sm90 capable devices.");
-        return ncclInvalidArgument;
-      }
-    }
-
-    // Copy reduction op state from op handle into info struct here since the
-    // op handle may be destroyed before ncclGroupEnd().
-    struct ncclDevRedOpFull opDev;
-    NCCLCHECK(hostToDevRedOp(&opDev, info->op, info->datatype, comm));
-
-    if (comm->nRanks == 1) {
-      NCCLCHECK(ncclLaunchOneRank(info->recvbuff, info->sendbuff, info->count, opDev, info->datatype, info->stream));
-      return ncclSuccess;
-    } else {
-      // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
-      ncclGroupCommJoin(info->comm, ncclGroupTaskTypeCollective);
-      struct ncclTaskColl* t = ncclMemoryPoolAlloc<struct ncclTaskColl>(&comm->memPool_ncclTaskColl, &comm->memPermanent);
-      t->func = info->coll;
-      t->sendbuff = info->sendbuff;
-      t->recvbuff = info->recvbuff;
-      t->count = info->count;
-      t->root = info->root;
-      t->datatype = info->datatype;
-      size_t elementSize = ncclTypeSize(t->datatype);
-      if (t->func == ncclFuncAllGather || t->func == ncclFuncBroadcast || t->func == ncclFuncAllToAllPivot || t->func == ncclFuncAllToAllGda) {
-        t->count *= elementSize;
-        t->datatype = ncclInt8;
-        elementSize = 1;
-      }
-      t->trafficBytes = t->count*elementSize*ncclFuncTrafficPerByte(t->func, comm->nRanks);
-      t->opHost = info->op;
-      t->opDev = opDev; // C++ struct assignment
-      t->chunkSteps = info->chunkSteps;
-      t->sliceSteps = info->sliceSteps;
-      t->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED);
-      t->opCount = comm->opCount;
-      t->acc = info->acc;
-
-      planner->nTasksColl += 1;
-      ncclTaskCollSorterInsert(&planner->collSorter, t, t->trafficBytes);
-    }
-  }
-
  if (info->stream != planner->streamRecent || planner->streams == nullptr) {
    planner->streamRecent = info->stream;
    struct ncclCudaStreamList* l = planner->streams;
@@ -2823,7 +2753,279 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
  return ncclSuccess;
 }

+static ncclResult_t p2pTaskAppend(
+    struct ncclComm* comm,
+    struct ncclInfo* info,
+    ncclFunc_t coll,
+    ncclFunc_t collAPI,
+    void* buff,
+    size_t count,
+    ncclDataType_t datatype,
+    int peer) {
+  struct ncclKernelPlanner *planner = &comm->planner;
+  // Queues one send or recv task for `peer` and marks the channels that still need a
+  // pre-connect. nBytes is the payload size; isSendNotRecv selects the send/recv queue.
+  ssize_t nBytes = count*ncclTypeSize(datatype);
+  bool isSendNotRecv = coll == ncclFuncSend;
+
+  // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
+  ncclGroupCommJoin(comm, ncclGroupTaskTypeCollective);
+  info->coll = coll;
+  // Set capturing graph. Called here so that profiler can emit a group API event with this information
+  NCCLCHECK(ncclPlannerSetCapturingGraph(comm, info));
+  bool isGraphCaptured = ncclCudaGraphValid(planner->capturingGraph);
+  NCCLCHECK(ncclProfilerStartGroupApiEvent(info, isGraphCaptured));
+  NCCLCHECK(ncclProfilerRecordGroupApiEventState(ncclProfilerGroupStartApiStop));
+
+  NCCLCHECK(ncclProfilerStartP2pApiEvent(info, isGraphCaptured));
+
+  struct ncclTaskP2p* p2p = ncclMemoryPoolAlloc<struct ncclTaskP2p>(&comm->memPool_ncclTaskP2p, &comm->memPermanent);
+  p2p->func = coll;
+  p2p->collAPI = collAPI;
+  p2p->buff = buff;
+  p2p->count = count;
+  p2p->datatype = datatype;
+  p2p->root = peer;
+  p2p->bytes = nBytes;
+  p2p->eActivationMask = ncclProfilerApiState.eActivationMask;
+  p2p->groupApiEventHandle = ncclProfilerApiState.groupApiEventHandle;
+  p2p->p2pApiEventHandle = ncclProfilerApiState.p2pApiEventHandle;
+  ncclIntruQueueEnqueue(
+    isSendNotRecv ? &planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue,
+    p2p);
+  planner->nTasksP2p += 1;
+  if (isSendNotRecv)
+    planner->nTasksP2pSend += 1;
+  else
+    planner->nTasksP2pRecv += 1;
+
+  // Mark channels that need pre-connect
+  if (comm->rank != peer) {
+    if (!(isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen)) {
+      // planner->peers[peer].send/recvSeen is private to each comm, so we need to set it anyway.
+      (isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen) = true;
+      int round = 0;
+      while (peer != (isSendNotRecv ? comm->p2pSchedule[round].sendRank
+                                    : comm->p2pSchedule[round].recvRank)) {
+        round += 1;
+      }
+      uint8_t base = ncclP2pChannelBaseForRound(comm, round);
+      for (int c=0; c < comm->p2pnChannelsPerPeer; c++) {
+        int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, c, comm->p2pnChannelsPerPeer, comm->nNodes);
+        if (isSendNotRecv) {
+          if (comm->channels[channelId].peers[peer]->send[1].hasSeen == 0) { // P2P uses only 1 connector
+            // the send/recv connector is shared among split shared comms. We need to set hasSeen to
+            // 1 in order to avoid duplicate connection setup if user group sendrecv ops with split
+            // shared comms together.
+            comm->channels[channelId].peers[peer]->send[1].hasSeen = 1;
+            comm->channels[channelId].peers[peer]->send[1].p2pOnly = 1;
+            // comm->connectSend[peer] |= (1UL<<channelId);
+            comm->connectSend[peer].masks[channelId/64] |= (1UL<<(channelId%64));
+            ncclGroupCommPreconnect(comm);
+          }
+          if (comm->p2pNet && comm->channels[channelId].peers[peer]->send[NCCL_CONN_IDX_P2P_NET].hasSeen == 0) {
+            comm->channels[channelId].peers[peer]->send[NCCL_CONN_IDX_P2P_NET].hasSeen = 1;
+            // Flag the P2P_NET connector that was just tested (index 1 was handled above).
+            comm->connectSend[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET].masks[channelId/64] |= (1UL<<(channelId%64));
+            ncclGroupCommPreconnect(comm);
+          }
+        } else {
+          if (comm->channels[channelId].peers[peer]->recv[1].hasSeen == 0) { // P2P uses only 1 connector
+            comm->channels[channelId].peers[peer]->recv[1].hasSeen = 1;
+            comm->channels[channelId].peers[peer]->recv[1].p2pOnly = 1;
+            // comm->connectRecv[peer] |= (1UL<<channelId);
+            comm->connectRecv[peer].masks[channelId/64] |= (1UL<<(channelId%64));
+            ncclGroupCommPreconnect(comm);
+          }
+          if (comm->p2pNet && comm->channels[channelId].peers[peer]->recv[NCCL_CONN_IDX_P2P_NET].hasSeen == 0) {
+            comm->channels[channelId].peers[peer]->recv[NCCL_CONN_IDX_P2P_NET].hasSeen = 1;
+            // Flag the P2P_NET connector that was just tested (index 1 was handled above).
+            comm->connectRecv[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET].masks[channelId/64] |= (1UL<<(channelId%64));
+            ncclGroupCommPreconnect(comm);
+          }
+        }
+      }
+    }
+  }
+  ncclProfilerStopP2pApiEvent();
+  return ncclSuccess;
+}
+
+static ncclResult_t collTaskAppend(
+    struct ncclComm* comm,
+    struct ncclInfo* info,
+    struct ncclDevRedOpFull opDev) {
+  struct ncclKernelPlanner *planner = &comm->planner;
+  // Builds one collective task from `info` and inserts it into the planner's task sorter.
+  // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
+  ncclGroupCommJoin(info->comm, ncclGroupTaskTypeCollective);
+  // Set capturing graph. Called here so that profiler can emit a group API event with this information
+  NCCLCHECK(ncclPlannerSetCapturingGraph(comm, info));
+  bool isGraphCaptured = ncclCudaGraphValid(planner->capturingGraph);
+  NCCLCHECK(ncclProfilerStartGroupApiEvent(info, isGraphCaptured));
+  NCCLCHECK(ncclProfilerRecordGroupApiEventState(ncclProfilerGroupStartApiStop));
+  NCCLCHECK(ncclProfilerStartCollApiEvent(info, isGraphCaptured));
+
+  struct ncclTaskColl* t = ncclMemoryPoolAlloc<struct ncclTaskColl>(&comm->memPool_ncclTaskColl, &comm->memPermanent);
+  t->func = info->coll;
+  t->sendbuff = info->sendbuff;
+  t->recvbuff = info->recvbuff;
+  t->count = info->count;
+  t->root = info->root;
+  t->datatype = info->datatype;
+  size_t elementSize = ncclTypeSize(t->datatype);
+  if (t->func == ncclFuncAllGather || t->func == ncclFuncBroadcast || t->func == ncclFuncAlltoAllPivot || t->func == ncclFuncAllToAllGda) {
+    t->count *= elementSize;
+    t->datatype = ncclInt8;
+    elementSize = 1; // count is now expressed in bytes
+  }
+  t->trafficBytes = t->count*elementSize*ncclFuncTrafficPerByte(t->func, comm->nRanks);
+  t->opHost = info->op;
+  t->opDev = opDev; // C++ struct assignment
+  t->chunkSteps = info->chunkSteps;
+  t->sliceSteps = info->sliceSteps;
+  t->eActivationMask = ncclProfilerApiState.eActivationMask;
+  t->groupApiEventHandle = ncclProfilerApiState.groupApiEventHandle;
+  t->collApiEventHandle = ncclProfilerApiState.collApiEventHandle;
+  t->opCount = comm->opCount;
+  t->acc = info->acc;
+
+  planner->nTasksColl += 1;
+  ncclTaskCollSorterInsert(&planner->collSorter, t, t->trafficBytes);
+
+  ncclProfilerStopCollApiEvent();
+  return ncclSuccess;
+}
+
+static ncclResult_t ceCollTaskAppend(
+    struct ncclComm* comm,
+    struct ncclInfo* info,
+    struct ncclDevrWindow* sendWin,
+    struct ncclDevrWindow* recvWin,
+    struct ncclDevRedOpFull opDev) {
+  // Request CE initialization only while `baseUCSymReadyPtr` is still NULL
+  // and no init task is already waiting in the comm's CE init queue.
+  const bool ceNeedsInit =
+      comm->ceColl.baseUCSymReadyPtr == NULL && ncclIntruQueueEmpty(&comm->ceInitTaskQueue);
+  if (ceNeedsInit) {
+    struct ncclCeInitTask* initTask;
+    NCCLCHECK(ncclCalloc(&initTask, 1));
+    initTask->comm = comm;
+    ncclIntruQueueEnqueue(&comm->ceInitTaskQueue, initTask);
+    ncclGroupCommJoin(comm, ncclGroupTaskTypeSymRegister);
+  }
+
+  // Join the thread-local group before the task is allocated.
+  ncclGroupCommJoin(info->comm, ncclGroupTaskTypeCollective);
+  NCCLCHECK(ncclPlannerSetCapturingGraph(comm, info));
+  struct ncclTaskColl* task = ncclMemoryPoolAlloc<struct ncclTaskColl>(&comm->memPool_ncclTaskColl, &comm->memPermanent);
+  // Copy the caller's description of the collective into the task.
+  task->func = info->coll;
+  task->root = info->root;
+  task->sendbuff = info->sendbuff;
+  task->recvbuff = info->recvbuff;
+  task->sendWin = sendWin;
+  task->recvWin = recvWin;
+  task->count = info->count;
+  task->datatype = info->datatype;
+  size_t bytesPerElt = ncclTypeSize(task->datatype);
+  // AllGather and Broadcast are re-expressed as a count of ncclInt8 bytes.
+  if (task->func == ncclFuncAllGather || task->func == ncclFuncBroadcast) {
+    task->count *= bytesPerElt;
+    task->datatype = ncclInt8;
+    bytesPerElt = 1;
+  }
+  task->trafficBytes = task->count*bytesPerElt*ncclFuncTrafficPerByte(task->func, comm->nRanks);
+  task->opHost = info->op;
+  task->opDev = opDev; // C++ struct assignment
+  task->chunkSteps = info->chunkSteps;
+  task->sliceSteps = info->sliceSteps;
+  task->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED);
+  ncclIntruQueueEnqueue(&comm->planner.collCeTaskQueue, task);
+  return ncclSuccess;
+}
+
+// Converts `info` into planner task(s): send/recv and AlltoAll/Gather/Scatter become p2p
+// tasks, other collectives become CE or kernel collective tasks. Empty collectives are
+// dropped, and single-rank communicators are served directly by `ncclLaunchOneRank`.
+static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
+  ncclFunc_t collAPI = info->coll;
+
+  if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv) {
+    NCCLCHECK(p2pTaskAppend(comm, info, info->coll, collAPI, (void*)info->recvbuff, info->count, info->datatype, info->root));
+  } else {
+    // Empty collectives can be discarded.
+    if (info->count == 0) return ncclSuccess;
+
+    if (info->datatype == ncclFloat8e4m3 || info->datatype == ncclFloat8e5m2) {
+      if (comm->minCompCap < 90 && info->coll != ncclFuncAllGather && info->coll != ncclFuncBroadcast && info->coll != ncclFuncAlltoAll && info->coll != ncclFuncScatter && info->coll != ncclFuncGather) {
+        WARN("FP8 reduction support begins with sm90 capable devices.");
+        return ncclInvalidArgument;
+      }
+    }
+
+    // Copy reduction op state from op handle into info struct here since the
+    // op handle may be destroyed before ncclGroupEnd().
+    struct ncclDevRedOpFull opDev;
+    NCCLCHECK(hostToDevRedOp(&opDev, info->op, info->datatype, comm));
+
+    if (comm->nRanks == 1) {
+      NCCLCHECK(ncclLaunchOneRank(info->recvbuff, info->sendbuff, info->count, opDev, info->datatype, info->stream));
+      return ncclSuccess;
+    } else {
+      struct ncclDevrWindow* sendWin = nullptr; // null unless ncclDevrFindWindow fills it in
+      struct ncclDevrWindow* recvWin = nullptr; // null unless ncclDevrFindWindow fills it in
+      ncclDevrFindWindow(comm, info->sendbuff, &sendWin);
+      ncclDevrFindWindow(comm, info->recvbuff, &recvWin);
+      bool ceImplemented = ncclCeImplemented(info->coll, info->op, info->datatype);
+
+      // Append CE collective task if CE is supported and requested by user
+      if (comm->symmetricSupport && comm->nNodes == 1 && sendWin && recvWin && (sendWin->winFlags & recvWin->winFlags & NCCL_WIN_COLL_SYMMETRIC) && comm->config.CTAPolicy == NCCL_CTA_POLICY_ZERO && ceImplemented) {
+        NCCLCHECK(ceCollTaskAppend(comm, info, sendWin, recvWin, opDev));
+      }
+      // Append kernel-based collective
+      else {
+        if (info->coll == ncclFuncAlltoAll) {
+          for (int r=0; r<comm->nRanks; r++) {
+            NCCLCHECK(p2pTaskAppend(comm, info, ncclFuncSend, collAPI, (void*)((char*)info->sendbuff+r*info->count*ncclTypeSize(info->datatype)), info->count, info->datatype, r));
+            NCCLCHECK(p2pTaskAppend(comm, info, ncclFuncRecv, collAPI, (void*)((char*)info->recvbuff+r*info->count*ncclTypeSize(info->datatype)), info->count, info->datatype, r));
+          }
+        } else if (info->coll == ncclFuncGather){
+          size_t offset = 0;
+          NCCLCHECK(p2pTaskAppend(comm, info, ncclFuncSend, collAPI, (void*)info->sendbuff, info->count, info->datatype, info->root));
+          if (comm->rank == info->root) {
+            for (int r=0; r<comm->nRanks; r++) {
+              void* buff = (void*)((char*)info->recvbuff + offset);
+              NCCLCHECK(p2pTaskAppend(comm, info, ncclFuncRecv, collAPI, buff, info->count, info->datatype, r));
+              offset += info->count * ncclTypeSize(info->datatype);
+            }
+          }
+        } else if (info->coll == ncclFuncScatter) {
+          size_t offset = 0;
+          if (comm->rank == info->root) {
+            for (int r = 0; r < comm->nRanks; r++) {
+              void* buff = (void*)((char*)info->sendbuff + offset);
+              NCCLCHECK(p2pTaskAppend(comm, info, ncclFuncSend, collAPI, buff, info->count, info->datatype, r));
+              offset += info->count * ncclTypeSize(info->datatype);
+            }
+          }
+          NCCLCHECK(p2pTaskAppend(comm, info, ncclFuncRecv, collAPI, (void*)info->recvbuff, info->count, info->datatype, info->root));
+        } else {
+          NCCLCHECK(collTaskAppend(comm, info, opDev));
+        }
+      }
+    }
+  }
+
+  return ncclSuccess;
+}
+
 ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
+  // Profiler - If a group API event has already started, update the profilerGroupDepth so that the depth
+  // updates correctly for implicit ncclGroupStartInternal and ncclGroupEndInternal calls
+  if (ncclProfilerApiState.profilerGroupDepth > 0) {
+    ncclProfilerApiState.profilerGroupDepth++;
+  }
  NCCLCHECK(ncclGroupStartInternal());
  ncclResult_t ret = ncclSuccess;
  int devOld = -1;
@@ -0,0 +1,14 @@
+# Graph sources, each referenced relative to this directory via CMAKE_CURRENT_SOURCE_DIR
+set(GRAPH_SOURCES
+    ${CMAKE_CURRENT_SOURCE_DIR}/topo.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/tuning.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/xml.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/search.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/paths.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/connect.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/rings.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/trees.cc
+)
+
+# Publish GRAPH_SOURCES to the parent scope so the including CMakeLists can use it
+set(GRAPH_SOURCES ${GRAPH_SOURCES} PARENT_SCOPE)
@@ -24,6 +24,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs
  int localRanks = comm->topo->nodes[GPU].count;
  int nChannels = comm->nChannels;

+  topoRanks->crossNicRing = graphs[NCCL_ALGO_RING]->crossNic;
  topoRanks->nvlsHeadNum = 0;
  for (int c=0; c<nChannels; c++) {
    struct ncclChannel* channel = comm->channels+c;
@@ -430,7 +431,6 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
    sprintf(line+strlen(line), "nUp %d nHeads %d ", nUp, nHeads);
    sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collnetDirect.headRank, channel->collnetDirect.out, channel->collnetDirect.shift);
    INFO(NCCL_GRAPH, "%s", line);
-    channel->collnetChain.depth = comm->nRanks/comm->nNodes;
  }
  free(heads);
  return ncclSuccess;
@@ -447,7 +447,7 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHead
    if (nvlsHeads[h * comm->nNodes + comm->node] == comm->rank) headRank = h;
  }

-  for (int c=0; c<comm->nChannels; c++) {
+  for (int c=0; c<comm->nvlsChannels; c++) {
    struct ncclChannel* channel = comm->channels+c;
    channel->nvls.nHeads = nHeads;
    for (int h=0; h<nHeads; h++) channel->nvls.up[h] = comm->nRanks+1+h;
@@ -499,7 +499,7 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHead
  }
  // Set prev/next in all channels (NVLS compute channels work
  // orthogonally to NVLS search channels).
-  for (int c=0; c<comm->nChannels; c++) {
+  for (int c=0; c<comm->nvlsChannels; c++) {
    struct ncclChannel* channel = comm->channels+c;
    channel->nvls.treeUp = treeUp[c%2];
    channel->nvls.treeDown[0] = channel->nvls.down;
@@ -731,17 +731,17 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
  NCCLCHECKGOTO(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS), ret, fail);
  NCCLCHECKGOTO(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS), ret, fail);

-  // Alternate rings to avoid crossing rails
-  if (graphs[NCCL_ALGO_RING]->crossNic == 2 && (nChannels % 2) == 0) {
-    for (int r=0; r<comm->nRanks; r++) {
-      if (comm->rankToNode[r] % 2 == 1) {
-        // Exchange rings
-        for (int c=0; c<nChannels; c+=2) {
-          exchangeValues(allTopoRanks[r]->ringRecv+c, allTopoRanks[r]->ringRecv+(c^1));
-          exchangeValues(allTopoRanks[r]->ringSend+c, allTopoRanks[r]->ringSend+(c^1));
-          exchangeValues(allTopoRanks[r]->ringPrev+c, allTopoRanks[r]->ringPrev+(c^1));
-          exchangeValues(allTopoRanks[r]->ringNext+c, allTopoRanks[r]->ringNext+(c^1));
-        }
+  // Alternate rings to avoid crossing rails.
+  // CrossNic values may differ between nodes, as they depend on the number of net devs and the NVLink bandwidth.
+  // Therefore, the exchange is only done for ranks that obtained a solution with crossNic=2.
+  for (int r = 0; r < comm->nRanks; r++) {
+    if (allTopoRanks[r]->crossNicRing == 2 && (nChannels % 2) == 0 && (comm->rankToNode[r] % 2) == 1) {
+      // Exchange rings
+      for (int c=0; c<nChannels; c+=2) {
+        exchangeValues(allTopoRanks[r]->ringRecv+c, allTopoRanks[r]->ringRecv+(c^1));
+        exchangeValues(allTopoRanks[r]->ringSend+c, allTopoRanks[r]->ringSend+(c^1));
+        exchangeValues(allTopoRanks[r]->ringPrev+c, allTopoRanks[r]->ringPrev+(c^1));
+        exchangeValues(allTopoRanks[r]->ringNext+c, allTopoRanks[r]->ringNext+(c^1));
      }
    }
  }
@@ -858,7 +858,14 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
      int collNetNchannels = std::min(maxChannels, nChannels+nChannels/2);
      nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
    }
-    NCCLCHECKGOTO(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]), ret, fail);
+
+    for (int c = 0; c < comm->nChannels; c++) {
+      comm->channels[c].collnetChain.depth = comm->nRanks/comm->nNodes;
+    }
+
+    if (comm->maxLocalRanks <= NCCL_MAX_DIRECT_ARITY+1) {
+      NCCLCHECKGOTO(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]), ret, fail);
+    }
  }

  // Use 4 compute channels per search channel to reach peak BW on <8 PPN
@@ -910,9 +917,6 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
  if (shared && comm->nvlsChannels > parent->nvlsResources->nChannels) {
    comm->nvlsChannels = parent->nvlsResources->nChannels;
  }
-  if (comm->nChannels < comm->nvlsChannels) {
-    nChannels = comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext);
-  }
  NCCLCHECKGOTO(connectNvls(comm, nvlsHeads, minHeadNum), ret, fail);
 #endif
  if (shared && comm->nChannels > parent->sharedRes->tpNChannels) {
@@ -391,11 +391,15 @@ ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerIn
  nvmlGpuFabricInfoV_t *fabricInfo1 = &info1->fabricInfo;
  nvmlGpuFabricInfoV_t *fabricInfo2 = &info2->fabricInfo;
  // A zero UUID means we don't have MNNVL fabric info
-  if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) return ncclSuccess;
+  unsigned long uuid0 = 0;
+  unsigned long uuid1 = 0;
+  memcpy(&uuid0, fabricInfo2->clusterUuid, sizeof(uuid0));
+  memcpy(&uuid1, fabricInfo2->clusterUuid + sizeof(uuid0), sizeof(uuid1));
+  if ((uuid0 | uuid1) == 0) return ncclSuccess;
  if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) &&
      (fabricInfo1->cliqueId == fabricInfo2->cliqueId)) {
    TRACE(NCCL_NET, "MNNVL matching peer 0x%lx UUID %lx.%lx cliqueId 0x%x",
-         info2->busId, ((long *)fabricInfo2->clusterUuid)[0], ((long *)fabricInfo2->clusterUuid)[1], fabricInfo2->cliqueId);
+         info2->busId, uuid0, uuid1, fabricInfo2->cliqueId);
    *ret = 1;
  }
  return ncclSuccess;
@@ -936,9 +940,6 @@ void ncclTopoFree(struct ncclTopoSystem* system) {
  free(system);
 }

-NCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", -1);
-NCCL_PARAM(NChannelsPerPeer, "NCHANNELS_PER_PEER", -2);
-
 static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gpu index*/, int peerRank, int* nChannels) {
  int peer;
  struct ncclTopoSystem* system = comm->topo;
@@ -959,10 +960,10 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gp
    }
  } else {
    // Remote rank, use network
-    int nNetChannels = ncclParamNChannelsPerNetPeer();
-    if (nNetChannels == -1) {
-      //start from 2 channels per NIC and reduce with scale
-      nNetChannels = 2;
+    int nNetChannels = comm->config.nChannelsPerNetPeer;
+    if (nNetChannels == NCCL_CONFIG_UNDEF_INT) {
+       //start from 2 channels per NIC and reduce with scale
+       nNetChannels = 2;

      // check if we need to use more than one NIC, hence more than one channel
      int netCountByBw = 1, nChannelsMax = nNetChannels;
@@ -1014,7 +1015,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
    comm->p2pnChannels = std::min(pow2Up(comm->p2pnChannels), pow2Down(ncclDevMaxChannelsForArgsBytes(ncclParamWorkArgsBytes())));
  } else {
    // Round to next pow2 nChannelsPerPeer and nChannels
-    comm->p2pnChannelsPerPeer = (ncclParamNChannelsPerPeer() == -2 ? pow2Up(minChannels) : ncclParamNChannelsPerPeer());
+    comm->p2pnChannelsPerPeer = pow2Up(minChannels);
    // Doubling P2P channels per peer on single node
    if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950"))) comm->p2pnChannelsPerPeer *= 2;
    comm->p2pnChannels = std::min(pow2Up(comm->p2pnChannels), 4*CHANNEL_LIMIT);
@@ -9,6 +9,7 @@
 #include "graph.h"
 #include "topo.h"
 #include "comm.h"
+#include "nccl.h"
 #include "nvmlwrap.h"
 #include "coll_net.h"
 #include "transport.h"
@@ -16,6 +17,7 @@
 #include <fcntl.h>
 #include "cpuset.h"
 #include "bootstrap.h"
+#include <mutex>

 #define BUSID_SIZE (sizeof("0000:00:00.0"))
 #define BUSID_REDUCED_SIZE (sizeof("0000:00"))
@@ -427,6 +429,7 @@ ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* s

 #define PCI_BRIDGE_DEVICE_CLASS "0x060400"

+// struct kvDict kvDictPciClass[] = { { PCI_BRIDGE_DEVICE_CLASS, PCI }, {"0x080100", /*CX8 data direct*/PCI}, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, PCI /* Default fallback value */ } };
 struct kvDict kvDictPciClass[] = { { PCI_BRIDGE_DEVICE_CLASS, PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { "0x120000", GPU }, { NULL, PCI /* Default fallback value */ } };
 struct kvDict kvDictPciGen[] = {
  { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */
@@ -1069,8 +1072,7 @@ ncclResult_t ncclTopoMakePciParent(struct ncclXml* xml, struct ncclXmlNode** par
  return ncclSuccess;
 }

-ncclResult_t ncclTopoMakeVnic(struct ncclXml* xml, ncclNetVDeviceProps_t* vProps,
-struct ncclXmlNode** physNetNodes, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
+ncclResult_t ncclTopoMakeVnic(struct ncclXml* xml, struct ncclTopoNetInfo* netInfo, ncclNetVDeviceProps_t* vProps, struct ncclXmlNode** physNetNodes) {
  if (vProps->ndevs > NCCL_NET_MAX_DEVS_PER_NIC) {
    WARN("TOPO/NET : Tried to merge too many NICs. %d > %d", vProps->ndevs, NCCL_NET_MAX_DEVS_PER_NIC);
    return ncclInternalError;
@@ -1084,7 +1086,7 @@ struct ncclXmlNode** physNetNodes, ncclResult_t (*makeVDevice)(int*, ncclNetVDev

  // Trigger the merge, then get the new device's properties
  int vDevIndex = 0;
-  ncclResult_t ret = makeVDevice(&vDevIndex, vProps);
+  ncclResult_t ret = netInfo->makeVDevice(&vDevIndex, vProps);
  if (ret != ncclSuccess) {
    INFO(NCCL_GRAPH|NCCL_INIT|NCCL_NET, "TOPO/NET : Tried merging multiple devices together and failed. vProps={ndevs=%d, devs=[%d %d %d %d]}. Set NCCL_NET_MERGE_LEVEL=LOC to disable NIC fusion.",
      vProps->ndevs, vProps->devs[0], vProps->devs[1], vProps->devs[2], vProps->devs[3]);
@@ -1102,9 +1104,10 @@ struct ncclXmlNode** physNetNodes, ncclResult_t (*makeVDevice)(int*, ncclNetVDev
  return ncclSuccess;
 }

-ncclResult_t ncclTopoForceMerge(struct ncclXml* xml, char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
+ncclResult_t ncclTopoForceMerge(struct ncclXml* xml, struct ncclTopoNetInfo* netInfo, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs) {
  ncclResult_t ret = ncclSuccess;
-  INFO(NCCL_ENV|NCCL_NET, "TOPO/NET : Force-fusing NICs using NCCL_NET_FORCE_MERGE=%s", str);
+  const char* str = netInfo->forceMerge;
+  INFO(NCCL_ENV | NCCL_NET, "TOPO/NET : Force-fusing NICs using NCCL_NET_FORCE_MERGE=%s", str);
  char* ncStr;
  NCCLCHECK(ncclCalloc(&ncStr, strlen(str)+1));
  strcpy(ncStr, str);
@@ -1140,7 +1143,7 @@ ncclResult_t ncclTopoForceMerge(struct ncclXml* xml, char* str, int* placedDevs,
      goto fail;
    }

-    ret = ncclTopoMakeVnic(xml, &vProps, physNetNodes, makeVDevice);
+    ret = ncclTopoMakeVnic(xml, netInfo, &vProps, physNetNodes);
    if (ret == ncclSuccess) {
      // Only set that a device is "placed" after successfully making a vNic (it's possible to exit before this)
      for (int i = 0; i < vProps.ndevs; i++) {
@@ -1162,7 +1165,7 @@ fail:
  goto exit;
 }

-ncclResult_t ncclTopoAutoMerge(struct ncclXml* xml, int mergeLevel, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
+ncclResult_t ncclTopoAutoMerge(struct ncclXml* xml, struct ncclTopoNetInfo* netInfo, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs) {
  // Compute the path type between each device
  int* paths = NULL;
  ncclResult_t res = ncclSuccess;
@@ -1192,7 +1195,7 @@ ncclResult_t ncclTopoAutoMerge(struct ncclXml* xml, int mergeLevel, int* placedD
      // Select each unplaced device "j" which is at most "mergeLevel" distance from "i", but not equal to "i"
      // (Don't merge the same device with itself)
      for (int j = 0; j < nPhysDevs; j++) {
-        if (paths[i*nPhysDevs + j] <= mergeLevel &&
+        if (paths[i*nPhysDevs + j] <= netInfo->mergeLevel &&
        placedDevs[j] == 0 && j != i) {
          vProps.devs[vProps.ndevs++] = j;
          placedDevs[j] = 1;
@@ -1206,7 +1209,7 @@ ncclResult_t ncclTopoAutoMerge(struct ncclXml* xml, int mergeLevel, int* placedD
        return ncclInternalError;
      }

-      ncclResult_t ret = ncclTopoMakeVnic(xml, &vProps, physNetNodes, makeVDevice);
+      ncclResult_t ret = ncclTopoMakeVnic(xml, netInfo, &vProps, physNetNodes);

      // Merging failed.
      // Mark all as unplaced and increase their distance to disconnected (PATH_DIS)
@@ -1244,6 +1247,92 @@ struct kvDict nicPathKvList[] = {
  { NULL, 0 }
 };

+// Recursive step of ncclTopoFindLinkWidth: sets *foundPhysNet to 1 if `node` or a descendant is one of physNetNodes[0..ndevs) (else 0) and *linkWidth to the link_width accrued for that subtree.
+ncclResult_t ncclTopoFindLinkWidthRec(ncclXmlNode* node, ncclXmlNode** physNetNodes, int ndevs, int* foundPhysNet, int* linkWidth) {
+  int myLinkWidth = 0;
+  if (strcmp(node->name, "pci") == 0) {
+    NCCLCHECK(xmlGetAttrInt(node, "link_width", &myLinkWidth));
+#ifdef ENABLE_TRACE
+    const char *busidAttr, *linkAttr;
+    NCCLCHECK(xmlGetAttrStr(node, "busid", &busidAttr));
+    NCCLCHECK(xmlGetAttr(node, "link_width", &linkAttr));
+    TRACE(NCCL_GRAPH, "Found link_width (%s)=%d for busid=%s", linkAttr, myLinkWidth, busidAttr);
+#endif
+  }
+
+  *foundPhysNet = 0;
+  // Check whether this node itself is one of the physical net nodes; the result is propagated to the caller through *foundPhysNet.
+  int devId = 0;
+  while (devId < ndevs && !(*foundPhysNet)) *foundPhysNet = (node == physNetNodes[devId++]);
+
+  int totalChildLinkWidth = 0;
+  for (int i = 0; i < node->nSubs; i++) {
+    ncclXmlNode* child = node->subs[i];
+    int found = 0;
+    int tempLinkWidth = 0;
+    NCCLCHECK(ncclTopoFindLinkWidthRec(child, physNetNodes, ndevs, &found, &tempLinkWidth));
+    if (found) {
+      *foundPhysNet = 1;
+      totalChildLinkWidth += tempLinkWidth;
+    }
+  }
+
+  if (*foundPhysNet == 0) {
+    // Neither this node nor any descendant is one of physNetNodes: contribute no link_width.
+    *linkWidth = 0;
+    INFO(NCCL_GRAPH, "Did not find child net device. Returning link_width=%d totalChildLinkWidth=%d", *linkWidth, totalChildLinkWidth);
+  } else if (totalChildLinkWidth == 0) {
+    // A NIC was found at or below this node but no child reported a link_width: report this node's own link_width (0 unless this is a pci node).
+    *linkWidth = myLinkWidth;
+    INFO(NCCL_GRAPH, "Found child net device for %s. Returning link_width=%d totalChildLinkWidth=%d", node->name, *linkWidth, totalChildLinkWidth);
+  } else {
+    // Standard recursive accrual: the sum of the children's widths, capped by this node's own link_width when it has one (> 0).
+    *linkWidth = myLinkWidth > 0 ? std::min(myLinkWidth, totalChildLinkWidth) : totalChildLinkWidth;
+    INFO(NCCL_GRAPH, "Found child net device for %s. Returning link_width=%d totalChildLinkWidth=%d", node->name, *linkWidth, totalChildLinkWidth);
+  }
+
+  return ncclSuccess;
+}
+
+// Sums into *linkWidth the widths returned by ncclTopoFindLinkWidthRec (a DFS) for each direct child of `parent`.
+// Children whose subtree contains none of physNetNodes[0..ndevs) are excluded from the sum.
+ncclResult_t ncclTopoFindLinkWidth(ncclXmlNode* parent, ncclXmlNode** physNetNodes, int ndevs, int* linkWidth) {
+  *linkWidth = 0;
+  for (int i = 0; i < parent->nSubs; i++) {
+    ncclXmlNode* child = parent->subs[i];
+    int foundPhysNet = 0;
+    int childLinkWidth = 0;
+    NCCLCHECK(ncclTopoFindLinkWidthRec(child, physNetNodes, ndevs, &foundPhysNet, &childLinkWidth));
+    if (foundPhysNet) {
+      *linkWidth += childLinkWidth;
+    }
+  }
+
+  return ncclSuccess;
+}
+// Writes the summed width from ncclTopoFindLinkWidth as "link_width" on every pci node from each physNetNodes[i] up to `parent`, and on `parent` itself when it is a pci node.
+ncclResult_t ncclTopoWidenLinks(ncclXmlNode** physNetNodes, int ndevs, ncclXmlNode* parent) {
+  int sumLinkWidth = 0;
+  NCCLCHECK(ncclTopoFindLinkWidth(parent, physNetNodes, ndevs, &sumLinkWidth));
+  for (int i = 0; i < ndevs; i++) {
+    ncclXmlNode* temp = physNetNodes[i];
+    while (temp != parent) { // walk up the XML tree; assumes `parent` is an ancestor of every physNetNodes[i]
+      if (strcmp(temp->name, "pci") == 0) {
+        NCCLCHECK(xmlSetAttrInt(temp, "link_width", sumLinkWidth));
+        TRACE(NCCL_GRAPH, "Set link_width to %d for node %s", sumLinkWidth, temp->name);
+      }
+      temp = temp->parent;
+    }
+  }
+
+  if (strcmp(parent->name, "pci") == 0) {
+    NCCLCHECK(xmlSetAttrInt(parent, "link_width", sumLinkWidth));
+    TRACE(NCCL_GRAPH, "Set link_width to %d for node %s", sumLinkWidth, parent->name);
+  }
+
+  return ncclSuccess;
+}
+
 ncclResult_t ncclTopoGetVNicParent(struct ncclXml* xml, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclNetVDeviceProps_t* vProps, ncclXmlNode** parent) {
  ncclNetProperties_t props[NCCL_NET_MAX_DEVS_PER_NIC];
  ncclXmlNode* physNetNodes[NCCL_NET_MAX_DEVS_PER_NIC];
@@ -1257,54 +1346,50 @@ ncclResult_t ncclTopoGetVNicParent(struct ncclXml* xml, ncclResult_t (*getProper

  int path = PATH_LOC;
  NCCLCHECK(ncclTopoGetPath(physNetNodes, vProps->ndevs, &path, parent));
-  if (path == PATH_LOC) {
-    *parent = NULL;
-  } else if (parent && strcmp((*parent)->name, "pci") == 0) {
-    // Compare PCI class here to avoid NCCL WARN when the "class" attribute doesn't exist
-    const char* c;
-    NCCLCHECK(xmlGetAttrStr(*parent, "class", &c));
-    if (strcmp(c, PCI_BRIDGE_DEVICE_CLASS) == 0) {
+  if (path == PATH_PHB || path == PATH_PXB || path == PATH_PIX) {
+    INFO(NCCL_GRAPH, "Widening links");
+    NCCLCHECK(ncclTopoWidenLinks(physNetNodes, vProps->ndevs, *parent));
+  }
+
+  if (*parent) {
+    if (strcmp((*parent)->name, "pci") == 0) {
+      // Compare PCI class here to avoid NCCL WARN when the "class" attribute doesn't exist
+      const char* c;
+      NCCLCHECK(xmlGetAttrStr(*parent, "class", &c));
+      if (c && strcmp(c, PCI_BRIDGE_DEVICE_CLASS) == 0) {
+        // If the common parent is a PCI switch, we must reparent the new NIC under a made up pci device with a unique busid
+        NCCLCHECK(ncclTopoMakePciParent(xml, parent, physNetNodes[0]));
+      }
+    } else if (strcmp((*parent)->name, "cpu") == 0) {
       // If the common parent is a PCI switch, we must reparent the new NIC under a made up pci device with a unique busid
       NCCLCHECK(ncclTopoMakePciParent(xml, parent, physNetNodes[0]));
     }
   }
+  if (*parent == NULL) return ncclSuccess; // No common parent: skip the TRACE below, which dereferences *parent
  TRACE(NCCL_GRAPH, "Selected parent %s with path %d", (*parent)->name, path);
  return ncclSuccess;
 }

-ncclResult_t ncclTopoMakeVNics(struct ncclXml* xml, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*getProperties)(int, ncclNetProperties_t*), int physicalDevs) {
+ncclResult_t ncclTopoMakeVNics(struct ncclXml* xml, struct ncclTopoNetInfo* netInfo, int physicalDevs) {
  int* placedDevs = NULL;
  struct ncclXmlNode** physNetNodes = NULL;
+  ncclNetProperties_t* props = NULL;
+  ncclResult_t res = ncclSuccess;
  if (physicalDevs == 0) return ncclSuccess;

-  ncclCalloc(&physNetNodes, physicalDevs);
-  ncclResult_t res = ncclSuccess;
-
-  ncclNetProperties_t* props = NULL;
-  ncclCalloc(&props, physicalDevs);
+  NCCLCHECKGOTO(ncclCalloc(&physNetNodes, physicalDevs), res, out); // all three pointers start as NULL, so cleanup at "out" is safe
+  NCCLCHECKGOTO(ncclCalloc(&placedDevs, physicalDevs), res, out);   // on failure, leave through "out" instead of leaking earlier buffers
+  NCCLCHECKGOTO(ncclCalloc(&props, physicalDevs), res, out);
  for (int i = 0; i < physicalDevs; i++) {
-    NCCLCHECKGOTO(getProperties(i, props + i), res, out);
+    NCCLCHECKGOTO(netInfo->getProperties(i, props + i), res, out);
    struct ncclXmlNode* physNetNode;
    NCCLCHECKGOTO(xmlFindTagKv(xml, "net", &physNetNode, "name", props[i].name), res, out);
    physNetNodes[i] = physNetNode;
    TRACE(NCCL_GRAPH, "Found physical ncclNet node %d %s", i,  props[i].name);
  }

-  // By default, don't merge any devices
-  int mergeLevel;
-  mergeLevel = PATH_PORT;
-  { // Avoids warnings related to jumping to "out"
-    const char* mergeLevelEnv = ncclGetEnv("NCCL_NET_MERGE_LEVEL");
-    if (mergeLevelEnv) kvConvertToInt(mergeLevelEnv, &mergeLevel, nicPathKvList);
-    char* forceMerge = (char*) ncclGetEnv("NCCL_NET_FORCE_MERGE");
-    NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs));
-    memset(placedDevs, 0, sizeof(int)*physicalDevs);
-
-    if (forceMerge) {
-      NCCLCHECKGOTO(ncclTopoForceMerge(xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out);
-    }
-  }
-  NCCLCHECKGOTO(ncclTopoAutoMerge(xml, mergeLevel, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out);
+  if (netInfo->forceMerge) NCCLCHECKGOTO(ncclTopoForceMerge(xml, netInfo, placedDevs, props, physNetNodes, physicalDevs), res, out);
+  NCCLCHECKGOTO(ncclTopoAutoMerge(xml, netInfo, placedDevs, props, physNetNodes, physicalDevs), res, out);

 out:
  free(physNetNodes);
@@ -1313,10 +1398,10 @@ out:
  return res;
 }

-static ncclResult_t ncclTopoPopulateNics(ncclXml* xml, int startIndex, int endIndex, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), const char* netName, int coll, int virtualNics, bool dmaBufSupport) {
+static ncclResult_t ncclTopoPopulateNics(ncclXml* xml, int startIndex, int endIndex, struct ncclTopoNetInfo* netInfo, int virtualNics) {
  for (int n = startIndex; n < endIndex; n++) {
    ncclNetProperties_t props;
-    NCCLCHECK(getProperties(n, &props));
+    NCCLCHECK(netInfo->getProperties(n, &props));
    struct ncclXmlNode* netNode = NULL;
    struct ncclXmlNode* parent = NULL;
    if (virtualNics) {
@@ -1324,7 +1409,7 @@ static ncclResult_t ncclTopoPopulateNics(ncclXml* xml, int startIndex, int endIn
      NCCLCHECK(xmlFindTagKv(xml, "net", &net, "name", props.name));
      // In the event of multithreaded use case, we need to re-discover the shared parent of the given devices for this vNIC
      // Only run this if the net doesn't exist locally - this may alter the XML state
-      if (net == NULL) NCCLCHECK(ncclTopoGetVNicParent(xml, getProperties, &props.vProps, &parent));
+      if (net == NULL) NCCLCHECK(ncclTopoGetVNicParent(xml, netInfo->getProperties, &props.vProps, &parent));
    }

    NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode, parent));
@@ -1335,18 +1420,18 @@ static ncclResult_t ncclTopoPopulateNics(ncclXml* xml, int startIndex, int endIn
    NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
    int dev;
    xmlGetAttrIntDefault(netNode, "dev", &dev, -1);
-    if (dev != -1 && dev != n) INFO(NCCL_GRAPH, "TOPO/NET : Changing %s dev index from %d to %d", netName, dev, n);
+    if (dev != -1 && dev != n) INFO(NCCL_GRAPH, "TOPO/NET : Changing %s dev index from %d to %d", netInfo->name, dev, n);
    NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
    NCCLCHECK(xmlInitAttrInt(netNode, "latency", props.latency));
    NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
    NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
    NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
    NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
-    bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
-    INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", netName, gdrSupport ? "Enabled" : "Disabled", n, props.name);
+    bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (netInfo->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
+    INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", netInfo->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
    NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
    // Only set coll if it's not 0
-    if (coll) NCCLCHECK(xmlInitAttrInt(netNode, "coll", coll));
+    if (netInfo->coll) NCCLCHECK(xmlInitAttrInt(netNode, "coll", netInfo->coll));

    const char* keepAttr;
    NCCLCHECK(xmlGetAttr(netNode, "coll", &colAttr));
@@ -1359,51 +1444,45 @@ static ncclResult_t ncclTopoPopulateNics(ncclXml* xml, int startIndex, int endIn
 }

 // Calls to network plugin APIs should be protected. This function should be called inside a per-process lock.
-ncclResult_t ncclTopoProcessNet(ncclXml* xml, int coll, const char* dumpXmlFile, ncclTopoNetState* state, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*devices)(int*), const char* netName, bool dmaBufSupport) {
-  int usePhysicalDevices = (dumpXmlFile || makeVDevice == NULL);
-  if (state->nPhysicalNics == -1) NCCLCHECK(devices(&state->nPhysicalNics));
-  // Enumerate physical devices
-  NCCLCHECK(ncclTopoPopulateNics(xml, 0, state->nPhysicalNics, getProperties, netName, coll, false, dmaBufSupport));
+ncclResult_t ncclTopoProcessNet(ncclXml* xml, const char* dumpXmlFile, struct ncclTopoNetInfo* net) {
+  bool usePhysicalDevices = (dumpXmlFile || net->makeVDevice == NULL); // no virtual NICs when dumping the XML or when the plugin has no makeVDevice
+  int nPhysicalNics, nVirtualNics;
+  NCCLCHECK(net->getDevCount(net->netPluginIndex, &nPhysicalNics, &nVirtualNics)); // NOTE(review): presumably reports NCCL_UNDEF_DEV_COUNT for nVirtualNics until setVirtDevCount was called - confirm
+  // List the physical devices in the topo
+  NCCLCHECK(ncclTopoPopulateNics(xml, 0, nPhysicalNics, net, /*virtual=*/false));
   if (!usePhysicalDevices) {
-    if (state->nVirtualNics == -1) {
-      NCCLCHECK(ncclTopoMakeVNics(xml, makeVDevice, getProperties, state->nPhysicalNics));
+    // Virtual devices are only created once per network
+    if (nVirtualNics == NCCL_UNDEF_DEV_COUNT) {
+      NCCLCHECK(ncclTopoMakeVNics(xml, net, nPhysicalNics));
+      // Update the number of virtual devices both locally and in the state tracking the plugin.
+      // Note: 0 is a valid number of virtual devices
       int nDevs;
-      NCCLCHECK(devices(&nDevs));
-      state->nVirtualNics = nDevs - state->nPhysicalNics;
+      NCCLCHECK(net->devices(&nDevs));
+      nVirtualNics = nDevs - nPhysicalNics;
+      NCCLCHECK(net->setVirtDevCount(net->netPluginIndex, nVirtualNics));
     }
-    if (state->nVirtualNics > 0) {
-      // Populate new devices
-      NCCLCHECK(ncclTopoPopulateNics(xml, state->nPhysicalNics, state->nPhysicalNics+state->nVirtualNics, getProperties, netName, coll, true, dmaBufSupport));
+    // populate the virtual devices if any
+    if (nVirtualNics > 0) {
+      NCCLCHECK(ncclTopoPopulateNics(xml, nPhysicalNics, nPhysicalNics + nVirtualNics, net, /*virtual=*/true));
     }
   }

  return ncclSuccess;
 }

-static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER;
-ncclTopoNetState netStates[NCCL_NET_MAX_PLUGINS] = {};
-ncclTopoNetState collNetStates[NCCL_NET_MAX_PLUGINS] = {};
-ncclResult_t ncclTopoGetSharedState(ncclTopoNetState** state, const char* name, ncclTopoNetState* states) {
-  INFO(NCCL_GRAPH, "Retrieving state for %s", name);
-  for (int i = 0; i < NCCL_NET_MAX_PLUGINS; i++) {
-    // Empty slot
-    if (states[i].name == NULL) {
-      states[i].nVirtualNics = -1;
-      states[i].nPhysicalNics = -1;
-      states[i].name = strdup(name);
-      *state = states + i;
-      INFO(NCCL_GRAPH, "Initialized state %d for %s", i, name);
-      return ncclSuccess;
-    // Found my slot
-    } else if (strcmp(states[i].name, name) == 0) {
-      *state = states + i;
-      return ncclSuccess;
-    }
+ncclResult_t ncclTopoGetFusionEnv(int* mergeLevel, const char** forceMerge) { // reads the NIC fusion settings from the environment
+  if (forceMerge) *forceMerge = ncclGetEnv("NCCL_NET_FORCE_MERGE"); // optional output; callers treat a NULL value as "no forced merge"
+  const char* mergeLevelEnv = ncclGetEnv("NCCL_NET_MERGE_LEVEL");
+  if (mergeLevelEnv) {
+    kvConvertToInt(mergeLevelEnv, mergeLevel, nicPathKvList); // translate the level name through nicPathKvList
+  } else {
+    *mergeLevel = PATH_PORT; // By default, don't merge any devices
   }
-  WARN("NET/TOPO : Couldn't find net with name %s", name);
-  return ncclInternalError;
+  return ncclSuccess;
 }

+static std::mutex netMutex;
+
 ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system, const char* dumpXmlFile) {
  ncclResult_t ret = ncclSuccess;
  struct ncclXml* xml;
@@ -1411,7 +1490,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
  int* localRanks = NULL;
  struct ncclXml* rankXml;
  int localRank = -1, nLocalRanks = 0;
-  int netLockHeld = 0;
+  struct ncclTopoNetInfo netInfo = {0};
  NCCLCHECK(xmlAlloc(&xml, NCCL_TOPO_XML_MAX_NODES));
  const char* xmlTopoFile = ncclGetEnv("NCCL_TOPO_FILE");
  if (xmlTopoFile) {
@@ -1451,21 +1530,35 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy

  // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
  // so we start with collnet so that it has precedence.
-  pthread_mutex_lock(&netLock);
-  netLockHeld = 1;
-  INFO(NCCL_GRAPH, "TOPO/NET : Importing network plugins to topology");
-  ncclTopoNetState* state;
-  state = NULL;
-  if (collNetSupport(comm)) {
-    NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclCollNet->name, collNetStates), ret, fail);
-    NCCLCHECKGOTO(ncclTopoProcessNet(xml, 1, dumpXmlFile, state,
-      comm->ncclCollNet->getProperties, comm->ncclCollNet->makeVDevice, comm->ncclCollNet->devices, comm->ncclCollNet->name, comm->dmaBufSupport), ret, fail);
+  {
+      std::lock_guard<std::mutex> lock(netMutex); // guards the network plugin calls below; released when this scope ends, including via goto fail
+      INFO(NCCL_GRAPH, "TOPO/NET : Importing network plugins to topology");
+      if (collNetSupport(comm)) {
+        netInfo.coll = 1;
+        netInfo.netPluginIndex = comm->netPluginIndex;
+        netInfo.dmaBufSupport = comm->dmaBufSupport;
+        netInfo.getDevCount = ncclCollNetGetDevCount;
+        netInfo.setVirtDevCount = ncclCollNetSetVirtDevCount;
+        netInfo.name = comm->ncclCollNet->name;
+        netInfo.getProperties = comm->ncclCollNet->getProperties;
+        netInfo.makeVDevice = comm->ncclCollNet->makeVDevice;
+        netInfo.devices = comm->ncclCollNet->devices;
+        NCCLCHECKGOTO(ncclTopoGetFusionEnv(&netInfo.mergeLevel, &netInfo.forceMerge), ret, fail); // leave through fail/exit so xml is freed
+        NCCLCHECKGOTO(ncclTopoProcessNet(xml, dumpXmlFile, &netInfo), ret, fail);
+      }
+
+      netInfo.coll = 0;
+      netInfo.netPluginIndex = comm->netPluginIndex;
+      netInfo.dmaBufSupport = comm->dmaBufSupport;
+      netInfo.getDevCount = ncclNetGetDevCount;
+      netInfo.setVirtDevCount = ncclNetSetVirtDevCount;
+      netInfo.name = comm->ncclNet->name;
+      netInfo.getProperties = comm->ncclNet->getProperties;
+      netInfo.makeVDevice = comm->ncclNet->makeVDevice;
+      netInfo.devices = comm->ncclNet->devices;
+      NCCLCHECKGOTO(ncclTopoGetFusionEnv(&netInfo.mergeLevel, &netInfo.forceMerge), ret, fail); // leave through fail/exit so xml is freed
+      NCCLCHECKGOTO(ncclTopoProcessNet(xml, dumpXmlFile, &netInfo), ret, fail);
  }
-  NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclNet->name, netStates), ret, fail);
-  NCCLCHECKGOTO(ncclTopoProcessNet(xml, 0, dumpXmlFile, state,
-    comm->ncclNet->getProperties, comm->ncclNet->makeVDevice, comm->ncclNet->devices, comm->ncclNet->name, comm->dmaBufSupport), ret, fail);
-  pthread_mutex_unlock(&netLock);
-  netLockHeld = 0;

  // Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
  NCCLCHECKGOTO(ncclTopoTrimXml(xml), ret, fail);
@@ -1523,7 +1616,6 @@ exit:
  free(xml);
  return ret;
 fail:
-  if (netLockHeld) pthread_mutex_unlock(&netLock);
  goto exit;
 }

@@ -1578,6 +1670,38 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c
  return ncclSuccess;
 }

+enum netDevsPolicy {
+  NETDEVS_POLICY_AUTO = 0x0,  // DIVUP(localNetCount, localGpuCount) nets per GPU (see ncclTopoGetLocalNet)
+  NETDEVS_POLICY_ALL = 0x1,   // every local net
+  NETDEVS_POLICY_MAX = 0x2,   // at most netDevsPolicyNum nets, capped at the local net count
+  NETDEVS_POLICY_UNDEF = 0xffffffff  // NCCL_NETDEVS_POLICY not parsed yet
+};
+
+static enum netDevsPolicy netDevsPolicy = NETDEVS_POLICY_UNDEF;
+static int netDevsPolicyNum = -1;  // <n> from "MAX:<n>"; only set together with NETDEVS_POLICY_MAX
+// Parses NCCL_NETDEVS_POLICY ("AUTO", "ALL" or "MAX:<n>", case-insensitive) into the two statics above; unset or unrecognized values select AUTO. Run through pthread_once by ncclTopoGetLocalNet.
+static void getNetDevsPolicyOnce() {
+  const char* envStr = ncclGetEnv("NCCL_NETDEVS_POLICY");
+  if (envStr) {
+    if (strcasecmp(envStr, "AUTO") == 0) {
+      netDevsPolicy = NETDEVS_POLICY_AUTO;
+    } else if (strcasecmp(envStr, "ALL") == 0) {
+      netDevsPolicy = NETDEVS_POLICY_ALL;
+    } else if (strncasecmp(envStr, "MAX:", strlen("MAX:")) == 0) {
+      int envNum = atoi(envStr + strlen("MAX:"));
+      if (envNum > 0) {  // a count that atoi parses as <= 0 leaves the policy unrecognized
+        netDevsPolicy = NETDEVS_POLICY_MAX;
+        netDevsPolicyNum = envNum;
+      }
+    }
+    if (netDevsPolicy == NETDEVS_POLICY_UNDEF)
+      INFO(NCCL_ENV, "Unable to recognize NCCL_NETDEVS_POLICY=%s, using NCCL_NETDEVS_POLICY_AUTO instead.", envStr);
+    else
+      INFO(NCCL_ENV, "NCCL_NETDEVS_POLICY set by environment to %s", envStr);
+  }
+  if (netDevsPolicy == NETDEVS_POLICY_UNDEF) netDevsPolicy = NETDEVS_POLICY_AUTO;  // default when the variable is unset or unrecognized
+}
+
 ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) {
  int gpu;
  NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu, /*showWarn=*/true));
@@ -1592,13 +1716,30 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch
    return ncclInternalError;
  }

-  int localGpus[NCCL_TOPO_MAX_NODES];
-  int localGpuCount;
-  NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, localGpus, &localGpuCount, NULL));
+  static pthread_once_t once = PTHREAD_ONCE_INIT;
+  pthread_once(&once,getNetDevsPolicyOnce);
+  int netsPerGpu = 0;
+  if (netDevsPolicy == NETDEVS_POLICY_AUTO) {
+    int localGpus[NCCL_TOPO_MAX_NODES];
+    int localGpuCount;
+    NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, localGpus, &localGpuCount, NULL));
+    netsPerGpu = DIVUP(localNetCount, localGpuCount);
+  } else if (netDevsPolicy == NETDEVS_POLICY_ALL) {
+    netsPerGpu = localNetCount;
+  } else if (netDevsPolicy == NETDEVS_POLICY_MAX) {
+    if (netDevsPolicyNum <= 0) {
+      WARN("Invalid number of network devices = %d for policy MAX", netDevsPolicyNum);
+      return ncclInternalError;
+    }
+    netsPerGpu = std::min(netDevsPolicyNum, localNetCount);
+  } else {
+    WARN("Unknown netDevs policy");
+    return ncclInternalError;
+  }

  int net = system->nodes[GPU].nodes[gpu].gpu.dev;
  if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount);
-  net += channelId%(DIVUP(localNetCount,localGpuCount));
+  net += channelId%(netsPerGpu);
  if (id) *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id;
  if (dev) *dev = system->nodes[NET].nodes[localNets[net%localNetCount]].net.dev;
  return ncclSuccess;
@@ -1656,25 +1797,10 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
  cpu_set_t mask;
  SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");

-#ifdef ENABLE_TRACE
-  {
-    char affinityStr[sizeof(cpu_set_t)*2];
-    TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", gpu->gpu.dev,
-          ncclCpusetToRangeStr(&mask, affinityStr, sizeof(affinityStr)));
-  }
-#endif
-
  // Get the affinity of the CPU close to our GPU.
  cpu_set_t cpuMask = cpu->cpu.affinity;

-#ifdef ENABLE_TRACE
-  {
-    char affinityStr[sizeof(cpu_set_t)*2];
-    TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", gpu->gpu.dev,
-          ncclCpusetToRangeStr(&cpuMask, affinityStr, sizeof(affinityStr)));
-  }
-#endif
-
+  // Get the final affinity
  cpu_set_t finalMask;
  if (ncclParamIgnoreCpuAffinity())
    // Ignore the CPU affinity set and use the GPU one instead
@@ -1685,12 +1811,22 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu

  memcpy(affinity, &finalMask, sizeof(cpu_set_t));

-  // If there is a non empty set, use it to set affinity
+  // display the final affinity
+  char msg[1024] = "";
+  snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), "Affinity for GPU %d is ", gpu->gpu.dev);
  if (CPU_COUNT(&finalMask)) {
-    char affinityStr[sizeof(cpu_set_t)*2];
-    INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", gpu->gpu.dev,
-         ncclCpusetToRangeStr(&finalMask, affinityStr, sizeof(affinityStr)));
+    (void)ncclCpusetToRangeStr(&finalMask, msg + strlen(msg), sizeof(msg) - strlen(msg));
+  } else {
+    snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), "empty, ignoring");
  }
+  snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), ". (GPU affinity = ");
+  (void)ncclCpusetToRangeStr(&cpuMask, msg + strlen(msg), sizeof(msg) - strlen(msg));
+  if (!ncclParamIgnoreCpuAffinity()) {
+    snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), " ; CPU affinity = ");
+    (void)ncclCpusetToRangeStr(&mask, msg + strlen(msg), sizeof(msg) - strlen(msg));
+  }
+  snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), ").");
+  INFO(NCCL_INIT, "%s: %s", __func__, msg);
  return ncclSuccess;
 }

@@ -229,12 +229,26 @@ ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int*
 ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max);
 ncclResult_t ncclTopoSplitNvLink(struct ncclTopoSystem* system, int* splitNvLink);

-struct ncclTopoNetState {
-  int nVirtualNics;
-  int nPhysicalNics;
+struct ncclTopoNetInfo { // Argument bundle for ncclTopoProcessNet(); carries what used to be separate coll/name/callback/dmaBufSupport parameters.
+  bool coll; // NOTE(review): presumably true for the collNet plugin, false for the regular net plugin - confirm
+  // communicator-specific information
+  int netPluginIndex;
+  bool dmaBufSupport;
+  // NIC fusion
+  int mergeLevel; // NOTE(review): same pair that ncclTopoGetFusionEnv() outputs; presumably filled from it - confirm
+  const char* forceMerge;
+  // dev count tracking functions (not part of ncclNet)
+  ncclResult_t (*getDevCount)(int, int*, int*);
+  ncclResult_t (*setVirtDevCount)(int, int);
+  // ncclNet API functions
  const char* name;
+  ncclResult_t (*getProperties)(int, ncclNetProperties_t*);
+  ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*);
+  ncclResult_t (*devices)(int*);
 };
-ncclResult_t ncclTopoProcessNet(ncclXml* xml, int coll, const char* dumpXmlFile, ncclTopoNetState* state, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*devices)(int*), const char* netName, bool dmaBufSupport);
+
+ncclResult_t ncclTopoProcessNet(ncclXml* xml, const char* dumpXmlFile, struct ncclTopoNetInfo* net);
+ncclResult_t ncclTopoGetFusionEnv(int* mergeLevel, const char** forceMerge);

 #define NCCL_TOPO_XML_MAX_NODES 8192
 #define NCCL_GRAPH_XML_MAX_NODES 8192
@@ -279,6 +293,8 @@ static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, in
  return ncclInternalError;
 }

+extern struct kvDict nicPathKvList[];
+
 static ncclResult_t ncclTopoIdToNetDev(struct ncclTopoSystem* system, int64_t id, int* netDev) {
  *netDev = -1;
  for (int i=0; i<system->nodes[NET].count; i++) {
@@ -10,6 +10,7 @@
 #include "device.h"
 #include "comm.h"
 #include "topo.h"
+#include "nccl_tuner.h"

 NCCL_PARAM(Nthreads, "NTHREADS", -2);
 NCCL_PARAM(Ll128Nthreads, "LL128_NTHREADS", -2);
@@ -484,40 +485,73 @@ static struct tuningModel rcclTuningModel[] = {
  tuning_model_7,
 };

-/* Array indexes used below */
-#define VOLTA_COMPCAP_IDX 0
-#define AMPERE_COMPCAP_IDX 1
-#define HOPPER_COMPCAP_IDX 2
-#define BLACKWELL_COMPCAP_IDX 3
-
-#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
-// LL128 max BW per channel
-static const double llMaxBws[][3] = {
-  /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4},
-  /* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0},
-  /* Hopper-N1/AMD-N2/AMD-N4) */ {141.0, 45.0 /*avg of ring & tree*/, 35.0},
-  /* Blackwell-N1/AMD-N2/AMD-N4) */ {2*141.0, 2*45.0 /*avg of ring & tree*/, 2*35.0},
+// NVLS efficiency factor, indexed by compute-capability index; ncclTopoTuneModel scales graphs[a]->bwIntra by it for NVLS and NVLS_TREE.
+static const float nvlsEfficiency[NCCL_NUM_COMPCAPS] = {
+  0.0f, // Volta
+  0.0f, // Ampere
+  0.85f, // Hopper
+  0.74f, // Blackwell
 };

-static const double perChMaxRingLL128Bws[][3] = {
-  /* Volta (N1/N2/N4) */  {20.0, 20.0, 20.0},
-  /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0},
-  /* Hopper (N1/N2/N4) */ {36.7, 36.7, 36.7},
-  /* Blackwell (N1/N2/N4) */ {2*36.7, 2*36.7, 2*36.7},
+// Default tuner constants; ncclTopoInitTunerConstants() copies this table into comm->tunerConstants.
+static const ncclTunerConstants_t ncclTunerConstantsDefaults = {
+  .baseLatencies = { // one row per algorithm (named on the right); columns presumably LL/LL128/Simple like hwLatencies - confirm
+    {  6.8, 14.0,  8.4 }, {  6.6, 14.0,  8.4 },  // Tree, Ring
+    {    0,    0,    0 }, {    0,    0,    0 },  // Collnet Direct, Chain
+    {    0,    0,    0 }, {    0,    0,    0 },  // NVLS, NVLS Tree
+    {  8.0,  8.0,  8.0 }                         // PAT
+    },
+  .hwLatencies = { // indexed [hardware link][algorithm][protocol]
+  /* NVLINK */
+  { { .6, 1.25, 4.0 }, { .6, 1.9, 3.4 }, /* Tree (LL/LL128/Simple), Ring (LL/LL128/Simple)*/
+    {  0,    0, 3.7 }, {  0,   0,  2.8 }, /* CollNetDirect (LL/LL128/Simple), CollNetChain (LL/LL128/Simple)*/
+    {  0,    0,  25 }, {  0,   0,  25 }, /* NVLS (LL/LL128/Simple), NVLSTree (LL/LL128/Simple)*/
+    {  0,    0, 4.0 } /* PAT (LL/LL128/Simple)*/
+    },
+  /* PCI */
+  { { 1.0, 1.9, 4.0 }, { 1.0, 2.5, 5.7 }, /* Tree (LL/LL128/Simple), Ring (LL/LL128/Simple)*/
+    {  0,    0, 3.7 }, {  0,   0,  2.8 }, /* CollNetDirect (LL/LL128/Simple), CollNetChain (LL/LL128/Simple)*/
+    {  0,    0,   0 }, {  0,   0,    0 }, /* NVLS (LL/LL128/Simple), NVLSTree (LL/LL128/Simple)*/
+    {  0,    0, 4.0 } /* PAT (LL/LL128/Simple)*/
+    },
+  /* NET */
+  { { 5.0, 8.5, 14 }, { 2.7, 4.0, 14.0 }, /* Tree (LL/LL128/Simple), Ring (LL/LL128/Simple)*/
+    {   0,   0, 31 }, {   0,   0,   30 }, /* CollNetDirect (LL/LL128/Simple), CollNetChain (LL/LL128/Simple)*/
+    {   0,   0, 18 }, {   0,   0,   14 }, /* NVLS (LL/LL128/Simple), NVLSTree (LL/LL128/Simple)*/
+    {   0,   0, 14 } /* PAT (LL/LL128/Simple)*/
+    },
+  },
+  .llMaxBws = { // row: compute-capability index on one node, otherwise 0 (non-AMD CPU) or 1 (AMD/mixed CPU); column: 1 node, 2 nodes, more
+     {39.0, 39.0, 20.4}, /* Volta-N1/Intel-N2/Intel-N4 */
+     {87.7, 22.5 /*avg of ring & tree*/, 19.0}, /* Ampere-N1/AMD-N2/AMD-N4 */
+     {141.0, 45.0 /*avg of ring & tree*/, 35.0}, /* Hopper-N1/AMD-N2/AMD-N4 */
+     {2*141.0, 2*45.0 /*avg of ring & tree*/, 2*35.0}, /* Blackwell-N1/AMD-N2/AMD-N4 */
+  },
+  .perChMaxRingLL128Bws = { // this and the tables below: row = compute-capability index, column = 1 node, 2 nodes, more
+    {20.0, 20.0, 20.0}, /* Volta (N1/N2/N4) */
+    {20.0, 20.0, 20.0}, /* Ampere (N1/N2/N4) */
+    {36.7, 36.7, 36.7}, /* Hopper (N1/N2/N4) */
+    {2*36.7, 2*36.7, 2*36.7}, /* Blackwell (N1/N2/N4) */
+  },
+  .perChMaxTreeLL128Bws = {
+    {20.0, 20.0, 20.0}, /* Volta (N1/N2/N4) */
+    {20.0, 20.0, 20.0}, /* Ampere (N1/N2/N4) */
+    {36.7, 36.7, 29.0}, /* Hopper (N1/N2/N4) */
+    {55.6, 31.67, 20.0}, /* Blackwell (N1/N2/N4) */
+  },
+  .perChMaxTreeBws = {
+    {26.5, 18.5, 10.0}, /* Volta (N1/N2/N4) */
+    {24.0, 23.6, 17.8}, /* Ampere (N1/N2/N4) */
+    {38.7, 41.4, 36.0}, /* Hopper (N1/N2/N4) */
+    {70.0, 42.8, 24.0}, /* Blackwell (N1/N2/N4) */
+  },
+  .perChMaxNVLSTreeBws = { // used by ncclTopoTuneModel as an extra cap on NVLS_TREE bandwidth
+    {26.5, 18.5, 10.0}, /* Volta (N1/N2/N4) */
+    {24.0, 23.6, 17.8}, /* Ampere (N1/N2/N4) */
+    {0.0, 57.7, 45.5}, /* Hopper (N1/N2/N4) */
+    {0.0, 96.0, 43.1} /* Blackwell (N1/N2/N4) */
+  }
 };
-static const double perChMaxTreeLL128Bws[][3] = {
-  /* Volta (N1/N2/N4) */  {20.0, 20.0, 20.0},
-  /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0},
-  /* Hopper (N1/N2/N4) */ {36.7, 36.7, 29.0},
-  /* Blackwell (N1/N2/N4) */ {2*36.7, 2*36.7, 2*29.0},
-};
-static const double perChMaxTreeBws[][3] = {
-  /* Volta (N1/N2/N4) */  {26.5, 18.5, 10.0},
-  /* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8},
-  /* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0},
-  /* Blackwell (N1/N2/N4) */ {2*38.7, 2*41.4, 2*36.0},
-};
-#endif

 NCCL_PARAM(PatEnable, "PAT_ENABLE", 0);
 static int ncclPatEnable(struct ncclComm* comm) {
@@ -542,6 +576,13 @@ static float getNetOverhead(struct ncclComm* comm) {

 NCCL_PARAM(Ll128C2c, "LL128_C2C", 1);

+ncclResult_t ncclTopoInitTunerConstants(struct ncclComm* comm) { // Seeds comm->tunerConstants with the built-in defaults; always returns ncclSuccess.
+
+  comm->tunerConstants = ncclTunerConstantsDefaults; // by-value copy, so later per-communicator adjustments (e.g. in ncclTopoTuneModel) leave the defaults intact
+
+  return ncclSuccess;
+}
+
 ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs) {
 #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
  static int rcclMaxThreads[NCCL_NUM_PROTOCOLS] = {0};
@@ -576,18 +617,19 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
  int nRanks = comm->nRanks;
  if (nRanks <= 1) return ncclSuccess;
 #if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
-  int compCapIndex = minCompCap >= 100 ? BLACKWELL_COMPCAP_IDX : (minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX);
+  int compCapIndex = minCompCap >= 100 ? NCCL_BLACKWELL_COMPCAP_IDX : (minCompCap >= 90 ? NCCL_HOPPER_COMPCAP_IDX : minCompCap >= 80 ? NCCL_AMPERE_COMPCAP_IDX : NCCL_VOLTA_COMPCAP_IDX);
  int index2 = nNodes <= 2 ? nNodes-1 : 2;
  // LL: for single node, we look at GPU type; for multi-node, we look at CPU type
  int index1 = nNodes == 1 ? compCapIndex :
               (comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD || comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_MIXED) ? 1 : 0;
-  double llMaxBw = llMaxBws[index1][index2];
-  double perChMaxTreeBw = perChMaxTreeBws[compCapIndex][index2];
-  double perChMaxRingLL128Bw = perChMaxRingLL128Bws[compCapIndex][index2];
-  double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2];
-#endif
+  double llMaxBw = comm->tunerConstants.llMaxBws[index1][index2];
+  double perChMaxTreeBw = comm->tunerConstants.perChMaxTreeBws[compCapIndex][index2];
+  double perChMaxRingLL128Bw = comm->tunerConstants.perChMaxRingLL128Bws[compCapIndex][index2];
+  double perChMaxTreeLL128Bw = comm->tunerConstants.perChMaxTreeLL128Bws[compCapIndex][index2];
+  double perChMaxNVLSTreeBw = comm->tunerConstants.perChMaxNVLSTreeBws[compCapIndex][index2];
  // De-penalize Tree/Simple latency on Power systems to favor Tree than Ring
-  //if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
+  if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_POWER) comm->tunerConstants.hwLatencies[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->tunerConstants.hwLatencies[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
+#endif
  float ppn = (float)nRanks / nNodes;

  int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
@@ -621,18 +663,25 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
            && a == NCCL_ALGO_PAT && (p != NCCL_PROTO_SIMPLE || ncclPatEnable(comm) == 0)) continue;
        int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
        float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
-        float busBw = comm->topo->baseBw != 0.0 ? comm->topo->baseBw : graphs[a]->nChannels * bw;
-        //INFO(NCCL_INIT, "algo %s proto %s busBw %f baseBw %f bw %f nChannels %d bwIntra %f bwInter %f", ncclAlgoStr[a], ncclProtoStr[p], busBw, comm->topo->baseBw, bw, graphs[a]->nChannels, graphs[a]->bwIntra, graphs[a]->bwInter);
-
-        if (a == NCCL_ALGO_NVLS) {
+#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
+        if (a == NCCL_ALGO_NVLS_TREE || a == NCCL_ALGO_NVLS)
+        {
+          // NVLS/NVLStree needs at least 2 channels
+          if (graphs[a]->nChannels < 2 ) continue;
+          // Convert to NVLS busBW/channel
+          float intraBw = graphs[a]->bwIntra * nvlsEfficiency[compCapIndex] * (graphs[a]->nChannels - 1) / graphs[a]->nChannels;
+          // AllReduce pipelines two operations.
          if (coll == ncclFuncAllReduce) {
-            bw = std::min(graphs[a]->bwIntra, graphs[a]->bwInter);
+            intraBw *= 2.0f;
          } else {
-            // allgather and reducescatter
-            bw = std::min(graphs[a]->bwIntra * (ppn - 1.0f) / ppn, graphs[a]->bwInter * 0.9f);
+            intraBw *= (ppn - 1) / ppn;
          }
-        }
-        if (a == NCCL_ALGO_NVLS_TREE) bw = std::min(graphs[a]->bwIntra, nNodes <= 2 ? graphs[a]->bwInter : graphs[a]->bwInter/2);
+          // Handle 2 node case of NVLSTree
+          float interBw = graphs[a]->bwInter * ((nNodes <= 2 && a == NCCL_ALGO_NVLS_TREE) ? 2 : 1);
+          bw = std::min( {intraBw, interBw, a == NCCL_ALGO_NVLS_TREE ? (float)perChMaxNVLSTreeBw : std::numeric_limits<float>::max()} );
+        };
+#endif
+        float busBw = graphs[a]->nChannels * bw;

        // Various model refinements
 #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
@@ -686,8 +735,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
        // Convert bus BW to algorithm BW
        if (!(a != NCCL_ALGO_RING && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) {
          float ratio = 1.0f;
-          if (a == NCCL_ALGO_RING) ratio *= (1.0 * nRanks) / nsteps;
-          else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio *= 5.0/6.0;
+          if (a == NCCL_ALGO_RING || a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio *= (1.0 * nRanks) / nsteps;
          else ratio *= .5;
          busBw *= ratio;
        }
@@ -735,8 +783,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
          comm->latencies[coll][a][p] += 2*(nNodes-1)*rcclTuningModel[comm->topo->tuning].hwLat[NCCL_HW_NET][a][p];
        } else if (a == NCCL_ALGO_PAT) {
          if (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) {
-            comm->latencies[coll][a][p] = 8 // Base time
-              + log2i(nNodes) * (interLat/3.5) // Log latency
+            comm->latencies[coll][a][p] += log2i(nNodes) * (interLat/3.5) // Log latency
              + nRanks * 2.8; // Still a linear part; hopefully we'll manage to remove it at some point.
          }
        }
@@ -1008,31 +1008,33 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha

  if (*netNode != NULL) return ncclSuccess;

-  const char* pciSysPath = pciPath;
-  if (pciSysPath) {
-    char subSystem[PATH_MAX];
-    NCCLCHECK(ncclTopoGetSubsystem(pciSysPath, subSystem));
-    // This is not a PCI device (virtual, usb, ...).
-    if (strcmp(subSystem, "pci") != 0) {
-      INFO(NCCL_NET|NCCL_GRAPH, "Topology detection: network path %s is not a PCI device (%s). Attaching to first CPU", pciSysPath, subSystem);
-      pciSysPath = NULL;
-    }
-  }
-
  struct ncclXmlNode* parent = NULL;
  if (forceParent) {
    parent = forceParent;
-  } else if (pciSysPath) {
-    int offset;
-    for (offset=strlen(pciSysPath)-1; pciSysPath[offset] != '/'; offset--);
-    char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-    strcpy(busId, pciSysPath+offset+1);
-    NCCLCHECK(ncclTopoGetPciNode(xml, busId, &parent));
-    NCCLCHECK(xmlSetAttrIfUnset(parent, "class", "0x02"));
-    NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
  } else {
-    // Virtual NIC, no PCI device, attach to first CPU
-    NCCLCHECK(xmlFindTag(xml, "cpu", &parent));
+    const char* pciSysPath = pciPath;
+    if (pciSysPath) {
+      char subSystem[PATH_MAX];
+      NCCLCHECK(ncclTopoGetSubsystem(pciSysPath, subSystem));
+      // This is not a PCI device (virtual, usb, ...).
+      if (strcmp(subSystem, "pci") != 0 && !forceParent) {
+        INFO(NCCL_NET | NCCL_GRAPH, "Topology detection: network path (name = %s) %s is not a PCI device (%s). Attaching to first CPU", netName, pciSysPath, subSystem);
+        pciSysPath = NULL;
+      }
+    }
+
+    if (pciSysPath) {
+      int offset;
+      for (offset = strlen(pciSysPath) - 1; pciSysPath[offset] != '/'; offset--);
+      char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+      strcpy(busId, pciSysPath + offset + 1);
+      NCCLCHECK(ncclTopoGetPciNode(xml, busId, &parent));
+      NCCLCHECK(xmlSetAttrIfUnset(parent, "class", "0x02"));
+      NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
+    } else {
+      // Virtual NIC, no PCI device, attach to first CPU
+      NCCLCHECK(xmlFindTag(xml, "cpu", &parent));
+    }
  }

  struct ncclXmlNode* nicNode = NULL;
@@ -128,6 +128,13 @@ static ncclResult_t xmlGetAttrUint64(struct ncclXmlNode* node, const char* attrN
  return ncclSuccess;
 }

+static ncclResult_t xmlGetAttrUint64Default(struct ncclXmlNode* node, const char* attrName, uint64_t* value, uint64_t defaultValue) { // Reads attrName as a uint64_t into *value, or stores defaultValue when no string is returned.
+  const char* str;
+  NCCLCHECK(xmlGetAttr(node, attrName, &str)); // NOTE(review): relies on xmlGetAttr succeeding with str == NULL for a missing attribute - confirm
+  *value = str ? strtoull(str, NULL, 0) : defaultValue; // base 0: accepts decimal, 0x-prefixed hex and 0-prefixed octal
+  return ncclSuccess;
+}
+
 static ncclResult_t xmlGetAttrLong(struct ncclXmlNode* node, const char* attrName, int64_t* value) {
  const char* str;
  NCCLCHECK(xmlGetAttrStr(node, attrName, &str));
@@ -14,6 +14,9 @@
 #include "api_trace.h"
 #include <assert.h>
 #include "bootstrap.h"
+#include "ce_coll.h"
+#include "profiler.h"
+#include "nvtx.h"

 #include "msccl/msccl_lifecycle.h"

@@ -101,7 +104,7 @@ ncclResult_t ncclGroupStart_impl() {
    NCCLCHECK(Recorder::instance().record(rrGroupStart, ncclGroupDepth));
  }
  ncclResult_t ret = ncclSuccess;
-  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  NCCL_NVTX3_FUNC_RANGE;

  NCCLCHECK(ncclGroupStartInternal());
  TRACE_CALL("ncclGroupStart()");
@@ -123,7 +126,7 @@ ncclResult_t ncclGroupEnd_impl() {
    NCCLCHECK(Recorder::instance().record(rrGroupEnd, ncclGroupDepth));
  }
  ncclResult_t ret = ncclSuccess;
-  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  NCCL_NVTX3_FUNC_RANGE;
  NCCLCHECKGOTO(ncclGroupEndInternal(), ret, exit);
  TRACE_CALL("ncclGroupEnd()");
 exit:
@@ -137,7 +140,7 @@ ncclResult_t ncclGroupSimulateEnd(ncclSimInfo_t* simInfo) {
    Recorder::instance().record(ncclGroupDepth, simInfo);
  }
  ncclResult_t ret = ncclSuccess;
-  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  NCCL_NVTX3_FUNC_RANGE;
  NCCLCHECKGOTO(ncclGroupEndInternal(simInfo), ret, exit);
  TRACE_CALL("ncclGroupSimulateEnd()");
 exit:
@@ -150,65 +153,88 @@ struct ncclPreconnectJob {
  bool* algoNeedConnect;
 };

+struct ncclPrepareTasksAndCollPreconnectJob { // Async job payload consumed by ncclPrepareTasksAndCollPreconnectFunc.
+  struct ncclAsyncJob base; // must stay the first member: the job function casts the ncclAsyncJob* it receives back to this struct
+  struct ncclComm* comm; // communicator whose tasks are prepared and preconnected
+  ncclSimInfo_t* simInfo; // forwarded unchanged to ncclPrepareTasks
+};
+
 ncclResult_t ncclP2PPreconnectFunc(struct ncclAsyncJob* job_) {
  struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_;
  struct ncclComm* comm = job->comm;
  CUDACHECK(cudaSetDevice(comm->cudaDev));
-  if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
+  if (!job_->isThreadMain && CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); // isThreadMain is set when asyncJobLaunch runs the job inline; the calling thread's affinity is then left untouched
  NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 1));
  if (comm->p2pNet) NCCLCHECK(ncclTransportP2pSetup(comm, NULL, NCCL_CONN_IDX_P2P_NET));
  return ncclSuccess;
 }

+static ncclResult_t ncclCollPreconnect(struct ncclComm* comm, bool* algoNeedConnect) { // Runs the connect/buffer-setup routine of every algorithm flagged in algoNeedConnect (NCCL_NUM_ALGORITHMS entries).
+  for (int i = 0; i < NCCL_NUM_ALGORITHMS; ++i) { // algorithms are handled in ascending index order
+    if (algoNeedConnect[i]) {
+      switch (i) {
+        case NCCL_ALGO_RING: {
+          NCCLCHECK(ncclTransportRingConnect(comm));
+          break;
+        }
+        case NCCL_ALGO_TREE: {
+          NCCLCHECK(ncclTransportTreeConnect(comm));
+          break;
+        }
+        case NCCL_ALGO_NVLS: {
+          /* If we are using NVLS_TREE algo, we must mark NVLS algo to set up
+           * NVLS intra-node buffer */
+          NCCLCHECK(ncclNvlsBufferSetup(comm));
+          break;
+        }
+        case NCCL_ALGO_NVLS_TREE: {
+          NCCLCHECK(ncclNvlsTreeConnect(comm));
+          break;
+        }
+        case NCCL_ALGO_COLLNET_CHAIN: {
+          NCCLCHECK(ncclCollNetChainBufferSetup(comm));
+          break;
+        }
+        case NCCL_ALGO_COLLNET_DIRECT: {
+          NCCLCHECK(ncclCollNetDirectBufferSetup(comm));
+          break;
+        }
+        case NCCL_ALGO_PAT: {
+          NCCLCHECK(ncclTransportPatConnect(comm));
+          break;
+        }
+        // Yes, this is dead code. That's fine...
+        // coverity[dead_error_begin]
+        default: {
+          NCCLCHECK(ncclInternalError); // NOTE(review): presumably makes the function return ncclInternalError for an unknown algorithm index - confirm against NCCLCHECK
+        }
+      }
+    }
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclPrepareTasksAndCollPreconnectFunc(struct ncclAsyncJob* job_) { // Async job body: prepares comm's tasks, then preconnects the algorithms that preparation flagged.
+  struct ncclPrepareTasksAndCollPreconnectJob* job = (ncclPrepareTasksAndCollPreconnectJob*)job_;
+  struct ncclComm* comm = job->comm;
+  bool needConnect = false; // initialized like the inline path in ncclPrepareTasksAndCollPreconnect, so it is never read indeterminate
+  bool algoNeedConnect[NCCL_NUM_ALGORITHMS];
+  memset(algoNeedConnect, 0, sizeof(bool)*NCCL_NUM_ALGORITHMS);
+  CUDACHECK(cudaSetDevice(comm->cudaDev));
+  if (!job_->isThreadMain && CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); // leave the caller's affinity alone when the job runs inline
+  NCCLCHECK(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, job->simInfo));
+  if (comm->cuMemSupport && needConnect) NCCLCHECK(ncclCollPreconnect(comm, algoNeedConnect)); // same gating as the inline path
+  return ncclSuccess;
+}
+
 ncclResult_t ncclCollPreconnectFunc(struct ncclAsyncJob* job_) {
  struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_;
  struct ncclComm* comm = job->comm;
  ncclResult_t ret = ncclSuccess;

-  CUDACHECK(cudaSetDevice(comm->cudaDev));
-  if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
-  for (int i = 0; i < NCCL_NUM_ALGORITHMS; ++i) {
-    if (job->algoNeedConnect[i]) {
-      switch (i) {
-        case NCCL_ALGO_RING: {
-          NCCLCHECKGOTO(ncclTransportRingConnect(comm), ret, fail);
-          break;
-        }
-        case NCCL_ALGO_TREE: {
-          NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail);
-          break;
-        }
-        case NCCL_ALGO_NVLS: {
-          /* If we are using NVLS_TREE algo, we must mark NVLS algo to set up
-           * NVLS intra-node buffer */
-          NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail);
-          break;
-        }
-        case NCCL_ALGO_NVLS_TREE: {
-          NCCLCHECKGOTO(ncclNvlsTreeConnect(comm), ret, fail);
-          break;
-        }
-        case NCCL_ALGO_COLLNET_CHAIN: {
-          NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail);
-          break;
-        }
-        case NCCL_ALGO_COLLNET_DIRECT: {
-          NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail);
-          break;
-        }
-        case NCCL_ALGO_PAT: {
-          NCCLCHECKGOTO(ncclTransportPatConnect(comm), ret, fail);
-          break;
-        }
-        // Yes, it's a dead code.  That's fine...
-        // coverity[dead_error_begin]
-        default: {
-          ret = ncclInternalError;
-          goto fail;
-        }
-      }
-    }
-  }
+  if (!job_->isThreadMain) CUDACHECK(cudaSetDevice(comm->cudaDev));
+  if (!job_->isThreadMain && CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
+  NCCLCHECKGOTO(ncclCollPreconnect(comm, job->algoNeedConnect), ret, fail);

 exit:
  free(job->algoNeedConnect);
@@ -222,52 +248,33 @@ struct ncclGroupSymmetricJob {
  struct ncclComm* comm;
 };

-NCCL_PARAM(WinStride, "WIN_STRIDE", -1);
-
 ncclResult_t ncclCommGroupRegisterSymmetric(struct ncclAsyncJob* job_) {
  struct ncclGroupSymmetricJob* job = (struct ncclGroupSymmetricJob*)job_;
  struct ncclComm* comm = job->comm;
  ncclResult_t ret = ncclSuccess;

  CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
-  if (comm->baseStride == 0) {
-    cudaStream_t hostStream;
-    // first time to allocate symmetric VA space.
-    // calling into this function means symmetric is supported.
-    struct ncclSymDevBase* symBase = NULL;
-    size_t size = ncclSymDevBase::size(comm->localRanks);
-    if (ncclParamWinStride() != -1) {
-      comm->baseStride = ncclParamWinStride();
-    } else {
-      size_t maxStride = 0;
-      for (int r = 0; r < comm->nRanks; ++r)
-        if (comm->peerInfo[r].totalGlobalMem > maxStride) maxStride = comm->peerInfo[r].totalGlobalMem;
-      comm->baseStride = maxStride;
-    }
-    INFO(NCCL_INIT, "rank %d base stride %zuGB total VM %zuGB", comm->rank, comm->baseStride >> 30, (comm->baseStride * comm->localRanks) >> 30);
-    NCCLCHECKGOTO(ncclIpcSymmetricInit(comm), ret, fail);
-    NCCLCHECKGOTO(ncclNvlsSymmetricInit(comm), ret, fail);
-    comm->symAllocHead = 0;

-    // Allocate symmetric memory for NCCL internal usage
-    NCCLCHECKGOTO(ncclCommSymmetricAllocInternal(comm, size, alignof(struct ncclSymDevBase), (void**)&symBase), ret, fail);
-    assert((void*)symBase == (void*)(comm->baseUCSymPtr + comm->localRank * comm->baseStride));
-    NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), ret, fail);
-    CUDACHECKGOTO(cudaMemsetAsync(symBase, 0, size, hostStream), ret, fail);
-    CUDACHECKGOTO(cudaStreamSynchronize(hostStream), ret, fail);
-    NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false), ret, fail);
-
-    comm->symDevComm.base = (struct ncclSymDevBase*)(comm->baseUCSymPtr + comm->localRank * comm->baseStride);
-    comm->symDevComm.baseMc = (struct ncclSymDevBase*)comm->baseMCSymPtr;
-    comm->symDevComm.nRanks = comm->localRanks;
-    comm->symDevComm.nRanks_rcp32 = idivRcp32(comm->localRanks);
-    comm->symDevComm.rank = comm->localRank;
-    comm->symDevComm.stride4G = comm->baseStride >> 32;
+  while (!ncclIntruQueueEmpty(&comm->devrState.regTaskQueue)) {
+    struct ncclDevrRegTask* task = ncclIntruQueueDequeue(&comm->devrState.regTaskQueue);
+    NCCLCHECKGOTO(ncclDevrWindowRegisterInGroup(
+      comm, task->userPtr, task->userSize, task->winFlags, task->outWinDev),
+      ret, fail);
+    free(task);
  }

-  while (!ncclIntruQueueEmpty(&comm->symRegTaskQueue)) {
-    struct ncclSymRegTask* task = ncclIntruQueueDequeue(&comm->symRegTaskQueue);
-    NCCLCHECKGOTO(ncclCommSymmetricRegisterInternal(comm, task->buff, task->baseSize, task->alignment, task->memHandle, task->regHandle), ret, fail);
+  while (!ncclIntruQueueEmpty(&comm->devrState.commCreateTaskQueue)) {
+    struct ncclDevrCommCreateTask* task = ncclIntruQueueDequeue(&comm->devrState.commCreateTaskQueue);
+    NCCLCHECKGOTO(ncclDevrCommCreateInternal(
+      comm, (struct ncclDevCommRequirements const*)task->reqs, task->outDevComm),
+      ret, fail);
+    freeDevCommRequirements(task->reqs); // free additional task memory for reqs
+    free(task);
+  }
+
+  while (!ncclIntruQueueEmpty(&comm->ceInitTaskQueue)) {
+    struct ncclCeInitTask* task = ncclIntruQueueDequeue(&comm->ceInitTaskQueue);
+    NCCLCHECKGOTO(ncclCeInit(task->comm), ret, fail);
    free(task);
  }

@@ -324,7 +331,11 @@ static ncclResult_t doLaunches(struct ncclComm* head) {
            comm->planner.unlaunchedPlansHead = plan->next;
            CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure);
            NCCLCHECKGOTO(ncclLaunchKernelBefore_NoUncapturedCuda(comm, plan), result, failure);
-            NCCLCHECKGOTO(ncclLaunchKernel(comm, plan), result, failure);
+            if (plan->isCeColl) {
+              NCCLCHECKGOTO(ncclLaunchCeColl(comm, plan), result, failure);
+            } else {
+              NCCLCHECKGOTO(ncclLaunchKernel(comm, plan), result, failure);
+            }
          }
          // Barrier reduction input indicates if we require further rounds.
          if (useBarrier) ncclCommIntraBarrierIn(comm, comm->planner.unlaunchedPlansHead != nullptr ? 1 : 0);
@@ -422,6 +433,12 @@ static ncclResult_t asyncJobLaunch(struct ncclIntruQueue<struct ncclAsyncJob, &n

  if (!ncclIntruQueueEmpty(asyncJobsMain)) {
    struct ncclAsyncJob* job = ncclIntruQueueHead(asyncJobsMain);
+    if (job->next == nullptr) {
+      job->isThreadMain = true;
+      ncclAsyncJobMain(job);
+      job->state = ncclGroupJobJoined;
+      return job->result;
+    }
    do {
      PTHREADCHECKGOTO(pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job), "pthread_create", ret, fail);
      job = job->next;
@@ -474,6 +491,51 @@ fail:
  goto exit;
 }

+NCCL_PARAM(SingleProcMemRegEnable, "SINGLE_PROC_MEM_REG_ENABLE", 0); // non-zero makes ncclPrepareTasksAndCollPreconnect defer task preparation to an async job; the trailing 0 is presumably the default - confirm
+
+static ncclResult_t ncclPrepareTasksAndCollPreconnect(struct ncclComm* comm, ncclSimInfo_t* simInfo, struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next>* asyncCollJobs) { // Prepares comm's tasks now, or defers preparation plus preconnect to one async job; any job created is appended to asyncCollJobs.
+  if (ncclParamSingleProcMemRegEnable()) { // deferred mode: ncclPrepareTasks itself runs inside the queued job
+    struct ncclPrepareTasksAndCollPreconnectJob* job;
+    NCCLCHECK(ncclCalloc(&job, 1));
+    job->base.func = ncclPrepareTasksAndCollPreconnectFunc;
+    job->base.undo = nullptr;
+    job->base.destructor = free; // the job struct itself is released with free()
+    job->base.state = ncclGroupJobRunning;
+    job->base.abortFlag = comm->abortFlag;
+    job->base.abortFlagDev = comm->abortFlagDev;
+    job->comm = comm;
+    job->simInfo = simInfo;
+    ncclIntruQueueEnqueue(asyncCollJobs, &job->base);
+  } else { // inline mode: prepare here, queue a job only when a connection is needed
+    bool needConnect = false;
+    bool algoNeedConnect[NCCL_NUM_ALGORITHMS];
+    memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS);
+
+    CUDACHECK(cudaSetDevice(comm->cudaDev));
+    NCCLCHECK(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo));
+
+    if (comm->cuMemSupport && needConnect) {
+      ncclResult_t ret;
+      struct ncclPreconnectJob* job;
+      NCCLCHECK(ncclCalloc(&job, 1));
+      job->base.func = ncclCollPreconnectFunc;
+      job->base.undo = nullptr;
+      job->base.destructor = free;
+      job->base.state = ncclGroupJobRunning;
+      job->base.abortFlag = comm->abortFlag;
+      job->base.abortFlagDev = comm->abortFlagDev;
+      job->comm = comm;
+      if ((ret = ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS))) { // allocation failed: release the half-built job before reporting the error
+        free(job);
+        NCCLCHECK(ret);
+      }
+      memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS); // heap copy, because the local array goes out of scope when this function returns
+      ncclIntruQueueEnqueue(asyncCollJobs, &job->base);
+    }
+  }
+  return ncclSuccess;
+}
+
 static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInfo = NULL) {
  ncclResult_t ret = ncclSuccess;
  struct ncclGroupJob *gjob = (struct ncclGroupJob*) job_;
@@ -548,27 +610,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
      // at the same time.
      comm = cliqueHead;
      do {
-        bool needConnect = false;
-        bool algoNeedConnect[NCCL_NUM_ALGORITHMS];
-        memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS);
-
-        CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
-        NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail);
-
-        if (comm->cuMemSupport && needConnect) {
-          struct ncclPreconnectJob* job;
-          NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
-          job->base.func = ncclCollPreconnectFunc;
-          job->base.undo = nullptr;
-          job->base.destructor = free;
-          job->base.state = ncclGroupJobRunning;
-          job->base.abortFlag = comm->abortFlag;
-          job->base.abortFlagDev = comm->abortFlagDev;
-          job->comm = comm;
-          NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail);
-          memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS);
-          ncclIntruQueueEnqueue(&asyncCollJobs, &job->base);
-        }
+        NCCLCHECKGOTO(ncclPrepareTasksAndCollPreconnect(comm, simInfo, &asyncCollJobs), ret, fail);
        comm = comm->groupNext[ncclGroupTaskTypeCollective];
      } while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0);
      // connect
@@ -650,6 +692,13 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) {
  if (mscclAvailable() && !mscclIsCaller()) {
    NCCLCHECK(mscclGroupEnd());
  }
+  
+  if (ncclProfilerApiState.profilerGroupDepth > 0) {
+    ncclProfilerApiState.profilerGroupDepth--;
+  }
+  if (ncclProfilerApiState.profilerGroupDepth == 0) {
+    NCCLCHECK(ncclProfilerRecordGroupApiEventState(ncclProfilerGroupEndApiStart));
+  }

  if ((--ncclGroupDepth) > 0) goto exit;

@@ -735,6 +784,8 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) {
  groupLocalResetJobState();

 exit:
+  // Profiler group API start is called inside taskAppend to get graph capture information for the event
+  NCCLCHECK(ncclProfilerStopGroupApiEvent());
  return ret;
 fail:
  if (groupJob) {
@@ -7,7 +7,55 @@
 #ifndef NCCL_ALLOCATOR_H_
 #define NCCL_ALLOCATOR_H_

-ncclResult_t ncclCommSymmetricAllocInternal(struct ncclComm* comm, size_t size, size_t alignment, void** symPtr);
-ncclResult_t ncclCommSymmetricFreeInternal(struct ncclComm* comm, void* symPtr);
+////////////////////////////////////////////////////////////////////////////////
+// ncclSpace: Allocates contiguous segments of non-negative integers. Useful
+// as a memory allocator when we can't put allocator state within the memory
+// being allocated.
+
+struct ncclSpace {
+  int count;
+  int capacity;
+  int64_t* cuts;
+};
+
+void ncclSpaceConstruct(struct ncclSpace* a);
+void ncclSpaceDestruct(struct ncclSpace* a);
+ncclResult_t ncclSpaceAlloc(struct ncclSpace* a, int64_t spaceLimit, int64_t objSize, int objAlign, int64_t* outObjOffset);
+ncclResult_t ncclSpaceFree(struct ncclSpace* a, int64_t objOffset, int64_t objSize);
+
+
+////////////////////////////////////////////////////////////////////////////////
+// ncclShadowPool: Allocates device-side objects, their host-side shadows, and
+// maintains the device->host object address mapping.
+
+struct ncclShadowObject;
+struct ncclShadowPage;
+struct ncclShadowPool {
+  int count, hbits;
+  struct ncclShadowObject** table;
+  cudaMemPool_t memPool;
+  struct ncclShadowPage* pages;
+};
+
+void ncclShadowPoolConstruct(struct ncclShadowPool*);
+ncclResult_t ncclShadowPoolDestruct(struct ncclShadowPool*);
+ncclResult_t ncclShadowPoolAlloc(struct ncclShadowPool*, size_t size, void** outDevObj, void** outHostObj, cudaStream_t stream);
+ncclResult_t ncclShadowPoolFree(struct ncclShadowPool*, void* devObj, cudaStream_t stream);
+ncclResult_t ncclShadowPoolToHost(struct ncclShadowPool*, void* devObj, void** outHostObj);
+
+template<typename T> // Typed convenience overload: allocates sizeof(T) bytes through the void** overload and casts the results to T*.
+static inline ncclResult_t ncclShadowPoolAlloc(struct ncclShadowPool* pool, T** outDevObj, T** outHostObj, cudaStream_t stream) {
+  void* devObj = nullptr;  // Pre-initialised so a failed allocation hands back nullptr instead of an indeterminate pointer.
+  void* hostObj = nullptr; // Same reason as devObj: the outputs below are written on failure too.
+  ncclResult_t got = ncclShadowPoolAlloc(pool, sizeof(T), &devObj, &hostObj, stream);
+  if (outDevObj) *outDevObj = (T*)devObj;    // Either output pointer may be null when the caller does not need that address.
+  if (outHostObj) *outHostObj = (T*)hostObj;
+  return got; // Result of the underlying allocation, returned unchanged.
+}
+
+template<typename T> // Typed convenience overload of ncclShadowPoolToHost for T* objects.
+static inline ncclResult_t ncclShadowPoolToHost(struct ncclShadowPool* pool, T* devObj, T** hostObj) {
+  return ncclShadowPoolToHost(pool, (void*)devObj, (void**)hostObj); // Forwards to the void* overload; the result is written through hostObj.
+}

 #endif
@@ -31,7 +31,7 @@
 #define RCCL_API_TRACE_VERSION_MAJOR 0

 // should be increased every time new members are added to existing dispatch tables
-#define RCCL_API_TRACE_VERSION_PATCH 2
+#define RCCL_API_TRACE_VERSION_PATCH 3

 #if !defined(RCCL_EXTERN_C_INIT)
 #    ifdef __cplusplus
@@ -65,10 +65,10 @@ typedef ncclResult_t (*ncclAllReduceWithBias_fn_t)(const void* sendbuff, void* r
                                           size_t count, ncclDataType_t datatype,
                                           ncclRedOp_t op, struct ncclComm* comm,
                                           hipStream_t stream, const void* acc);
-typedef ncclResult_t (*ncclAllToAll_fn_t)(const void* sendbuff, void* recvbuff,
+typedef ncclResult_t (*ncclAlltoAll_fn_t)(const void* sendbuff, void* recvbuff,
                                          size_t count, ncclDataType_t datatype,
                                          ncclComm_t comm, hipStream_t stream);
-typedef ncclResult_t (*ncclAllToAllv_fn_t)(
+typedef ncclResult_t (*ncclAlltoAllv_fn_t)(
    const void* sendbuff, const size_t sendcounts[], const size_t sdispls[],
    void* recvbuff, const size_t recvcounts[], const size_t rdispls[],
    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
@@ -162,7 +162,7 @@ typedef ncclResult_t (*ncclCommRegister_fn_t)(const ncclComm_t comm, void* buff,

 typedef ncclResult_t (*ncclCommDeregister_fn_t)(const ncclComm_t comm, void* handle);

-typedef ncclResult_t (*ncclCommWindowRegister_fn_t)(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags);
+typedef ncclResult_t (*ncclCommWindowRegister_fn_t)(ncclComm_t comm, void* userPtr, size_t userSize, ncclWindow_t* outWinDev, int winFlags);

 typedef ncclResult_t (*ncclCommWindowDeregister_fn_t)(ncclComm_t comm, ncclWindow_t win);

@@ -172,8 +172,8 @@ typedef struct rcclApiFuncTable
    uint64_t                      size;
    ncclAllGather_fn_t            ncclAllGather_fn;
    ncclAllReduce_fn_t            ncclAllReduce_fn;
-    ncclAllToAll_fn_t             ncclAllToAll_fn;
-    ncclAllToAllv_fn_t            ncclAllToAllv_fn;
+    ncclAlltoAll_fn_t             ncclAllToAll_fn;
+    ncclAlltoAllv_fn_t            ncclAllToAllv_fn;
    ncclBroadcast_fn_t            ncclBroadcast_fn;
    ncclGather_fn_t               ncclGather_fn;
    ncclReduce_fn_t               ncclReduce_fn;
@@ -211,6 +211,8 @@ typedef struct rcclApiFuncTable
    ncclCommShrink_fn_t           ncclCommShrink_fn;
    ncclCommWindowRegister_fn_t   ncclCommWindowRegister_fn;
    ncclCommWindowDeregister_fn_t ncclCommWindowDeregister_fn;
+    ncclAlltoAll_fn_t             ncclAlltoAll_fn;
+    ncclAlltoAllv_fn_t            ncclAlltoAllv_fn;
    // ADD NEW FUNCTIONS HERE ONLY
 } rcclApiFuncTable;

@@ -41,6 +41,9 @@ constexpr static __host__ __device__ Int maxval(Int a, Int b, More ...more) {
  #endif
 }

+#define BIT(x) (1UL << (x))           // Unsigned long with only bit x set.
+#define MASK(x) ((1UL << (x)) - 1UL)  // Unsigned long with the low x bits set; x is parenthesised so expression arguments expand correctly.
+
 #define DIVUP(x, y) \
    (((x)+(y)-1)/(y))

@@ -68,14 +71,26 @@ static __host__ __device__ constexpr Z roundDown(X x, Y y) {
 }

 // assumes second argument is a power of 2
-template<typename X, typename Z = decltype(X()+int())>
-static __host__ __device__ constexpr Z alignUp(X x, int a) {
-  return (x + a-1) & Z(-a);
+template<typename X, typename Y, typename Z = decltype(X()+Y())>
+static __host__ __device__ constexpr Z alignUp(X x, Y a) {
+  return (x + a-1) & -Z(a);
 }
+template<typename T> // Pointer overload: rounds x up to a multiple of a; like the integer overload, the mask assumes a is a power of 2.
+static __host__ __device__ T* alignUp(T* x, size_t a) {
+  static_assert(sizeof(T) == 1, "Only single byte types allowed."); // Rejects element types wider than one byte at compile time.
+  return reinterpret_cast<T*>((reinterpret_cast<uintptr_t>(x) + a-1) & -uintptr_t(a)); // -a equals ~(a-1) in unsigned arithmetic, so the AND clears the low bits.
+}
+
 // assumes second argument is a power of 2
-template<typename X, typename Z = decltype(X()+int())>
-static __host__ __device__ constexpr Z alignDown(X x, int a) {
-  return x & Z(-a);
+template<typename X, typename Y, typename Z = decltype(X()+Y())>
+static __host__ __device__ constexpr Z alignDown(X x, Y a) {
+  return x & -Z(a);
+}
+
+template<typename T> // Pointer overload: rounds x down to a multiple of a; like the integer overload, the mask assumes a is a power of 2.
+static __host__ __device__ T* alignDown(T* x, size_t a) {
+  static_assert(sizeof(T) == 1, "Only single byte types allowed."); // Rejects element types wider than one byte at compile time.
+  return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(x) & -uintptr_t(a)); // -a equals ~(a-1) in unsigned arithmetic, so the AND clears the low bits.
 }

 template<typename Int>
@@ -341,7 +356,7 @@ static __host__ __device__ UInt reverseSubBits(UInt x) {
    default: static_assert(8*sizeof(UInt) <= 64, "Unsupported integer type.");
    }
    return reverseSubBits<UInt, 8>(x);
-  } else if (nSubBits == 1) {
+  } else if (nSubBits <= 1) {
    return x;
  } else {
    UInt m = UInt(-1)/((UInt(1)<<(nSubBits/2))+1);
@@ -0,0 +1,76 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_CE_COLL_H_
+#define NCCL_CE_COLL_H_
+
+#include "nccl.h"
+#include "nccl_common.h"
+#include "bitops.h"
+
+// Memory operations per rank for different synchronization protocols
+#define NCCL_CE_SYNC_OPS_PER_RANK_MC 2
+#define NCCL_CE_SYNC_OPS_PER_RANK_UC 3
+
+struct ncclCeColl {
+  uint8_t* baseUCSymReadyPtr;
+  uint8_t* baseUCSymComplPtr;
+  size_t baseUCSymReadyOffset;
+  size_t baseUCSymComplOffset;
+  uint32_t ceSeqNum;
+  bool useCompletePtr;
+  uint32_t intraBatchSyncFreq;
+  uint64_t intraBatchSyncMsgThreshold;
+  struct ncclDevrWindow* ceSyncWin;
+};
+
+struct ncclCeInitTask {
+  struct ncclCeInitTask *next;
+  struct ncclComm* comm;
+};
+
+struct alignas(16) ncclCeCollArgs {
+  ncclFunc_t func;
+  int rootRank;
+  size_t nElts;
+  size_t eltSize;
+  uint8_t* sendBuff;
+  uint8_t* recvBuff;
+  struct ncclDevrWindow* sendWin;
+  struct ncclDevrWindow* recvWin;
+};
+
+struct ncclCeBatchOpsParams {
+  void** dsts;
+  void** srcs;
+  size_t* sizes;
+  size_t numOps;
+  bool intraBatchSync;
+#if CUDART_VERSION >= 12080
+  cudaMemcpyAttributes* attrs;
+  size_t* attrIdxs;
+  size_t numAttrs;
+#endif
+};
+
+bool ncclCeImplemented(ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty);
+
+ncclResult_t ncclCeInit(struct ncclComm* comm);
+
+ncclResult_t ncclCeFinalize(struct ncclComm* comm);
+
+ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream);
+
+ncclResult_t ncclLaunchCeColl(struct ncclComm* comm, struct ncclKernelPlan* plan);
+
+ncclResult_t ncclCeAllGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream);
+
+ncclResult_t ncclCeScatter(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream);
+
+ncclResult_t ncclCeGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream);
+
+ncclResult_t ncclCeAlltoAll(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream);
+#endif /* NCCL_CE_COLL_H_ */
@@ -17,16 +17,17 @@ ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncc
 ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks);

 inline uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2pRound, int p2pBatchEnable = 0) {
+  int base;
  if (comm->nNodes > 1) {
    int nodeDelta = p2pRound/comm->maxLocalRanks;
    int localDelta = p2pRound%comm->maxLocalRanks;
    int batchSize = (comm->nNodes > 2 && p2pBatchEnable) ? NCCL_MAX_DEV_WORK_P2P_PER_BATCH : 1;
-    int base = nodeDelta*divUp(comm->maxLocalRanks, batchSize);
+    base = nodeDelta*divUp(comm->maxLocalRanks, batchSize);
    base += localDelta/batchSize;
-    return base & 0xff;
  } else {
-    return p2pRound & 0xff;
+    base = p2pRound;
  }
+  return base & 0xff;
 }

 #endif
@@ -16,7 +16,7 @@ typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
 static const char* collNetName(struct ncclComm* comm) { return comm->ncclCollNet->name; }
 static ncclResult_t collNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclCollNet->devices(ndev)); return ncclSuccess; }
 static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclCollNet->getProperties(dev, props)); return ncclSuccess; }
-static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; }
+static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(comm->collNetContext, dev, handle, listenComm)); return ncclSuccess; }
 static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; }
 static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; }
 static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, size_t size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; }
@@ -29,6 +29,7 @@ static ncclResult_t collNetIflush(struct ncclComm* comm, void* collComm, void* d
 static ncclResult_t collNetTest(struct ncclComm* comm, void* request, int* done, int* size) { NCCLCHECK(comm->ncclCollNet->test(request, done, size)); return ncclSuccess; }
 static ncclResult_t collNetCloseColl(struct ncclComm* comm, void* collComm) { NCCLCHECK(comm->ncclCollNet->closeColl(collComm)); return ncclSuccess; }
 static ncclResult_t collNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclCollNet->closeListen(listenComm)); return ncclSuccess; }
+static ncclResult_t collNetFinalize(struct ncclComm* comm, void* ctx) { NCCLCHECK(comm->ncclCollNet->finalize(ctx)); return ncclSuccess; }

 static int collNetSupport(struct ncclComm* comm) { return comm->ncclCollNet != nullptr ? 1 : 0; }

@@ -10,7 +10,7 @@
 #define NCCL_COLLECTIVES_H_

 #include "nccl.h"
-#include "nccl_common.h"
+#include "nccl_tuner.h"
 #include "device.h"

 #define NCCL_MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two.
@@ -25,11 +25,17 @@
 #define ALLGATHER_SLICESTEPS (NCCL_STEPS/4)
 #define ALLGATHER_SLICESTEPS_SINGLE_NODE (NCCL_STEPS/2)
 #define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2)
+#define ALLTOALL_SLICESTEPS 1
+#define ALLTOALL_CHUNKSTEPS 1
 #define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4)
 #define REDUCESCATTER_SLICESTEPS_SINGLE_NODE (NCCL_STEPS/2)
 #define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2)
 #define BROADCAST_SLICESTEPS 1
 #define BROADCAST_CHUNKSTEPS 1
+#define GATHER_SLICESTEPS 1
+#define GATHER_CHUNKSTEPS 1
+#define SCATTER_SLICESTEPS 1
+#define SCATTER_CHUNKSTEPS 1
 #define REDUCE_SLICESTEPS 1
 #define REDUCE_CHUNKSTEPS 1
 #define NCCL_MAX_SLICE_PER_CHUNK 2  // max value for CHUNKSTEPS/SLICESTEPS, must accord with above
@@ -20,6 +20,9 @@
 #include "nvmlwrap.h"
 #include "profiler.h"
 #include "allocator.h"
+#include "dev_runtime.h"
+#include "sym_kernels.h"
+#include "ce_coll.h"
 #include "latency_profiler/CollTrace.h"
 #include "rccl_common.h"
 #include "recorder.h"
@@ -217,13 +220,15 @@ struct ncclTaskColl {
 #endif
  int32_t nWarps:8;
  int32_t algorithm:8, protocol:8, pipeline:8;
-  uint32_t isCollnet:1, isNvls:1;
-  uint32_t devFuncId:30;
+  uint32_t isCollnet:1, isNvls:1, isSymLast:1;
+  uint32_t devFuncId:29;
  int regBufType;
  uint64_t opCount;
  // number of elements in planner->ipcMemQueue associated with this collective
  int nCleanupQueueElts;

+  struct ncclDevrWindow* sendWin;
+  struct ncclDevrWindow* recvWin;
  void* sendMhandle;
  void* recvMhandle;
  void** sendNetHandles;
@@ -237,12 +242,16 @@ struct ncclTaskColl {

  // Profiler plugin
  int eActivationMask;
+  void* groupApiEventHandle;
+  void* collApiEventHandle;
  void* eventHandle;
  uint8_t nChannels;
 };
+
 struct ncclTaskP2p {
  struct ncclTaskP2p* next;
  ncclFunc_t func;
+  ncclFunc_t collAPI;
  void* buff;
  size_t count;
  ncclDataType_t datatype;
@@ -252,6 +261,8 @@ struct ncclTaskP2p {

  // Profiler plugin
  int eActivationMask;
+  void* groupApiEventHandle;
+  void* p2pApiEventHandle;
  void* eventHandle;
  uint8_t nChannels;
 };
@@ -267,12 +278,14 @@ struct ncclKernelPlan {
  bool persistent; // aka captured in a graph
  bool isHostCbEnq;
  bool isSymColl;
+  bool isCeColl;
  enum ncclDevWorkStorageType workStorageType;
  bool kernelSpecialized;
  void* kernelFn;
  union {
    struct ncclDevKernelArgs* kernelArgs;
-    struct ncclSymDevArgs* kernelSymArgs;
+    void* kernelSymArgs;
+    struct ncclCeCollArgs* ceCollArgs;
  };
  size_t kernelArgsSize;
  struct channelMasks channelMask;
@@ -291,6 +304,8 @@ struct ncclKernelPlan {
  struct ncclIntruQueue<struct ncclProxyOp, &ncclProxyOp::enqNext> proxyOpQueue;

  // Profiler plugin
+  void* groupApiEventHandle;
+  void* kernelLaunchEventHandle;
  void* groupEventHandle;
 };

@@ -381,9 +396,8 @@ struct ncclKernelPlanner {
  struct ncclTaskCollSorter collSorter;
  struct Peer* peers/*[nRanks]*/;
  int nTasksColl, nTasksP2p;
+  int nTasksP2pSend, nTasksP2pRecv;
  bool persistent;
-  bool isSymColl;
-
  // The list of user streams aggregated over all tasks present.
  struct ncclCudaStreamList* streams;
  // Keep track of the number of user streams
@@ -401,6 +415,8 @@ struct ncclKernelPlanner {
  //////////////////////////////////////////////////////////////////////////////

  struct ncclIntruQueue<struct ncclTaskColl, &ncclTaskColl::next> collTaskQueue;
+  struct ncclIntruQueue<struct ncclTaskColl, &ncclTaskColl::next> collCeTaskQueue;
+  struct ncclIntruQueue<struct ncclTaskColl, &ncclTaskColl::next> collSymTaskQueue;
  struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> collWorkQueue;
  struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> tmpCollWorkQueue;
  struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next> collCleanupQueue;
@@ -459,6 +475,8 @@ typedef enum ncclGroupTaskType {
  ncclGroupTaskTypeNum = 2,
 } ncclGroupTaskType_t;

+struct ncclCommSymTeams;
+
 struct ncclComm {
  uint64_t startMagic;
  struct ncclMemoryStack memPermanent, memScoped;
@@ -478,10 +496,12 @@ struct ncclComm {
  bool peerInfoValid;

  ncclNet_t* ncclNet;
+  void* netContext;
  int netPluginIndex;
  int ncclNetVer;
  ncclNetDeviceType netDeviceType;
  ncclCollNet_t* ncclCollNet;
+  void* collNetContext;
  void* bootstrap;
  // Bitmasks for ncclTransportP2pSetup
  struct channelMasks* connectSend;
@@ -517,6 +537,7 @@ struct ncclComm {
  int localRank;
  int localRanks;
  int maxLocalRanks;
+  int minLocalRanks;
  int* rankToNode;
  int* rankToLocalRank;
  int* localRankToRank;
@@ -527,6 +548,9 @@ struct ncclComm {
  struct cliqueInfo clique; // Our MNNVL clique information
  int cliqueRank; // Our rank within the MNNVL clique

+  // NVL Domain info
+  ncclNvlDomainInfo_v5_t nvlDomainInfo;
+
  bool checkPointers;
  bool dmaBufSupport;

@@ -553,7 +577,8 @@ struct ncclComm {
  int p2pChunkSize;
  int nvlsChunkSize;

-  // Algorithm/Protocols thresholds
+  // Tuner values
+  ncclTunerConstants_t tunerConstants;
  ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
@@ -579,8 +604,7 @@ struct ncclComm {
  bool hasFineGrain;

  // Device side of the communicator (for cudaFree's)
-  struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm
-  struct ncclSymDevComm symDevComm;
+  struct ncclKernelComm* devComm; // actually = &ncclKernelCommAndChannels::comm

  uint32_t workArgsBytes; // max size of kernel args
  uint32_t workFifoBytes; // size of workFifoBuf, power of 2
@@ -703,6 +727,10 @@ struct ncclComm {
  uint64_t seqNumber[NCCL_NUM_FUNCTIONS];
  struct ncclProfilerProxy profiler;

+  // CE Collective
+  struct ncclCeColl ceColl;
+  struct ncclIntruQueue<struct ncclCeInitTask, &ncclCeInitTask::next> ceInitTaskQueue;
+  
  // buffer registration cache
  struct ncclRegCache regCache;
  int isAllNvlink;
@@ -712,13 +740,8 @@ struct ncclComm {
  bool useGdr;
  int splitCount;

-  // symmetric buffer
-  uint8_t* baseUCSymPtr;
-  uint8_t* baseMCSymPtr;
-  size_t baseStride;
-  size_t symAllocHead;
-  CUmemGenericAllocationHandle symMCHandle;
-  struct ncclIntruQueue<struct ncclSymRegTask, &ncclSymRegTask::next> symRegTaskQueue;
+  struct ncclDevrState devrState; // The symmetric runtime state
+  struct ncclSymkState symkState; // The symmetric kernels state (built on previous)

  // unroll factor for comm [RCCL]
  int unroll;
@@ -17,6 +17,7 @@

 #ifdef PROFAPI
 #define NCCL_API(ret, func, args...)        \
+    extern "C"                              \
    __attribute__ ((visibility("default"))) \
    __attribute__ ((alias(#func)))          \
    ret p##func (args);                     \
--- a/さらに表示
+++ b/さらに表示