Merge commit '3d4813d99196bb349eccd50a925e2addc8f1622c' into develop

2026-01-21 20:28:14 +00:00
@@ -34,7 +34,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: "ROCm/TheRock"
-          ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit
+          ref: ff46daa79b4c826c4f4676893d0d6586de567dfa # 2026-01-12 commit

      - name: Checkout rccl repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -143,5 +143,5 @@ jobs:
    with:
      amdgpu_families: ${{ inputs.amdgpu_families }}
      artifact_group: ${{ inputs.artifact_group }}
-      test_runs_on: linux-mi325-1gpu-ossci-rocm-frac
+      test_runs_on: linux-mi325-4gpu-ossci-rocm
      artifact_run_id: ${{ github.run_id }}
@@ -39,14 +39,15 @@ jobs:
    env:
      VENV_DIR: ${{ github.workspace }}/.venv
      ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id }}"
-      OUTPUT_ARTIFACTS_DIR: /home/arravikum/dist_new/dist/rocm
+      OUTPUT_ARTIFACTS_DIR: /apps/cvs_tests/dist_new/dist/rocm
      THEROCK_BIN_DIR: "./build/bin"
+      AWS_SHARED_CREDENTIALS_FILE: /apps/cvs_tests/awsconfig/credentials.ini
    steps:
      - name: Checkout Repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: "ROCm/TheRock"
-          ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit
+          ref: ff46daa79b4c826c4f4676893d0d6586de567dfa # 2026-01-12 commit

      - name: Run setup test environment workflow
        uses: './.github/actions/setup_test_environment'
@@ -61,20 +62,11 @@ jobs:

      # The following step leverages slurm to run multi node rccl tests on the slurm mi350x cluster.
      # salloc will hold 4 nodes while the commands inside the block run. After the block completes, salloc automatically releases the nodes.
+      # The sbatch script runs the rccl_heatmap_cvs script, which validates and generates a bandwidth heatmap file for different RCCL collectives.
      - name: Test gfx950
        if: ${{ inputs.amdgpu_families == 'gfx950-dcgpu' }}
        run: |
-          salloc -N 4 -p meta64 -t 04:00:00 --exclusive bash -c "
-          source /home/arravikum/TheRock/.venv/bin/activate &&
-          cd /home/arravikum/cvs &&
-          python input/setup.py &&
-          pytest -vvv -s ./tests/rccl/rccl_multinode_cvs.py \
-              --cluster_file ./input/cluster.json \
-              --config_file ./input/mi350_config.json \
-              --log-file=/tmp/rccl_log.log \
-              --html=/home/arravikum/cvs/test_reports/ci_test_report.html \
-              --capture=tee-sys \
-              --self-contained-html"
+          SETUP_NODES=1 sbatch --wait -N4 /apps/cvs_tests/cvs-sbatch/sbatch/default.sbatch

      - name: Configure AWS Credentials for non-forked repos
        if: ${{ always() && !github.event.pull_request.head.repo.fork }}
@@ -91,6 +83,6 @@ jobs:
          python3 build_tools/github_actions/upload_test_report_script.py \
            --run-id "${{ github.run_id }}" \
            --amdgpu-family "${{ inputs.amdgpu_families }}" \
-            --report-path "/home/arravikum/cvs/test_reports" \
+            --report-path "/apps/cvs_tests/test_reports" \
            --log-destination "/logs/gfx950-dcgpu" \
            --index-file-name "index_rccl_test_report.html"
@@ -30,13 +30,16 @@ jobs:
    name: 'Test single-node'
    runs-on: ${{ inputs.test_runs_on }}
    container:
-      image: ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:405945a40deaff9db90b9839c0f41d4cba4a383c1a7459b28627047bf6302a26
+      image: ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:4150afe4759d14822f0e3f8930e1124f26e11f68b5c7b91ec9a02b20b1ebbb98
      options: --ipc host
        --group-add video
        --device /dev/kfd
        --device /dev/dri
        --group-add 110
+        --ulimit memlock=-1:-1
+        --security-opt seccomp=unconfined
        --env-file /etc/podinfo/gha-gpu-isolation-settings
+        --user 0:0
    defaults:
      run:
        shell: bash
@@ -50,7 +53,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: "ROCm/TheRock"
-          ref: d76278526218def9fb1b016bc9e421738cb4f8f6 # 2025-12-09 commit
+          ref: ff46daa79b4c826c4f4676893d0d6586de567dfa # 2026-01-12 commit

      - name: Run setup test environment workflow
        uses: './.github/actions/setup_test_environment'
@@ -70,5 +73,5 @@ jobs:
        # TODO (geomin12): Rebuild rccl-tests without MPI to enable RCCL correctness tests.
        run: |
          pytest ./build_tools/github_actions/test_executable_scripts/test_rccl.py -v -s \
-            --log-cli-level=info \
-            -k "not test_rccl_correctness_tests"
+            -k "not test_rccl_correctness_tests" \
+            --log-cli-level=info
@@ -3,6 +3,6 @@
 /coverage/
 build/
 ext/
-
+src/transport/net_ib_rocm.cc
 # Visual Studio Code
-.vscode
+.vscode
@@ -8,3 +8,7 @@
 	url = https://github.com/nlohmann/json.git
 	ignore = dirty
 	shallow = true
+[submodule "ext-src/rocSHMEM"]
+	path = ext-src/rocSHMEM
+	url = https://github.com/ROCm/rocSHMEM.git
+	branch = develop
@@ -2,12 +2,29 @@

 Full documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs.io)

+## Unreleased - RCCL 2.28.3 for ROCm 7.11
+
+### Known issues
+* AllGather regression for small message sizes (less than 1 MB) due to the Direct algorithm.
+* ROCTx feature needs to be verified.
+* Profiler plugin needs to be verified.
+
+### Changed
+* Compatibility with NCCL 2.28.3.
+* The MSCCL feature is now disabled by default. The `--disable-msccl-kernel` build flag is replaced with `--enable-msccl-kernel` in the `rccl/install.sh` script.
+* MSCCL and NPKIT are deprecated and will be removed in a future release of RCCL.
+
 ## Unreleased - RCCL 2.27.7 for ROCm 7.2.0

 ### Changed
-
 * RCCL error messages have been made more verbose in several cases. RCCL now prints out fatal error messages by default. Fatal error messages can be suppressed by setting `NCCL_DEBUG=NONE`.
 * Disabled `reduceCopyPacks` pipelining for `gfx950`.
+* Experimental support for traffic shaping using warp specialization (also known as WarpSpeed) is now available for the Ring algorithm.
+* Enabling WarpSpeed in auto mode using `RCCL_WARP_SPEED_AUTO` optimizes performance and reduces the CU count by 50% on a single node for AllReduce, AllGather from 64 MB, and ReduceScatter from 256 MB.
+* The following configuration knobs control WarpSpeed behavior for debugging purposes: `RCCL_WARP_SPEED_ENABLE`, `RCCL_UNROLL_FACTOR`, `RCCL_WARP_SPEED_CU_COUNT`, and `RCCL_THREADS_PER_BLOCK`. Note that the effective unroll factor is calculated as 2 raised to the value of `RCCL_UNROLL_FACTOR`.
+
+### Known issues
+* AllToAllv/AllToAll hangs on a single GPU.

 ## Unreleased - RCCL 2.27.7 for ROCm 7.1.1

@@ -26,7 +26,7 @@ option(BUILD_TESTS                             "Build unit test programs"
 option(COLLTRACE                               "Collective Trace Option"                       ON)
 option(DUMP_ASM                                "Disassemble and dump"                          OFF)
 option(ENABLE_CODE_COVERAGE                    "Enable code coverage"                          OFF)
-option(ENABLE_MSCCL_KERNEL                     "Enable MSCCL while compiling"                  ON)
+option(ENABLE_MSCCL_KERNEL                     "Enable MSCCL while compiling"                  OFF)
 option(ENABLE_MSCCLPP                          "Enable MSCCL++"                                OFF)
 option(ENABLE_MSCCLPP_CLIP                     "Enable MSCCL++ CLIP"                           OFF)
 option(ENABLE_MSCCLPP_EXECUTOR                 "Enable MSCCL++ Executor"                       OFF)
@@ -42,6 +42,7 @@ option(TIMETRACE                               "Enable time-trace during compila
 option(TRACE                                   "Enable additional tracing"                     OFF)
 option(FAULT_INJECTION                         "Enable fault injection"                        ON)
 option(QUIET_WARNINGS                          "Supress compiler warnings"                     OFF)
+option(ENABLE_ROCSHMEM                         "Enable rocSHMEM support in RCCL"               OFF)

 # Default GPU architectures to build
 #==================================================================================================
@@ -65,6 +66,11 @@ include(CheckSymbolExists)
 include(cmake/Dependencies.cmake) # GTest, rocm-cmake, rocm_local_targets
 include(cmake/CheckSymbolExistsNoWarn.cmake)

+# Include rocSHMEM build module only if enabled
+if(ENABLE_ROCSHMEM)
+  include(cmake/ROCSHMEM.cmake)
+endif()
+
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")

 # Build only for local GPU architecture
@@ -80,6 +86,9 @@ endif()
 # Determine which GPU architectures to build for
 set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if GPU_TARGETS is not defined.")

+# ROCM NetIB patch
+include(cmake/rocmIb.cmake)
+
 # Modify GPU architectures for Address Sanitizer builds by appending "xnack+"
 if (BUILD_ADDRESS_SANITIZER)
  SET(amdgpu_targets "")
@@ -252,26 +261,56 @@ find_package(hsa-runtime64 REQUIRED)
 get_target_property(HSA_INCLUDE_PATH hsa-runtime64::hsa-runtime64 INTERFACE_INCLUDE_DIRECTORIES)
 message(STATUS "HSA runtime: ${HSA_INCLUDE_PATH}")

-## Check for ROCM-smi
-find_package(rocm_smi PATHS ${ROCM_PATH}/lib/cmake/rocm_smi)
-if (rocm_smi_FOUND)
-  message(STATUS "Found rocm_smi at ${ROCM_SMI_INCLUDE_DIR}")
-else()
-  message(STATUS "Checking old include directory structure for rocm_smi")
-  set(ROCM_SMI_INCLUDE_DIR "${ROCM_PATH}/rocm_smi/include")
-  set(ROCM_SMI_LIB_DIR     "${ROCM_PATH}/rocm_smi/lib")
-  set(ROCM_SMI_LIBRARIES   rocm_smi64)
+## Check for amd-smi if ROCm 7.11.0 or newer
+if(ROCM_VERSION VERSION_GREATER_EQUAL "71100")
+  find_package(amd_smi PATHS ${ROCM_PATH}/lib/cmake/amd_smi)
+  if(amd_smi_FOUND)
+    message(STATUS "amd_smi_INCLUDE_DIR: ${amd_smi_INCLUDE_DIR}")
+    message(STATUS "amd_smi_LIB_DIR: ${amd_smi_LIB_DIR}")
+    set(SMI_INCLUDE_DIR "${amd_smi_INCLUDE_DIR}" CACHE INTERNAL "amd-smi include directory")
+    set(SMI_LIB_DIR "${amd_smi_LIB_DIR}" CACHE INTERNAL "amd-smi library directory")
+    set(SMI_LIB_NAME "amd-smi-lib" CACHE INTERNAL "amd-smi-lib for packaging")
+    if(NOT EXISTS "${SMI_INCLUDE_DIR}" OR NOT EXISTS "${SMI_LIB_DIR}")
+      message(FATAL_ERROR "amd_smi not found in ${SMI_INCLUDE_DIR}")
+    endif()
+    message(STATUS "Found amd_smi at ${SMI_INCLUDE_DIR}")
+    set(SMI_LIBRARIES amd_smi)
+    set(USE_AMDSMI ON CACHE INTERNAL "Use amd-smi instead of rocm-smi")
+  endif()
+endif()
+
+if(NOT USE_AMDSMI)
+  ## Fallback to rocm-smi: taken when amd-smi was not found or was not searched for (ROCm < 7.11.0)
+  message(WARNING "amd_smi is not in use (not found, or ROCm is older than 7.11.0). Falling back to rocm_smi.")
+  find_package(rocm_smi PATHS ${ROCM_PATH}/lib/cmake/rocm_smi)
+  if(rocm_smi_FOUND)
+    set(SMI_INCLUDE_DIR "${rocm_smi_INCLUDE_DIR}" CACHE INTERNAL "rocm-smi include directory")
+    set(SMI_LIB_DIR "${rocm_smi_LIB_DIR}" CACHE INTERNAL "rocm-smi library directory")
+  else()
+    message(WARNING "CMake could not find rocm-smi. Checking old include directory structure for rocm_smi")
+    set(SMI_INCLUDE_DIR "${ROCM_PATH}/rocm_smi/include")
+    set(SMI_LIB_DIR     "${ROCM_PATH}/rocm_smi/lib")
+  endif()
+
+  if(NOT EXISTS "${SMI_INCLUDE_DIR}" OR NOT EXISTS "${SMI_LIB_DIR}")
+    message(FATAL_ERROR "rocm_smi not found in ${SMI_INCLUDE_DIR}")
+  endif()
+  message(STATUS "Found rocm_smi at ${SMI_INCLUDE_DIR}")
+  set(SMI_LIB_NAME "rocm-smi-lib" CACHE INTERNAL "rocm-smi-lib for packaging")
+  set(SMI_LIBRARIES rocm_smi64)
+
+  check_include_file_cxx("${SMI_INCLUDE_DIR}/rocm_smi/rocm_smi64Config.h" HAVE_ROCM_SMI64CONFIG)
+
+  ### Check for RSMI_INIT_FLAG_THRAD_ONLY_MUTEX support ("THRAD" is the spelling searched for in rocm_smi.h, not a typo here)
+  file(READ "${SMI_INCLUDE_DIR}/rocm_smi/rocm_smi.h" rocm_smi_incl)
+  string(FIND "${rocm_smi_incl}" "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX" matchres)
+  if(${matchres} EQUAL -1)
+    message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX not supported")
+  else()
+    message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX supported")
+    set(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX True)
+  endif ()
 endif()
-check_include_file_cxx("${ROCM_SMI_INCLUDE_DIR}/rocm_smi/rocm_smi64Config.h" HAVE_ROCM_SMI64CONFIG)
-### Check for RSMI_INIT_FLAG_THRAD_ONLY_MUTEX support
-file(READ "${ROCM_SMI_INCLUDE_DIR}/rocm_smi/rocm_smi.h" rocm_smi_incl)
-string(FIND "${rocm_smi_incl}" "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX" matchres)
-if(${matchres} EQUAL -1)
-  message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX not supported")
-else()
-  message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX supported")
-  set(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX True)
-endif ()

 ## Check for BFD library if custom backtrace is requested
 if(BUILD_BFD)
@@ -318,6 +357,8 @@ if(BUILD_BFD)
  endif()
 endif()

+
+
 # Check for --amdgpu-kernarg-preload-count
 check_cxx_compiler_flag("-mllvm --amdgpu-kernarg-preload-count=16" HAVE_KERNARG_PRELOAD)
 if (HAVE_KERNARG_PRELOAD)
@@ -333,6 +374,7 @@ endif()
 ## Currently MSCCL++ is supported only on gfx942 and gfx950, and only on Ubuntu and CentOS
 set(MSCCLPP_SUPPORTED_ARCHS "gfx942" "gfx942:xnack-" "gfx942:xnack+" "gfx950" "gfx950:xnack-" "gfx950:xnack+")

+
 # Check if any of the supported architectures are in GPU_TARGETS
 set(ARCH_MATCH_FOUND OFF)
 set(MSCCLPP_GPU_TARGETS "")
@@ -355,6 +397,20 @@ if (ENABLE_MSCCLPP AND ROCM_VERSION VERSION_LESS "60200")
  message(WARNING "MSCCL++ integration only supported on ROCm 6.2.0 or greater; disabling MSCCL++ build")
 endif()

+## Disable WARP_SPEED unless a GPU target is a supported arch. NOTE(review): reuses ARCH_MATCH_FOUND from the MSCCL++ check and warns even if WARP_SPEED was not requested - confirm.
+set(WARP_SPEED_SUPPORTED_ARCHS "gfx942" "gfx942:xnack-" "gfx942:xnack+" "gfx950" "gfx950:xnack-" "gfx950:xnack+")
+set(ARCH_MATCH_FOUND OFF)
+foreach(ARCH IN LISTS GPU_TARGETS)
+  if(ARCH IN_LIST WARP_SPEED_SUPPORTED_ARCHS)
+    set(ARCH_MATCH_FOUND ON)
+  endif()
+endforeach()
+if (NOT ARCH_MATCH_FOUND)
+  set(ENABLE_WARP_SPEED OFF)
+  message(WARNING "Can only build WARP_SPEED for supported GPU_TARGETS: ${WARP_SPEED_SUPPORTED_ARCHS}; current GPU_TARGETS: ${GPU_TARGETS}; so disabling WARP_SPEED build")
+endif()
+
+
 # cmake_host_system_information(RESULT HOST_OS_ID QUERY DISTRIB_ID) ## Requires cmake 3.22
 execute_process(
  COMMAND bash -c "grep '^ID=' /etc/os-release | cut -d'=' -f2 | cut -d'\"' -f2"
@@ -437,9 +493,12 @@ configure_file(src/nccl.h.in ${PROJECT_BINARY_DIR}/include/nccl.h)      # Used b
 set(SRC_FILES
  src/allocator.cc
  src/bootstrap.cc
+  src/ce_coll.cc
  src/channel.cc
  src/collectives.cc
+  src/commDump.cc
  src/debug.cc
+  src/dev_runtime.cc
  src/enqueue.cc
  src/group.cc
  src/init.cc
@@ -448,11 +507,12 @@ set(SRC_FILES
  src/msccl.cc
  src/proxy.cc
  src/rccl_wrap.cc
-  src/symmetric.cc
+  src/sym_kernels.cc
  src/transport.cc
  src/device/all_gather.h
  src/device/all_reduce.h
  src/device/alltoall_pivot.h
+  src/device/alltoall_gda.h
  src/device/broadcast.h
  src/device/common.h
  src/device/common_kernel.h
@@ -498,6 +558,7 @@ set(SRC_FILES
  src/include/BfdBacktrace.hpp
  src/include/bitops.h
  src/include/bootstrap.h
+  src/include/ce_coll.h
  src/include/channel.h
  src/include/checks.h
  src/include/collectives.h
@@ -507,6 +568,7 @@ set(SRC_FILES
  src/include/cpuset.h
 # src/include/cudawrap.h
  src/include/debug.h
+  src/include/dev_runtime.h
  src/include/device.h
  src/include/enqueue.h
  src/include/gdrwrap.h
@@ -521,6 +583,7 @@ set(SRC_FILES
  src/include/ipcsocket.h
  src/include/mnnvl.h
  src/include/nccl_common.h
+  src/include/nccl_device.h
  src/include/net_device.h
  src/include/net.h
  src/include/nvmlwrap.h
@@ -537,16 +600,16 @@ set(SRC_FILES
  src/include/register.h
  src/include/register_inline.h
  src/include/rccl_float8.h
-  src/include/rocm_smi_wrap.h
  src/include/rocmwrap.h
  src/include/roctx.h
  src/include/recorder.h
+  src/include/scheduler.h
  src/include/shm.h
  src/include/shmutils.h
  src/include/signals.h
  src/include/socket.h
  src/include/strongstream.h
-  src/include/symmetric.h
+  src/include/sym_kernels.h
  src/include/timer.h
  src/include/transport.h
  src/include/trees.h
@@ -555,12 +618,32 @@ set(SRC_FILES
  src/include/mlx5/mlx5dvcore.h
  src/include/mlx5/mlx5dvsymbols.h
  src/include/mlx5/mlx5dvwrap.h
+  src/include/ionic/ionicdvcore.h
+  src/include/ionic/ionicdvsymbols.h
+  src/include/ionic/ionicdvwrap.h
  src/include/msccl/msccl_lifecycle.h
  src/include/msccl/msccl_parser.h
  src/include/msccl/msccl_scheduler.h
  src/include/msccl/msccl_setup.h
  src/include/msccl/msccl_status.h
  src/include/msccl/msccl_struct.h
+  src/include/nccl_device/comm.h
+  src/include/nccl_device/coop.h
+  src/include/nccl_device/core.h
+  src/include/nccl_device/ll_a2a.h
+  src/include/nccl_device/mem_barrier.h
+  src/include/nccl_device/ptr.h
+  src/include/nccl_device/utility.h
+  src/include/nccl_device/impl/comm__funcs.h
+  src/include/nccl_device/impl/comm__types.h
+  src/include/nccl_device/impl/core__funcs.h
+  src/include/nccl_device/impl/core__types.h
+  src/include/nccl_device/impl/ll_a2a__funcs.h
+  src/include/nccl_device/impl/ll_a2a__types.h
+  src/include/nccl_device/impl/mem_barrier__funcs.h
+  src/include/nccl_device/impl/mem_barrier__types.h
+  src/include/nccl_device/impl/ptr__funcs.h
+  src/include/nccl_device/impl/ptr__types.h
  src/include/npkit/npkit.h
  src/include/npkit/npkit_event.h
  src/include/npkit/npkit_struct.h
@@ -608,6 +691,7 @@ set(SRC_FILES
  src/include/plugin/net/net_v8.h
  src/include/plugin/net/net_v9.h
  src/include/plugin/net/net_v10.h
+  src/include/plugin/net/net_v11.h
  src/include/plugin/profiler/net_ib_v1.h
  src/include/plugin/profiler/net_ib.h
  src/include/plugin/profiler/net_socket_v1.h
@@ -616,9 +700,11 @@ set(SRC_FILES
  src/include/plugin/profiler/profiler_v2.h
  src/include/plugin/profiler/profiler_v3.h
  src/include/plugin/profiler/profiler_v4.h
+  src/include/plugin/profiler/profiler_v5.h
  src/include/plugin/tuner/tuner_v2.h
  src/include/plugin/tuner/tuner_v3.h
  src/include/plugin/tuner/tuner_v4.h
+  src/include/plugin/tuner/tuner_v5.h
  src/misc/alt_rsmi.cc
  src/misc/archinfo.cc
  src/misc/argcheck.cc
@@ -631,11 +717,12 @@ set(SRC_FILES
  src/misc/ipcsocket.cc
  src/misc/mlx5dvsymbols.cc
  src/misc/mlx5dvwrap.cc
+  src/misc/ionicdvsymbols.cc
+  src/misc/ionicdvwrap.cc
  src/misc/npkit.cc
 # src/misc/nvmlwrap.cc
  src/misc/nvmlwrap_stub.cc
  src/misc/param.cc
-  src/misc/rocm_smi_wrap.cc
  src/misc/rocmwrap.cc
  src/misc/roctx.cc
  src/misc/recorder.cc
@@ -649,6 +736,9 @@ set(SRC_FILES
  src/misc/msccl/msccl_setup.cc
  src/misc/msccl/msccl_status.cc
  src/misc/proxy_trace/proxy_trace.cc
+  src/nccl_device/core.cc
+  src/nccl_device/ll_a2a.cc
+  src/nccl_device/mem_barrier.cc
  src/plugin/net.cc
  src/plugin/plugin_open.cc
  src/plugin/profiler.cc
@@ -658,13 +748,16 @@ set(SRC_FILES
  src/plugin/net/net_v8.cc
  src/plugin/net/net_v9.cc
  src/plugin/net/net_v10.cc
+  src/plugin/net/net_v11.cc
  src/plugin/profiler/profiler_v1.cc
  src/plugin/profiler/profiler_v2.cc
  src/plugin/profiler/profiler_v3.cc
  src/plugin/profiler/profiler_v4.cc
+  src/plugin/profiler/profiler_v5.cc
  src/plugin/tuner/tuner_v2.cc
  src/plugin/tuner/tuner_v3.cc
  src/plugin/tuner/tuner_v4.cc
+  src/plugin/tuner/tuner_v5.cc
  src/ras/client.cc
  src/ras/client_support.cc
  src/ras/collectives.cc
@@ -675,10 +768,12 @@ set(SRC_FILES
  src/register/coll_reg.cc
  src/register/register.cc
  src/register/sendrecv_reg.cc
+  src/scheduler/symmetric_sched.cc
  src/transport/coll_net.cc
  src/transport/generic.cc
  src/transport/net.cc
  src/transport/net_ib.cc
+  src/transport/net_ib_rocm.cc
  src/transport/net_socket.cc
  src/transport/nvls.cc
  src/transport/p2p.cc
@@ -695,6 +790,19 @@ set(SRC_FILES
  src/misc/latency_profiler/CollTraceUtils.cc
 )

+if(USE_AMDSMI)
+  set(SMI_SOURCES
+    src/include/amdsmi_wrap.h
+    src/misc/amdsmi_wrap.cc
+  )
+else()
+  set(SMI_SOURCES
+    src/include/rocm_smi_wrap.h
+    src/misc/rocm_smi_wrap.cc
+  )
+endif()
+list(APPEND SRC_FILES ${SMI_SOURCES})
+
 if (ENABLE_MSCCL_KERNEL)
  set(MSCCL_KERNEL_SOURCES
    src/device/msccl_kernel_impl.h
@@ -846,6 +954,8 @@ target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/device)
 target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/device/network/unpack)
 target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include)
 target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/mlx5)
+target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/nccl_device)
+target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/ionic)
 target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/plugin)
 target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/gensrc)
 target_include_directories(rccl PRIVATE ${HSA_INCLUDE_PATH})
@@ -858,26 +968,59 @@ if(ROCTX_ENABLE)
  target_include_directories(rccl PRIVATE ${ROCTRACER_INCLUDE_DIR})
 endif()

+
 ## Set RCCL compile definitions
 if(COLLTRACE)
  target_compile_definitions(rccl PRIVATE ENABLE_COLLTRACE)
 endif()
 if(ENABLE_MSCCL_KERNEL)
+  message(WARNING "MSCCL is deprecated and will be removed in a future version of RCCL.")
  target_compile_definitions(rccl PRIVATE COMPILE_MSCCL_KERNEL)
 endif()
 if(ENABLE_MSCCLPP)
  target_compile_definitions(rccl PRIVATE ENABLE_MSCCLPP)
 endif()
-if(HAVE_ROCM_SMI64CONFIG)
-  target_compile_definitions(rccl PRIVATE USE_ROCM_SMI64CONFIG)
+
+if(USE_AMDSMI)
+  target_compile_definitions(rccl PRIVATE USE_AMDSMI)
+else()
+  if(HAVE_ROCM_SMI64CONFIG)
+    target_compile_definitions(rccl PRIVATE USE_ROCM_SMI64CONFIG)
+  endif()
+  if(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX)
+    target_compile_definitions(rccl PRIVATE USE_ROCM_SMI_THREAD_ONLY_MUTEX)
+  endif()
 endif()
-if(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX)
-  target_compile_definitions(rccl PRIVATE USE_ROCM_SMI_THREAD_ONLY_MUTEX)
+if(ENABLE_WARP_SPEED)
+  target_compile_definitions(rccl PRIVATE ENABLE_WARP_SPEED)
+endif()
+if(ENABLE_ROCSHMEM)
+  target_compile_definitions(rccl PRIVATE ENABLE_ROCSHMEM)
+endif()
+
+# ==== rocSHMEM integration (optional) ====
+
+if (ENABLE_ROCSHMEM)
+  add_rocshmem_targets()
+  # Ensure rocSHMEM is fully built/installed before compiling rccl (rocshmem_ext exists only when it is built from the submodule)
+  if (TARGET rocshmem_ext)
+    add_dependencies(rccl rocshmem_ext)
+  endif()
+
+  if (ROCSHMEM_INCLUDE_DIR)
+    target_include_directories(rccl PRIVATE ${ROCSHMEM_INCLUDE_DIR})
+  endif()
+
+  # ROCSHMEM_LIBRARY is linked further down, next to the MSCCL++ target_link_libraries call.
+  # NOTE(review): IBVERBS is linked here and again in that later ENABLE_ROCSHMEM block; confirm both are needed.
+  target_link_libraries(rccl PRIVATE ${IBVERBS})
+
 endif()

 # NPKit flags
 ## May be better to move these to a separate file
 if(ENABLE_NPKIT)
+  message(WARNING "NPKit is deprecated and will be removed in a future version of RCCL. Please consider using alternative profiling tools.")
  target_compile_definitions(rccl PRIVATE ENABLE_NPKIT)
  target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
  target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
@@ -1099,8 +1242,7 @@ if(ENABLE_CODE_COVERAGE)
  message(STATUS "Code coverage is enabled with build type '${CMAKE_BUILD_TYPE}'.")

  target_compile_options(rccl PRIVATE
-    -fvisibility=default -Xarch_host -fprofile-instr-generate
-    -Xarch_host -fcoverage-mapping)
+    -fvisibility=default -Xarch_host -fprofile-instr-generate -Xarch_host -fcoverage-mapping)

  set(COVERAGE_SHARED_LINKER_FLAGS
    -fprofile-generate
@@ -1169,7 +1311,7 @@ if (FAULT_INJECTION)
 endif()

 ## Set RCCL linked library directories
-target_link_directories(rccl PRIVATE ${ROCM_SMI_LIB_DIR})
+target_link_directories(rccl PRIVATE ${SMI_LIB_DIR})

 if (ROCM_VERSION VERSION_GREATER_EQUAL "60100")
    option(RCCL_ROCPROFILER_REGISTER "Enable rocprofiler-register support" ON)
@@ -1201,11 +1343,15 @@ target_link_libraries(rccl PRIVATE   Threads::Threads)
 target_link_libraries(rccl INTERFACE hip::host)
 target_link_libraries(rccl PRIVATE   hip::device)
 target_link_libraries(rccl PRIVATE   dl)
-target_link_libraries(rccl PRIVATE   ${ROCM_SMI_LIBRARIES})
+target_link_libraries(rccl PRIVATE   ${SMI_LIBRARIES})
 target_link_libraries(rccl PRIVATE fmt::fmt-header-only)
 if(ENABLE_MSCCLPP)
  target_link_libraries(rccl PRIVATE mscclpp_nccl)
 endif()
+if(ENABLE_ROCSHMEM)
+  target_link_libraries(rccl PRIVATE ${ROCSHMEM_LIBRARY})
+  target_link_libraries(rccl PRIVATE ${IBVERBS})
+endif()

 ## Set RCCL link options
 ## Find out available memory
@@ -1317,7 +1463,8 @@ if(BUILD_ADDRESS_SANITIZER)
 else()
  set(DEPENDS_HIP_RUNTIME "hip-runtime-amd" )
 endif()
-rocm_package_add_dependencies(DEPENDS "${DEPENDS_HIP_RUNTIME} >= 4.5.0" "rocm-smi-lib >= 4.0.0")
+
+rocm_package_add_dependencies(DEPENDS "${DEPENDS_HIP_RUNTIME} >= 4.5.0" "${SMI_LIB_NAME}")
 set(CPACK_DEB_COMPONENT_INSTALL ON)
 set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS ON)
 set(CPACK_RPM_COMPONENT_INSTALL ON)
@@ -42,7 +42,7 @@ RCCL build & installation helper script
       --debug                 Build debug library
       --enable_backtrace      Build with custom backtrace support
       --disable-colltrace     Build without collective trace
-       --disable-msccl-kernel  Build without MSCCL kernels
+       --enable-msccl-kernel   Build with MSCCL kernels
       --enable-mscclpp        Build with MSCCL++ support
       --enable-mscclpp-clip   Build MSCCL++ with clip wrapper on bfloat16 and half addition routines
       --disable-roctx         Build without ROCTX logging
@@ -0,0 +1,35 @@
+# MIT License
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Find module for a prebuilt rocSHMEM (ROCSHMEM_INSTALL_DIR is a search hint); sets ROCSHMEM_INCLUDE_DIR, ROCSHMEM_LIBRARY and IBVERBS.
+# find_package_handle_standard_args comes from this module; include it explicitly rather than relying on an earlier find module.
+include(FindPackageHandleStandardArgs)
+
+find_path(ROCSHMEM_INCLUDE_DIR
+    NAMES rocshmem/rocshmem.hpp rocshmem/rocshmem.h
+    HINTS ${ROCSHMEM_INSTALL_DIR}/include/)
+find_library(ROCSHMEM_LIBRARY NAMES rocshmem HINTS ${ROCSHMEM_INSTALL_DIR}/lib)
+
+# TODO: decide whether IBVERBS should be a required variable; today a missing libibverbs does not fail this module.
+find_library(IBVERBS ibverbs)
+
+find_package_handle_standard_args(rocshmem_static DEFAULT_MSG ROCSHMEM_INCLUDE_DIR ROCSHMEM_LIBRARY)
@@ -0,0 +1,113 @@
+# MIT License
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+include(ExternalProject)
+
+function(add_rocshmem_targets)
+    # Make rocSHMEM available to RCCL and export ROCSHMEM_INCLUDE_DIR, ROCSHMEM_LIBRARY and IBVERBS to the caller's scope.
+    # Step 1: if the user supplied ROCSHMEM_INSTALL_DIR, look for a prebuilt installation via find_package(rocshmem_static)
+    if(ROCSHMEM_INSTALL_DIR)
+        list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+        find_package(rocshmem_static)
+        if(NOT IBVERBS)
+            find_library(IBVERBS ibverbs)
+            if(IBVERBS)
+                set(IBVERBS ${IBVERBS} PARENT_SCOPE)
+            endif()
+        endif()
+    endif()
+
+    # Step 2: if no prebuilt installation was found, build the ext-src/rocSHMEM submodule and install it into ext/rocshmem
+    if(NOT rocshmem_static_FOUND)
+        set(_rccl_root            "${CMAKE_SOURCE_DIR}")
+        set(ROCSHMEM_SOURCE       "${_rccl_root}/ext-src/rocSHMEM")
+        set(ROCSHMEM_INSTALL_DIR  "${_rccl_root}/ext/rocshmem")
+
+        # Custom rule + target that checks out the submodule when its CMakeLists.txt is missing (same style as MSCCL++)
+        add_custom_command(
+            OUTPUT "${ROCSHMEM_SOURCE}/CMakeLists.txt"
+            COMMAND git submodule update --init --recursive ext-src/rocSHMEM
+            WORKING_DIRECTORY "${_rccl_root}"
+            COMMENT "Checking out submodule: ext-src/rocSHMEM"
+            VERBATIM
+        )
+
+        add_custom_target(rocshmem_checkout_submodule
+            DEPENDS "${ROCSHMEM_SOURCE}/CMakeLists.txt")
+
+        # Where our patch files live (like MSCCL++). NOTE(review): EXT_SOURCE is not read again in this function - confirm it is still needed.
+        set(EXT_SOURCE "${_rccl_root}/ext-src")
+
+        # Build and install rocSHMEM: run ../scripts/build_configs/gda_bnxt from a 'build'
+        # directory inside the source tree, then configure, build and install with CMake.
+        ExternalProject_Add(rocshmem_ext
+            SOURCE_DIR          "${ROCSHMEM_SOURCE}"
+            INSTALL_DIR         "${ROCSHMEM_INSTALL_DIR}"
+            UPDATE_DISCONNECTED TRUE
+            LOG_DOWNLOAD        FALSE
+            LOG_CONFIGURE       FALSE
+            LOG_BUILD           FALSE
+            LOG_INSTALL         FALSE
+            BUILD_IN_SOURCE     TRUE
+            DOWNLOAD_COMMAND    ""   # using the submodule checkout above
+            TEST_COMMAND        ""
+            DEPENDS             rocshmem_checkout_submodule   
+
+            # Rocshmem submodule commit hash -> commit b28a56bd54ccc581d05a439ffa466c3dacb3385 (NOTE(review): 39 hex digits, a full SHA-1 has 40 - confirm)
+            # The project has its own scripts; we replicate the README sequence:
+            CONFIGURE_COMMAND   ""
+            BUILD_COMMAND
+                ${CMAKE_COMMAND} -E make_directory build
+		&& ${CMAKE_COMMAND} -E chdir build bash -lc "../scripts/build_configs/gda_bnxt -DUSE_EXTERNAL_MPI=OFF -DUSE_IPC=ON -DBUILD_EXAMPLES=OFF "
+                && ${CMAKE_COMMAND} -E chdir build ${CMAKE_COMMAND}
+                    -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+                    -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
+                    -DBUILD_EXAMPLES=OFF ..
+                && ${CMAKE_COMMAND} -E chdir build ${CMAKE_MAKE_PROGRAM} -j
+            INSTALL_COMMAND
+                ${CMAKE_COMMAND} -E chdir build ${CMAKE_MAKE_PROGRAM} install
+        )
+
+        # Export the locations the install step will populate; these are set at configure time, before rocshmem_ext has run
+        set(ROCSHMEM_INCLUDE_DIR "${ROCSHMEM_INSTALL_DIR}/include" PARENT_SCOPE)
+        set(ROCSHMEM_LIBRARY      "${ROCSHMEM_INSTALL_DIR}/lib/librocshmem.a" PARENT_SCOPE)
+        find_library(_IBVERBS ibverbs)
+        if(NOT _IBVERBS)
+            message(FATAL_ERROR "libibverbs not found (install rdma-core/libibverbs-dev)")
+        endif()
+        set(IBVERBS ${_IBVERBS} PARENT_SCOPE)
+
+        # Provide a dummy target other code can depend on
+        add_custom_target(rocshmem_static ALL DEPENDS rocshmem_ext)
+    else()
+    # We found a prebuilt rocSHMEM; export variables upward as-is (libibverbs is still mandatory)
+    set(ROCSHMEM_INCLUDE_DIR  "${ROCSHMEM_INCLUDE_DIR}" PARENT_SCOPE)
+    set(ROCSHMEM_LIBRARY      "${ROCSHMEM_LIBRARY}"      PARENT_SCOPE)
+
+    find_library(_IBVERBS ibverbs)
+    if(NOT _IBVERBS)
+        message(FATAL_ERROR "libibverbs not found")
+    endif()
+    set(IBVERBS ${_IBVERBS} PARENT_SCOPE)
+    endif()
+
+endfunction()
@@ -0,0 +1,146 @@
+# MIT License
+#
+# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Dependencies
+
+# HIP dependency is handled earlier in the project cmake file
+# when VerifyCompiler.cmake is included.
+
+# GIT
+
+# Test dependencies
+
+# For downloading, building, and installing required dependencies
+include(cmake/DownloadProject.cmake)
+
+message(STATUS "Generating ROCM NetIB... ")
+
+# -------------------------
+# Configurable paths
+# -------------------------
+# Path to RCCL source tree (local clone)
+set(RCCL_SRC_DIR "${CMAKE_SOURCE_DIR}" CACHE PATH "Path to RCCL source directory")
+# Path to patch file
+set(ROCM_NETIB_PATCH_FILE "${CMAKE_SOURCE_DIR}/ext-src/rocm_netib.patch" CACHE FILEPATH "ROCM NETIB Patch file to apply to RCCL")
+set(ROCM_NETIB_FILE "${CMAKE_SOURCE_DIR}/src/transport/net_ib_rocm.cc" CACHE FILEPATH "Generated ROCM NETIB file")
+
+# -------------------------
+# Find tools
+# -------------------------
+find_program(PATCH_EXECUTABLE patch)
+find_program(SED_EXECUTABLE sed)
+if(NOT PATCH_EXECUTABLE OR NOT SED_EXECUTABLE)
+  message(FATAL_ERROR "Generating ${ROCM_NETIB_FILE} requires the 'patch' and 'sed' programs")
+endif()
+
+# -------------------------
+# Generate the ROCM NetIB source
+# -------------------------
+# The status line is printed with message() instead of an extra COMMAND:
+# execute_process() connects consecutive COMMANDs as a pipeline, so an echo
+# placed before patch would be fed to patch's stdin and never shown.
+message(STATUS "Applying RCCL ROCM NetIB patch... to ${CMAKE_SOURCE_DIR}")
+
+# -o makes patch write the patched result to ROCM_NETIB_FILE rather than
+# modifying the patched file in place.
+execute_process(
+  COMMAND ${PATCH_EXECUTABLE} -p1 -i "${ROCM_NETIB_PATCH_FILE}" -o "${ROCM_NETIB_FILE}"
+  WORKING_DIRECTORY ${RCCL_SRC_DIR}
+  RESULT_VARIABLE ROCM_NETIB_PATCH_RESULT
+)
+if(NOT ROCM_NETIB_PATCH_RESULT EQUAL 0)
+  message(FATAL_ERROR "Applying ${ROCM_NETIB_PATCH_FILE} failed (patch result: ${ROCM_NETIB_PATCH_RESULT})")
+endif()
+
+# Identifier renames applied to the generated file. The expressions keep the
+# order of the original one-sed-per-rename sequence; sed applies them one after
+# another to every line, which yields the same text as separate passes.
+# "(" is a literal character here because sed defaults to basic regular
+# expressions.
+set(ROCM_NETIB_SED_EXPRESSIONS
+  "s/NCCL_PARAM(Ib/NCCL_PARAM(RocmIb/g"
+  "s/RCCL_PARAM(Ib/RCCL_PARAM(RocmIb/g"
+  "s/ncclParamIb/ncclParamRocmIb/g"
+  "s/rcclParamIb/rcclParamRocmIb/g"
+  "s/ncclIbMergedDevs/rocmIbMergedDevs/g"
+  "s/ncclIbDevs/rocmIbDevs/g"
+  "s/ncclIbLock/rocmIbLock/g"
+  "s/ibProviderName/rocmIbProviderName/g"
+  "s/ncclIbAsyncThread/rocmIbAsyncThread/g"
+  "s/ncclIbGdrSupport/rocmIbGdrSupport/g"
+  "s/ncclIbDmaBufSupport/rocmIbDmaBufSupport/g"
+  "s/ncclIbInitCommDevBase/rocmIbInitCommDevBase/g"
+  "s/ncclIbDestroyBase/rocmIbDestroyBase/g"
+  "s/ncclIbRtrQp/rocmIbRtrQp/g"
+  "s/ncclIbRtsQp/rocmIbRtsQp/g"
+  "s/ForceEnableGdrdma/RocmForceEnableGdrdma/g"
+  "s/ncclIbCheckVProps/rocmIbCheckVProps/g"
+  "s/ncclIbGetRequest/rocmIbGetRequest/g"
+  "s/ncclIbFreeRequest/rocmIbFreeRequest/g"
+  "s/ncclIbRegMrDmaBufInternal/rocmIbRegMrDmaBufInternal/g"
+  "s/ncclIbGetNetCommDevBase/rocmIbGetNetCommDevBase/g"
+  "s/ncclIbDeregMrInternal/rocmIbDeregMrInternal/g"
+  "s/ncclIbPostFifo/rocmIbPostFifo/g"
+  "s/reqTypeStr/rocmIbReqTypeStr/g"
+  "s/rcclNetP2pPolicy/rcclRocmNetP2pPolicy/g"
+  "s/ncclIbMakeVDeviceInternal/rocmIbMakeVDeviceInternal/g"
+  "s/ncclIbMakeVDevice/rocmIbMakeVDevice/g"
+  "s/ncclIbInit/rocmIbInit/g"
+  "s/ncclIbDevices/rocmIbDevices/g"
+  "s/ncclIbGetPhysProperties/rocmIbGetPhysProperties/g"
+  "s/ncclIbGetProperties/rocmIbGetProperties/g"
+  "s/ncclIbListen(/rocmIbListen(/g"
+  "s/ncclIbListen,/rocmIbListen,/g"
+  "s/ncclIbConnect(/rocmIbConnect(/g"
+  "s/ncclIbConnect /rocmIbConnect /g"
+  "s/ncclIbConnect,/rocmIbConnect,/g"
+  "s/ncclIbAccept/rocmIbAccept/g"
+  "s/ncclIbTest/rocmIbTest/g"
+  "s/ncclIbRegMrDmaBuf/rocmIbRegMrDmaBuf/g"
+  "s/ncclIbRegMr/rocmIbRegMr/g"
+  "s/ncclIbDeregMr/rocmIbDeregMr/g"
+  "s/ncclIbIsend/rocmIbIsend/g"
+  "s/ncclIbIrecv/rocmIbIrecv/g"
+  "s/ncclIbIflush/rocmIbIflush/g"
+  "s/ncclIbCloseSend/rocmIbCloseSend/g"
+  "s/ncclIbCloseRecv/rocmIbCloseRecv/g"
+  "s/ncclIbCloseListen/rocmIbCloseListen/g"
+  "s/ncclNetIb/rocmNetIb/g"
+  "s/ncclIbFinalize/rocmNetIbFinalize/g"
+  "s/ncclIbSetNetAttr/rocmNetIbSetNetAttr/g"
+)
+
+set(ROCM_NETIB_SED_ARGS "")
+foreach(ROCM_NETIB_SED_EXPR IN LISTS ROCM_NETIB_SED_EXPRESSIONS)
+  list(APPEND ROCM_NETIB_SED_ARGS -e "${ROCM_NETIB_SED_EXPR}")
+endforeach()
+
+execute_process(
+  COMMAND ${SED_EXECUTABLE} -i ${ROCM_NETIB_SED_ARGS} "${ROCM_NETIB_FILE}"
+  WORKING_DIRECTORY ${RCCL_SRC_DIR}
+  RESULT_VARIABLE ROCM_NETIB_SED_RESULT
+)
+if(NOT ROCM_NETIB_SED_RESULT EQUAL 0)
+  message(FATAL_ERROR "Renaming symbols in ${ROCM_NETIB_FILE} failed (sed result: ${ROCM_NETIB_SED_RESULT})")
+endif()
+
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
@@ -38,13 +38,15 @@ Collect this information about the ROCm version, GPU/accelerator, platform, and

      rocminfo

-*  Run these ``rocm-smi`` commands to display the system topology.
+*  Run these ``amd-smi`` commands to display the system topology.

   .. code:: shell

-      rocm-smi
-      rocm-smi --showtopo
-      rocm-smi --showdriverversion
+      amd-smi
+      amd-smi topology
+      amd-smi static --driver
+      amd-smi firmware
+      amd-smi xgmi

 *  Determine the values of the ``PATH`` and ``LD_LIBRARY_PATH`` environment variables.

@@ -1 +1 @@
-rocm-docs-core==1.26.0
+rocm-docs-core==1.29.0
@@ -25,7 +25,7 @@ breathe==4.35.0
    # via rocm-docs-core
 certifi==2024.7.4
    # via requests
-cffi==1.16.0
+cffi==2.0.0
    # via
    #   cryptography
    #   pynacl
@@ -164,7 +164,7 @@ pygments==2.18.0
    #   sphinx
 pyjwt[crypto]==2.8.0
    # via pygithub
-pynacl==1.5.0
+pynacl==1.6.2
    # via pygithub
 python-dateutil==2.9.0.post0
    # via jupyter-client
@@ -187,7 +187,7 @@ requests==2.32.4
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==1.26.0
+rocm-docs-core==1.29.0
    # via -r requirements.in
 rpds-py==0.22.3
    # via
@@ -265,7 +265,7 @@ typing-extensions==4.12.0
    #   pygithub
    #   referencing
    #   sqlalchemy
-urllib3==2.5.0
+urllib3==2.6.3
    # via
    #   pygithub
    #   requests
@@ -60,36 +60,36 @@ of newer ones.
 The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions
 from old API versions. It also provides error codes in `err.h`.

-# API (v10)
+# API (v11)

-Below is the main `ncclNet_v10` struct. Each function is explained in later sections.
+Below is the main `ncclNet_v11` struct. Each function is explained in later sections.

 ```
 typedef struct {
  // Name of the network (mainly for logs)
  const char* name;
  // Initialize the network.
-  ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
+  ncclResult_t (*init)(void** ctx, uint64_t commId, ncclNetCommConfig_v11_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
  // Return the number of adapters.
  ncclResult_t (*devices)(int* ndev);
  // Get various device properties.
-  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props);
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props);
  // Create a receiving object and provide a handle to connect to it. The
  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
  // between ranks to create a connection.
-  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  ncclResult_t (*listen)(void* ctx, int dev, void* handle, void** listenComm);
  // Connect to a handle and return a sending comm object for that peer.
  // This call must not block for the connection to be established, and instead
  // should return successfully with sendComm == NULL with the expectation that
  // it will be called again until sendComm != NULL.
  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
-  ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm);
+  ncclResult_t (*connect)(void* ctx, int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v11_t** sendDevComm);
  // Finalize connection establishment after remote peer has called connect.
  // This call must not block for the connection to be established, and instead
  // should return successfully with recvComm == NULL with the expectation that
  // it will be called again until recvComm != NULL.
  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
-  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm);
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v11_t** recvDevComm);
  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
@@ -191,6 +191,12 @@ This will allow the plugin to discover network devices and make sure they are us
 `init` function does not return `ncclSuccess`, then NCCL will not use the plugin and fall back on
 internal ones.

+Every call to `init` returns an opaque context that the plugin uses internally to allocate resources
+and manage state. This context is passed to the other net plugin calls that create further resources,
+such as `listen` and `connect`. Each context is uniquely associated with a communicator
+through the `commId` argument. The network can also be initialized with a per-communicator
+configuration using the `config` argument.
+
 To allow the plugin logs to integrate into the NCCL logs seemlessly, NCCL provides a logging
 function to `init`. This function is typically used to allow for `INFO` and `WARN` macros within
 the plugin code adding the following definitions:
@@ -282,7 +288,7 @@ side.
 `listen`

 To create a connection, NCCL will start by calling `listen` on the receiver side. This function
-takes a device number as input argument, and should return a local `listenComm` object, and a
+takes the opaque plugin context returned by `init` and a device number as input arguments, and should return a local `listenComm` object and a
 `handle` to pass to the other side, so that the sender side can connect to the receiver.

 The `handle` is a buffer of size `NCCL_NET_HANDLE_MAXSIZE` and is provided by NCCL.
@@ -304,7 +310,8 @@ the `listen` call previously. If the sender did not connect yet, `accept` should
 should return `ncclSuccess`, setting `recvComm` to `NULL`. NCCL will call `accept` again until it
 succeeds.

-The `connect` API takes a `ncclNetCommConfig_t`, which contains a trafficClass field.
+The `connect` API takes the opaque plugin context returned by `init`. The plugin context can reference
+the `ncclNetCommConfig_t` that was passed to the `init` function, which contains a `trafficClass` field.
 This field can be used by the network plugin to specify the QoS level of the connection. By default,
 `trafficClass` is set to -1 but can be configured by the application during communicator initialization
 to select a plugin-supported QoS level.
@@ -0,0 +1,19 @@
+# Example NCCL net plugin: a single C source built into a shared library.
+set(SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/plugin.c)
+
+# Shared library target
+add_library(nccl-net-example SHARED ${SRC_FILES})
+
+# Plugin API headers (net_vX.h, err.h, ...) are taken from the nccl/ subdirectory
+target_include_directories(nccl-net-example
+    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/nccl
+)
+
+# Keep the artifact name identical to the one produced by the Makefile and
+# emit the library into the unit-test plugin directory of the build tree
+set_target_properties(nccl-net-example PROPERTIES
+    LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/test/unit/plugins
+    POSITION_INDEPENDENT_CODE ON
+    PREFIX "lib"
+    OUTPUT_NAME "nccl-net-example"
+)
@@ -22,7 +22,9 @@

 // Maximum number of requests per comm object
 #define NCCL_NET_MAX_REQUESTS 32
+#define NCCL_NET_MAX_DEVS_PER_NIC 4

+#include "net_v11.h"
 #include "net_v10.h"
 #include "net_v9.h"
 #include "net_v8.h"
@@ -33,9 +35,9 @@
 #include "net_v3.h"
 #include "net_v2.h"

-typedef ncclNet_v10_t ncclNet_t;
-typedef ncclNetProperties_v10_t ncclNetProperties_t;
-typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t;
-typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t;
+typedef ncclNet_v11_t ncclNet_t;
+typedef ncclNetProperties_v11_t ncclNetProperties_t;
+typedef ncclNetVDeviceProps_v11_t ncclNetVDeviceProps_t;
+typedef ncclNetCommConfig_v11_t ncclNetCommConfig_t;

 #endif // end include guard
@@ -12,7 +12,7 @@

 // Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin
 // version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
-#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7  
+#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7

 typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType;

@@ -27,6 +27,7 @@ typedef struct {
 typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
 typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
 typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t;
-typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t;
+typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_v11_t;
+typedef ncclNetDeviceHandle_v11_t ncclNetDeviceHandle_t;

 #endif
@@ -5,10 +5,9 @@
 #ifndef NET_V10_H_
 #define NET_V10_H_

-#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4
 typedef struct {
  int ndevs;
-  int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10];
+  int devs[NCCL_NET_MAX_DEVS_PER_NIC];
 } ncclNetVDeviceProps_v10_t;


@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NET_V11_H_
+#define NET_V11_H_
+
+typedef struct {
+  int ndevs;
+  int devs[NCCL_NET_MAX_DEVS_PER_NIC];
+} ncclNetVDeviceProps_v11_t;
+
+#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1
+
+typedef struct {
+  // Plugin-specific TC value
+  int trafficClass;
+} ncclNetCommConfig_v11_t;
+
+
+typedef struct {  // Per-device properties filled in by the plugin's getProperties()
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int regIsGlobal;                 // regMr is not tied to a particular comm
+  int forceFlush;                  // Force a flush on receives
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+  ncclNetVDeviceProps_v11_t vProps; // Tells NCCL whether this is a virtual device fusing multiple physical devices
+  size_t maxP2pBytes;              // Max transfer size for point-to-point operations
+  size_t maxCollBytes;             // Max transfer size for collective operations
+  int maxMultiRequestSize;         // Maximum number of requests supported in a single multi-request.
+} ncclNetProperties_v11_t;
+
+typedef struct {
+  int32_t maxConcurrentPeers;
+  int32_t minConcurrentPeers;
+  int32_t maxFlowsPerPeer;
+  int32_t minFlowsPerPeer;
+} ncclNetCommAttr_v11_t;
+
+typedef struct {
+  ncclNetCommAttr_v11_t sendCommAttr;
+  ncclNetCommAttr_v11_t recvCommAttr;
+  uint32_t op;
+  uint32_t algo;
+  uint32_t proto;
+} ncclNetAttr_v11_t;
+
+typedef struct {  // Function table of the v11 net plugin API (a plugin exports one as ncclNetPlugin_v11)
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network. *ctx receives the opaque plugin context for communicator commId; config is the per-communicator configuration.
+  ncclResult_t (*init)(void** ctx, uint64_t commId, ncclNetCommConfig_v11_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(void* ctx, int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(void* ctx, int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v11_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v11_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+
+  // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
+  // what index this new vNIC exists at
+  ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v11_t* props);
+  // Finalize the network.
+  ncclResult_t (*finalize)(void* ctx);
+
+  ncclResult_t (*setNetAttr)(void* ctx, ncclNetAttr_v11_t* netAttr); // NOTE(review): takes the plugin context and an ncclNetAttr_v11_t; semantics are not described in this header - confirm
+} ncclNet_v11_t;
+
+#endif // end include guard
@@ -5,10 +5,9 @@
 #ifndef NET_V9_H_
 #define NET_V9_H_

-#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4
 typedef struct {
  int ndevs;
-  int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9];
+  int devs[NCCL_NET_MAX_DEVS_PER_NIC];
 } ncclNetVDeviceProps_v9_t;

 typedef struct {
@@ -11,7 +11,7 @@

 int max_requests = NCCL_NET_MAX_REQUESTS;

-__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; }
+__hidden ncclResult_t pluginInit(void** ctx, uint64_t commId, ncclNetCommConfig_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; }
 __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; }
 __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; }
 __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; }
@@ -51,8 +51,8 @@ __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) {
  return ncclSuccess;
 }

-__hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; }
-__hidden ncclResult_t pluginConnect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginListen(void* ctx, int dev, void* handle, void** listenComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginConnect(void* ctx, int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; }
 __hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** recvDevComm) { return ncclInternalError; }
 __hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) { return ncclInternalError; }
 __hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; }
@@ -67,10 +67,11 @@ __hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalE
 __hidden ncclResult_t pluginIrecvConsumed(void* recvComm, int n, void* request) { return ncclInternalError; }
 __hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_mhandle) { return ncclInternalError; }
 __hidden ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { return ncclInternalError; }
+__hidden ncclResult_t pluginFinalize(void* ctx) { return ncclSuccess; }

 #define PLUGIN_NAME "Plugin"

-const ncclNet_v10_t ncclNetPlugin_v10 = {
+const ncclNet_v11_t ncclNetPlugin_v11 = {
  .name = PLUGIN_NAME,
  .init = pluginInit,
  .devices = pluginDevices,
@@ -91,18 +92,84 @@ const ncclNet_v10_t ncclNetPlugin_v10 = {
  .getDeviceMr = pluginGetDeviceMr,
  .irecvConsumed = pluginIrecvConsumed,
  .makeVDevice   = pluginMakeVDevice,
+  .finalize = pluginFinalize,
 };

+__hidden ncclResult_t pluginInit_v10(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; }
+__hidden ncclResult_t pluginGetProperties_v10(int dev, ncclNetProperties_v10_t* props) {
+  // Every value below is a default; when in doubt, leave it unchanged.
+
+  props->name = "Example";
+  // Fill for proper topology detection, e.g. /sys/devices/pci0000:00/0000:00:10.0/0000:0b:00.0
+  props->pciPath = NULL;
+  // guid is only used to detect NICs with multiple PCI attachments; the port number is used in conjunction with it.
+  props->guid = 0;
+  props->port = 0;
+  // Add NCCL_PTR_CUDA if GPU Direct RDMA is supported and regMr can take CUDA pointers.
+  props->ptrSupport = NCCL_PTR_HOST;
+  // Set to 1 if your regMr has a fast registration cache. If set to 0, user buffer registration may be disabled.
+  props->regIsGlobal = 0;
+  // Force flush after receive. Needed if the control path and data path use a different path to the GPU.
+  props->forceFlush = 0;
+  // Speed in *Mbps*. 100000 means 100G.
+  props->speed = 100000;
+  // Custom latency (used to help tuning if latency is high). If set to 0, use default NCCL values.
+  props->latency = 0;
+  // Maximum number of comm objects we can create.
+  props->maxComms = 1024*1024;
+  // Maximum number of receive operations taken by irecv().
+  props->maxRecvs = NCCL_PLUGIN_MAX_RECVS;
+  // Maximum transfer sizes the plugin can handle.
+  props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES;
+  props->maxCollBytes = NCCL_MAX_NET_SIZE_BYTES;
+  // Coupling with NCCL network device-side code.
+  props->netDeviceType = NCCL_NET_DEVICE_HOST;
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  // Used to tell NCCL core whether this is a virtual device fusing multiple physical devices.
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+
+  return ncclSuccess;
+}
+
+__hidden ncclResult_t pluginListen_v10(int d, void* handle, void** listenComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginConnect_v10(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginMakeVDevice_v10(int* d, ncclNetVDeviceProps_v10_t* props) { return ncclInternalError; }
+
+const ncclNet_v10_t ncclNetPlugin_v10 = {
+  .name = PLUGIN_NAME,
+  .init = pluginInit_v10,
+  .devices = pluginDevices,
+  .getProperties = pluginGetProperties_v10,
+  .listen = pluginListen_v10,
+  .connect = pluginConnect_v10,
+  .accept = pluginAccept,
+  .regMr = pluginRegMr,
+  .regMrDmaBuf = pluginRegMrDmaBuf,
+  .deregMr = pluginDeregMr,
+  .isend = pluginIsend,
+  .irecv = pluginIrecv,
+  .iflush = pluginIflush,
+  .test = pluginTest,
+  .closeSend = pluginCloseSend,
+  .closeRecv = pluginCloseRecv,
+  .closeListen = pluginCloseListen,
+  .getDeviceMr = pluginGetDeviceMr,
+  .irecvConsumed = pluginIrecvConsumed,
+  .makeVDevice   = pluginMakeVDevice_v10,
+};
+
+
 __hidden ncclResult_t pluginInit_v9(ncclDebugLogger_t logFunction) {
-  return pluginInit(logFunction, NULL);
+  return pluginInit_v10(logFunction, NULL);
 }

 __hidden ncclResult_t pluginGetProperties_v9(int dev, ncclNetProperties_v9_t* props) {
-  return pluginGetProperties(dev, (ncclNetProperties_t*)props);
+  return pluginGetProperties_v10(dev, (ncclNetProperties_v10_t*)props);
 }

 __hidden ncclResult_t pluginConnect_v9(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm){
-  return pluginConnect(dev, NULL, handle, sendComm, sendDevComm);
+  return pluginConnect_v10(dev, NULL, handle, sendComm, sendDevComm);
 }

 __hidden ncclResult_t pluginIsend_v9(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) {
@@ -120,7 +187,7 @@ const ncclNet_v9_t ncclNetPlugin_v9 = {
  .init = pluginInit_v9,
  .devices = pluginDevices,
  .getProperties = pluginGetProperties_v9,
-  .listen = pluginListen,
+  .listen = pluginListen_v10,
  .connect = pluginConnect_v9,
  .accept = pluginAccept,
  .regMr = pluginRegMr,
@@ -172,7 +239,7 @@ const ncclNet_v8_t ncclNetPlugin_v8 = {
  .init = pluginInit_v9,
  .devices = pluginDevices,
  .getProperties = pluginGetProperties_v8,
-  .listen = pluginListen,
+  .listen = pluginListen_v10,
  .connect = pluginConnect_v9,
  .accept = pluginAccept,
  .regMr = pluginRegMr,
@@ -216,7 +283,7 @@ const ncclNet_v7_t ncclNetPlugin_v7 = {
  .init = pluginInit_v9,
  .devices = pluginDevices,
  .getProperties = pluginGetProperties_v7,
-  .listen = pluginListen,
+  .listen = pluginListen_v10,
  .connect = pluginConnect_v9,
  .accept = pluginAccept,
  .regMr = pluginRegMr_v7,
@@ -257,7 +324,7 @@ const ncclNet_v6_t ncclNetPlugin_v6 = {
  .init = pluginInit_v9,
  .devices = pluginDevices,
  .getProperties = pluginGetProperties_v6,
-  .listen = pluginListen,
+  .listen = pluginListen_v10,
  .connect = pluginConnect_v6,
  .accept = pluginAccept_v6,
  .regMr = pluginRegMr_v7,
@@ -278,7 +345,7 @@ const ncclNet_v5_t ncclNetPlugin_v5 = {
  .init = pluginInit_v9,
  .devices = pluginDevices,
  .getProperties = pluginGetProperties_v6,
-  .listen = pluginListen,
+  .listen = pluginListen_v10,
  .connect = pluginConnect_v6,
  .accept = pluginAccept_v6,
  .regMr = pluginRegMr_v7,
@@ -320,7 +387,7 @@ static ncclResult_t pluginConnect_v4(int dev, void* handle, void** sendComm) {
  ncclResult_t ret;
  do {
    ncclNetDeviceHandle_v7_t* handle = NULL;
-    ret = pluginConnect(dev, NULL, handle, sendComm, &handle);
+    ret = pluginConnect_v10(dev, NULL, handle, sendComm, &handle);
  } while (ret == ncclSuccess && *sendComm == NULL);
  return ret;
 }
@@ -337,7 +404,7 @@ const ncclNet_v4_t ncclNetPlugin_v4 = {
  .init = pluginInit_v9,
  .devices = pluginDevices,
  .getProperties = pluginGetProperties_v4,
-  .listen = pluginListen,
+  .listen = pluginListen_v10,
  .connect = pluginConnect_v4,
  .accept = pluginAccept_v4,
  .regMr = pluginRegMr_v7,
@@ -363,12 +430,12 @@ static ncclResult_t pluginFlush(void* recvComm, void* data, int size, void* mhan
 }
 static ncclResult_t pluginInit_v3(ncclDebugLogger_t logFunction) {
  max_requests = NCCL_NET_MAX_REQUESTS_V3;
-  return pluginInit(logFunction, NULL);
+  return pluginInit_v10(logFunction, NULL);
 }
 #include <string.h>
 static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) {
  char pluginHandle[NCCL_NET_HANDLE_MAXSIZE];
-  ncclResult_t ret = pluginListen(dev, &pluginHandle, listenComm);
+  ncclResult_t ret = pluginListen_v10(dev, &pluginHandle, listenComm);
  memcpy(handle, &pluginHandle, NCCL_NET_HANDLE_MAXSIZE_V4);
  return ret;
 }
@@ -403,7 +470,7 @@ const ncclNet_v2_t ncclNetPlugin_v2 = {
  .devices = pluginDevices,
  .pciPath = pluginPciPath,
  .ptrSupport = pluginPtrSupport,
-  .listen = pluginListen,
+  .listen = pluginListen_v3,
  .connect = pluginConnect_v4,
  .accept = pluginAccept_v4,
  .regMr = pluginRegMr_v7,
@@ -49,9 +49,9 @@ of newer ones.
 The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions
 from old API versions. It also provides error codes in `err.h`.

-# API (v4)
+# API (v5)

-Below is the main `ncclProfiler_v4` struct. Each function is explained in later sections.
+Below is the main `ncclProfiler_v5` struct. Each function is explained in later sections.

 ```
 typedef struct {
@@ -60,15 +60,15 @@ typedef struct {
  // init - initialize the profiler plugin
  // Input
  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  //  - commId         : communicator id
  //  - commName       : user assigned communicator name
-  //  - commHash       : communicator id
  //  - nNodes         : number of nodes in communicator
  //  - nranks         : number of ranks in communicator
  //  - rank           : rank identifier in communicator
  //  - logfn          : logger function
  // Output
  //  - eActivationMask: bitmask of active events set by the plugin
-  ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
+  ncclResult_t (*init)(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);

  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
  // Input
@@ -76,7 +76,7 @@ typedef struct {
  //  - eDescr : pointer to ncclProfilerEventDescr_t object
  // Output
  //  - eHandle: return event handle for supplied event descriptor object
-  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr);
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v5_t* eDescr);

  // stopEvent - stop/finalize an event inside and event set
  // Input
@@ -88,13 +88,13 @@ typedef struct {
  //  - eHandle   : handle to event object created through startEvent
  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
  //  - eState    : event state transition
-  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs);
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v5_t eState, ncclProfilerEventStateArgs_v5_t* eStateArgs);

  // finalize - finalize the profiler plugin
  // Input
  //  - context: opaque profiler context object
  ncclResult_t (*finalize)(void* context);
-} ncclProfiler_v4_t;
+} ncclProfiler_v5_t;
 ```

 ## Error codes
@@ -148,10 +148,37 @@ is the `ncclProfilerEventDescr_t` struct.

 ```
 typedef struct {
-  uint8_t type;             // event type (e.g., ncclProfileGroup, ncclProfileColl, ...)
-  void* parentObj;          // pointer to parent event used to expose the event hierarchy to the profiler
-  int rank;                 // rank that generated the event
+  uint64_t type;             // event type descriptor: ncclProfileGroupApi, ncclProfileCollApi, ...
+  void* parentObj;           // pointer to parent event used to expose the event hierarchy to the profiler
+  int rank;                  // rank that generated the event
  union {
+    struct {                 // GroupAPI event metadata
+      bool graphCaptured;    // Set to true if the Group API event is emitted inside a CUDA graph capture
+      int groupDepth;        // Determines the depth of a ncclGroup. A depth of 1 implies that the Group API call is implicit (internal to NCCL)
+                             // and not called by the user. Any depth greater than 1 means that the user made the Group API call.
+    } groupApi;
+
+    struct {                 // Collective API call metadata
+      const char* func;      // string containing name of the collective operation
+      size_t count;          // data count
+      const char* datatype;  // string containing the name of the datatype
+      int root;              // root rank
+      void* stream;          // Opaque handle that points to the CUDA stream that the operation is enqueued in
+      bool graphCaptured;    // Set to true if the Collective API event is emitted inside a CUDA graph capture
+    } collApi;
+
+    struct {                // Point-to-point API call metadata
+      const char* func;     // string containing name of the p2p operation
+      size_t count;         // data count
+      const char* datatype; // string containing the name of the datatype
+      void* stream;         // Opaque handle that points to a CUDA stream object
+      bool graphCaptured;   // Set to true if the Point-to-point API event is emitted inside a CUDA graph capture
+    } p2pApi;
+
+    struct {                // Kernel Launch event metadata
+      void* stream;         // Opaque handle that points to the CUDA stream that the operation is enqueued in
+    } kernelLaunch;
+
    struct {                // collective events metadata
      uint64_t seqNumber;   // sequence number of this collective operation in the communicator
      const char* func;     // string containing name of the collective
@@ -164,6 +191,7 @@ typedef struct {
      uint8_t nWarps;       // number of GPU warps for this collective
      const char* algo;     // string containing name of the algorithm for this collective
      const char* proto;    // string containing name of the protocol for this collective
+      void* parentGroup;    // for backward compatibility with v4 - this points to the legacy v4 group parent
    } coll;

    struct {                // point-to-point events metadata
@@ -173,6 +201,7 @@ typedef struct {
      size_t count;
      int peer;             // peer rank for this point-to-point
      uint8_t nChannels;    // number of channels for this p2p
+      void* parentGroup;    // for backward compatibility with v4 - this points to the legacy v4 group parent
    } p2p;

    struct {                // proxyOp events metadata
@@ -198,12 +227,12 @@ typedef struct {
      void* data;           // pointer to network plugin defined event
    } netPlugin;
  };
-} ncclProfilerEventDescr_v4_t;
+} ncclProfilerEventDescr_v5_t;
 ```

-NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`,
-`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`, `ncclProfileKernelCh` and
-`ncclProfileNetPlugin`.
+NCCL defines the following events: `ncclProfileGroupApi`, `ncclProfileCollApi`, `ncclProfileP2pApi`, `ncclProfileKernelLaunch`,
+`ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`, `ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`,
+`ncclProfileKernelCh` and `ncclProfileNetPlugin`.

 #### stopEvent

@@ -213,10 +242,10 @@ handle after `eventStop` is undefined behavior.

 #### recordEventState

-Some events can only be started and stopped. For example, `ncclProfileGroup`, `ncclProfileColl`,
-`ncclProfileP2p`, cannot be updated through calls to `recordEventState`.
+Some events can only be started and stopped. For example, `ncclProfileP2pApi`, `ncclProfileCollApi`, `ncclProfileGroup`,
+`ncclProfileColl`, `ncclProfileP2p` cannot be updated through calls to `recordEventState`.

-`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileNetPlugin`, `ncclProfileKernelCh`, and
+`ncclProfileGroupApi`, `ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileNetPlugin`, `ncclProfileKernelCh`, and
 `ncclProfileProxyCtrl` can be updated through calls to `recordEventState`.

 The state of these events can be updated, along with event attributes, using `recordEventState`.
@@ -258,9 +287,21 @@ typedef enum {

  // ncclProfileKernelCh event states
  ncclProfilerKernelChStop             = 22,// state marks stop of kernelCh event and timestamp update
-} ncclProfilerEventState_v4_t;
+
+  // Group API States
+  ncclProfilerGroupStartApiStop        = 23,// state marks the end of a ncclGroupStart() API call
+  ncclProfilerEndGroupApiStart         = 24 // state marks the start of a ncclGroupEnd() API call
+} ncclProfilerEventState_v5_t;
 ```

+NCCL profile API events are generated when the API calls are made, right after NCCL checks
+for graph capture information. They are the parents of collective, point-to-point and kernel
+launch events and persist across multiple operations in a group.
+
+`ncclProfileKernelLaunch` events are generated when the CUDA call to a kernel launch is made. In the
+case of graph capture, the event start indicates that the kernel launch operation has been recorded,
+not launched.
+
 `ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing
 network requests for the GPU kernel. ProxyOp events are generated for every active channel and
 provide a summary of the activity of the proxy progress thread for that channel. Most of the
@@ -379,7 +420,7 @@ typedef union {
  struct {                // attribute to update for ncclProfileKernelCh events
    uint64_t pTimer;      // timestamp provided by the NCCL kernel
  } kernelCh;
-} ncclProfilerEventStateArgs_v4_t;
+} ncclProfilerEventStateArgs_v5_t;
 ```

 The example profiler in `ext-profiler/example` contains details on how to capture and use the events above.
@@ -389,27 +430,33 @@ The example profiler in `ext-profiler/example` contains details on how to captur
 NCCL core events (reported above) are organized into a hierarchy as reported below:

 ```
-Group event
+Group API event
   |
-   +- Collective event
+   +- Collective API event
   |  |
-   |  +- ProxyOp event
-   |  |  |
-   |  |  +- ProxyStep event
-   |  |     |
-   |  |     +- NetPlugin event
-   |  |
-   |  +- KernelCh event
+   |  +- Collective event
+   |     |
+   |     +- ProxyOp event
+   |     |  |
+   |     |  +- ProxyStep event
+   |     |     |
+   |     |     +- NetPlugin event
+   |     |
+   |     +- KernelCh event
   |
-   +- Point-to-point event
-      |
-      +- ProxyOp event
-      |  |
-      |  +- ProxyStep event
-      |     |
-      |     +- NetPlugin event
-      |
-      +- KernelCh event
+   +- Point-to-point API event
+   |  |
+   |  +- Point-to-point event
+   |     |
+   |     +- ProxyOp event
+   |     |  |
+   |     |  +- ProxyStep event
+   |     |     |
+   |     |     +- NetPlugin event
+   |     |
+   |     +- KernelCh event
+   |
+   +- Kernel Launch event

 ProxyCtrl event
 ```
@@ -0,0 +1,34 @@
+# C++ source files of the example profiler plugin (listed explicitly)
+set(SRC_FILES
+    ${CMAKE_CURRENT_SOURCE_DIR}/plugin.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/print_event.cc
+)
+
+# Create shared library
+add_library(nccl-profiler-example SHARED ${SRC_FILES})
+
+# Set include directories
+target_include_directories(nccl-profiler-example PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/nccl
+    ${CUDAToolkit_INCLUDE_DIRS}
+)
+
+# Set output name to match Makefile
+set_target_properties(nccl-profiler-example PROPERTIES
+    OUTPUT_NAME "nccl-profiler-example"
+    PREFIX "lib"
+    POSITION_INDEPENDENT_CODE ON
+    LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
+)
+
+add_custom_command(TARGET nccl-profiler-example POST_BUILD  # copy the built plugin into test/unit/plugins
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/test/unit/plugins
+    COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:nccl-profiler-example> ${CMAKE_BINARY_DIR}/test/unit/plugins  # resolve the real output path instead of hard-coding it
+)
+
+# Add custom target for clean (equivalent to Makefile clean target)
+add_custom_target(clean-profiler-lib
+    COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_BINARY_DIR}/lib/libnccl-profiler-example.so
+    COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_BINARY_DIR}/test/unit/plugins/libnccl-profiler-example.so
+    COMMENT "Cleaning libnccl-profiler-example.so"
+)
@@ -4,19 +4,26 @@
 # See LICENSE.txt for license information
 #
 .DEFAULT_GOAL: build
-include ../../makefiles/common.mk
-SRCDIR   ?= $(abspath ../..)
+ROCM_PATH ?= $(wildcard /opt/rocm)
+CXX = $(ROCM_PATH)/lib/llvm/bin/amdclang++
 BUILDDIR ?= .
-NCCLDIR  := $(BUILDDIR)
+HIPIFY_DIR := hipify-profiler

-SRC_FILES := $(wildcard *.c)
+SRC_FILES := $(wildcard *.cc)
+HIPIFY_SRC := $(addprefix $(HIPIFY_DIR)/,$(SRC_FILES))

-build: ${BUILDDIR}/librccl-profiler.so
+build: ${BUILDDIR}/librccl-profiler-example.so

-${BUILDDIR}/librccl-profiler.so: ${SRC_FILES}
+${BUILDDIR}/librccl-profiler-example.so: $(HIPIFY_SRC)
 	@printf "Compiling  %-35s > %s\n" $< $@
 	@mkdir -p ${BUILDDIR}
-	$(CC) -Inccl -fPIC -shared -o $@ $^
+	$(CXX) -D__HIP_PLATFORM_AMD__ -I$(HIPIFY_DIR) -I$(HIPIFY_DIR)/nccl -I$(ROCM_PATH)/include -fPIC -shared -o $@ $^
+
+$(HIPIFY_DIR)/%.cc: %.cc
+	@mkdir -p $(HIPIFY_DIR)/nccl
+	@cp *.cc *.h $(HIPIFY_DIR)/
+	@cp nccl/*.h $(HIPIFY_DIR)/nccl/
+	@hipify-perl -inplace -quiet-warnings $(HIPIFY_DIR)/*.cc $(HIPIFY_DIR)/*.h

 clean:
-	rm -f ${BUILDDIR}/librccl-profiler.so
+	rm -rf ${BUILDDIR}/librccl-profiler-example.so $(HIPIFY_DIR)
@@ -13,8 +13,7 @@ change the size of the event window the profiler keeps track of.

 ## Building the profiler plugin

-To use the example plugin, just type `make`. You will need a NCCL build's include directory present.
-You can override `NCCL_HOME` to where the NCCL installation is on your system.
+To build the example plugin shipped as part of NCCL, just type `make`.

 ## Using the profiler plugin

@@ -27,13 +26,13 @@ You can override `NCCL_HOME` to where the NCCL installation is on your system.

   As an example, setting:

-   `NCCL_PROFILE_EVENT_MASK` to 1 (`ncclProfileGroup`) | 2 (`ncclProfileColl`) | 8 (`ncclProfileProxyOp`)
+   `NCCL_PROFILE_EVENT_MASK` to 256 (`ncclProfileGroupApi`) | 2 (`ncclProfileColl`) | 8 (`ncclProfileProxyOp`)

-   enables the profiling of the group, the collective and the proxy op events. The same events can be
+   enables the profiling of the group API, the collective and the proxy op events. The same events can be
   expressed more concisely by setting `NCCL_PROFILE_EVENT_MASK` to 8 (`ncclProfileProxyOp`). Indeed,
   in NCCL all the events above (in the event hierarchy) the one requested are also captured. The advantage
   is that the profiler can easily correlate events that belong to the same NCCL operation and present
-   them accordingly.
+   them accordingly. Setting `NCCL_PROFILE_EVENT_MASK` to 4095 enables all events supported by the v5 profiler.

 3. Set `NCCL_PROFILE_DUMP_FILE` to the name of the dump file for the collected traces. A file named
   ${NCCL_PROFILE_DUMP_FILE}-hostname-tid.txt is created. Profiler traces are saved using the chrome
@@ -57,11 +56,14 @@ The group, collective and p2p pools contain objects for the corresponding events
 contains objects for `ProxyCtrl` events and the `ProxyDetach` pool contains objects for `ProxyOp` events
 generated by remote proxies. A list of pools and their size is reported below:

- `NCCL_PROFILE_GROUP_POOL_SIZE` (16)
- `NCCL_PROFILE_COLL_POOL_SIZE` (16)
- `NCCL_PROFILE_P2P_POOL_SIZE` (1024)
+- `NCCL_PROFILE_GROUP_API_POOL_SIZE` (256)
+- `NCCL_PROFILE_COLL_API_POOL_SIZE` (256)
+- `NCCL_PROFILE_P2P_API_POOL_SIZE` (256)
+- `NCCL_PROFILE_KERNEL_LAUNCH_POOL_SIZE` (256)
+- `NCCL_PROFILE_COLL_POOL_SIZE` (256)
+- `NCCL_PROFILE_P2P_POOL_SIZE` (256)
 - `NCCL_PROFILE_PROXY_CTRL_POOL_SIZE` (16)
- `NCCL_PROFILE_PROXY_DETACH_POOL_SIZE` (128)
+- `NCCL_PROFILE_PROXY_DETACH_POOL_SIZE` (256)

 Remote proxy operations are generated when PXN is in use. Refer to this article for more information
 about PXN and how it works:
@@ -73,76 +75,58 @@ The example profiler generates traces using the json format. An example of trace

 ```
 [
-{"name": "Group", "cat": "GROUP", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764234.611328, "args": {"groupId": 0}},
-{"name": "AllReduce", "cat": "COLL", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764237.294922, "args": {"SeqNum": 0, "CommHash": 673864846479792718, "Rank": 1, "Count": 32768, "Datatype": "ncclFloat32", "Algorithm": "RING", "Protocol": "LL", "nMaxChannels": 2}},
-{"name": "Recv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768464.936523, "args": {"Channel": 0, "Peer": 0, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 772020.300781}, "RECEIVED": {"step": 14, "ts": 772196.049805}, "TRANSMITTED": {"step": 14, "ts": 772197.326172}, "DONE": {"step": 14, "ts": 772201.538086}}},
-{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768465.158203, "args": {"Step": 0}},
-{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805},
-{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805, "args": {"Step": 0}},
-{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266},
-{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266, "args": {"Step": 0}},
-{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805},
-{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805, "args": {"Step": 0}},
-{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768568.276367},
-{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768503.604492, "args": {"Step": 1}},
-{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805},
-{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805, "args": {"Step": 1}},
-{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234},
-{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234, "args": {"Step": 1}},
-{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695},
-{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695, "args": {"Step": 1}},
-{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 770006.914062},
-{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768506.941406, "args": {"Step": 2}},
-{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547},
-{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547, "args": {"Step": 2}},
-{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133},
-{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133, "args": {"Step": 2}},
-{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547},
-{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547, "args": {"Step": 2}},
-{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771468.458008},
-{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768509.484375, "args": {"Step": 3}},
-{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000},
-{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000, "args": {"Step": 3}},
-{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023},
-{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023, "args": {"Step": 3}},
-{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211},
-{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211, "args": {"Step": 3}},
-{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771910.500000},
-{"name": "Send", "cat": "PROXY", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768482.878906, "args": {"Channel": 0, "Peer": 2, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 771995.675781}, "REM_FIFO_WAIT": {"step": 14, "ts": 772190.692383}, "TRANSMITTED": {"step": 14, "ts": 772191.516602}, "DONE": {"step": 14, "ts": 772208.473633}}},
-{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.019531, "args": {"Step": 0}},
-{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781},
-{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781, "args": {"Step": 0}},
-{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234},
-{"name": "SendWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234, "args": {"Step": 0}},
-{"name": "SendWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769618.889648},
-{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.083008, "args": {"Step": 1}},
-{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086},
-{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086, "args": {"Step": 1}},
-{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664},
-{"name": "SendWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664, "args": {"Step": 1}},
-{"name": "SendWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769622.517578},
-{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768507.937500, "args": {"Step": 2}},
-{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578},
-{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578, "args": {"Step": 2}},
-{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883},
-{"name": "SendWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883, "args": {"Step": 2}},
-{"name": "SendWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770013.848633},
-{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.742188, "args": {"Step": 3}},
-{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266},
-{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266, "args": {"Step": 3}},
-{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477},
-{"name": "SendWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477, "args": {"Step": 3}},
-{"name": "SendWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771469.171875},
+{"name": "Group API", "cat": "GROUP_API", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 3433.595001, "args": {"groupApiId": 0, "groupDepth":1}},
+{"name": "KernelLaunch", "cat": "KERNEL_LAUNCH", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 0.000000, "args": {"groupId": 0, "Stream": 0x5020000567d0}},
+{"name": "KernelLaunch", "cat": "KERNEL_LAUNCH", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 111991.558990},
+{"name": "AllReduce", "cat": "COLL_API", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 0.000000, "args": {"count": 262144, "datatype": ncclFloat32, "root": 0, "GraphCaptured":0, "Stream": 0x5020000567d0}},
+{"name": "AllReduce", "cat": "COLL", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 111994.477997, "args": {"SeqNum": 0, "CommHash": 1493613951195738943, "Rank": 0, "Count": 262144, "Datatype": "ncclFloat32", "Algorithm": "RING", "Protocol": "SIMPLE", "nChannels": 2}},
+{"name": "KernelCh", "cat": "GPU", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119711.888000, "args": {"Channel": 0, "StartGpuClk": 1756135989724672000, "StopGpuClk": 1756135989732831232}},
+{"name": "ScheduleRecv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119652.709991, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}},
+{"name": "ScheduleRecv", "cat": "PROXY", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 119686.300995},
+{"name": "ProgressRecv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119686.300995, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}},
+{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119707.677979, "args": {"Step": 0}},
+{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 119807.691986},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119807.691986, "args": {"Step": 0}},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 119867.338989},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119867.338989, "args": {"Step": 0}},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 120120.983002},
+{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119733.647980, "args": {"Step": 1}},
+{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 119844.401001},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119844.401001, "args": {"Step": 1}},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 119890.567993},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119890.567993, "args": {"Step": 1}},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 120121.129974},
+{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 2, "pid": 225798, "tid": 1, "ts": 119753.023987, "args": {"Step": 2}},
+{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 2, "pid": 225798, "tid": 1, "ts": 120038.847992},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 2, "pid": 225798, "tid": 1, "ts": 120038.847992, "args": {"Step": 2}},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 2, "pid": 225798, "tid": 1, "ts": 120085.685974},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 2, "pid": 225798, "tid": 1, "ts": 120085.685974, "args": {"Step": 2}},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 2, "pid": 225798, "tid": 1, "ts": 120121.244995},
+{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 3, "pid": 225798, "tid": 1, "ts": 119772.510986, "args": {"Step": 3}},
+{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 3, "pid": 225798, "tid": 1, "ts": 120062.944977},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 3, "pid": 225798, "tid": 1, "ts": 120062.944977, "args": {"Step": 3}},
+{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 3, "pid": 225798, "tid": 1, "ts": 120101.089996},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 3, "pid": 225798, "tid": 1, "ts": 120101.089996, "args": {"Step": 3}},
+{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 3, "pid": 225798, "tid": 1, "ts": 120165.115997},
+{"name": "ProgressRecv", "cat": "PROXY", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 120165.356995},
+{"name": "ScheduleSend", "cat": "PROXY", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119656.950989, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}},
+{"name": "ScheduleSend", "cat": "PROXY", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 119709.078979},
+{"name": "ProgressSend", "cat": "PROXY", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119709.078979, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}},
+{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 4, "pid": 225798, "tid": 1, "ts": 119710.632996, "args": {"Step": 0}},
+{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 4, "pid": 225798, "tid": 1, "ts": 119808.636993},
+{"name": "SendPeerWait", "cat": "NET", "ph": "b", "id": 4, "pid": 225798, "tid": 1, "ts": 119808.636993, "args": {"Step": 0}},
+{"name": "SendPeerWait", "cat": "NET", "ph": "e", "id": 4, "pid": 225798, "tid": 1, "ts": 119818.972992},
 ... [ trace truncated for brevity ]
-{"name": "AllReduce", "cat": "COLL", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.317383},
-{"name": "Group", "cat": "GROUP", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.418945},
+{"name": "AllReduce", "cat": "COLL", "ph": "e", "id": 17, "pid": 225798, "tid": 1, "ts": 170633.535980},
+{"name": "AllReduce", "cat": "COLL_API", "ph": "e", "id": 17, "pid": 225798, "tid": 1, "ts": 170582.923981},
+{"name": "Group API", "cat": "GROUP_API", "ph": "e", "id": 17, "pid": 225798, "tid": 1, "ts": 170637.582001},
 {}]
 ```

 Details about the fields used in the trace can be found at this link:
 https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview?tab=t.0#heading=h.yr4qxyxotyw

-The trace above is obtained by running a `ncclAllReduce` operation on 8 GPUs, communicating with each other through
+The trace above is obtained by running a `ncclAllReduce` operation on 2 GPUs, communicating with each other through
 the network interface. The `Group` event encloses all traces that are related to the single `ncclAllReduce` call.
 (Note that for single collective invocations, where there are no explicit group calls, NCCL creates a group with only
 one collective and this is what is presented in the traces above).
@@ -161,38 +145,17 @@ The `AllReduce` entry presents information about the `ncclAllReduce` operation.
 - datatype    : NCCL datatype
 - algorithm   : algorithm used to process the ncclAllReduce
 - protocol    : protocol used to process the ncclAllReduce
- nMaxChannels: max number of channels used to process the ncclAllReduce
+- nChannels   : number of channels used to process the ncclAllReduce

 If the proxy events are not active (e.g., the `ncclAllReduce` is intranode) the end timestamp will match the time
 consumed by the CPU to launch the collective. For more details refer to `ext-profiler/README.md`, section `Profiling
 of collective and p2p operations`.

-### Proxy Send
-The `Send` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following
-info in the args field:
-
- Channel      : id of the channel used by this proxy operation to send data to the peer
- Peer         : peer rank
- Steps        : number of network steps required to transfer transSize bytes to the peer
- ChunkSize    : chunk size used by NCCL to pipeline data through the proxy thread
- transSize    : bytes transferred across the channel by this proxy operation
- POSTED       : struct containing the number of buffer posts to the GPU and the time stamp for the last post
- REM_FIFO_WAIT: struct containing the number of remote buffer waits and the time stamp for the last wait
- TRANSMITTED  : struct containing the number of network sends and the time stamp of the last send
- DONE         : struct containing the number of network sends completed and the time stamp of the last send completed
-
-In case of a network problem the POSTED, REM_FIFO_WAIT, TRANSMITTED and DONE might all have partially updated steps,
-which could help identify at which point the network problem occurred.
-
 The Proxy send trace gives a summary of the proxy progress thread activity for the channel. If more details are
 needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`), in which case the trace
 entries below are also reported by the profiler.

-#### Proxy SendBufferWait
-
-Presents, for every network step, the time the CPU proxy spends waiting for the channel staging buffer to become available.
-
-#### Proxy SendGPUWait
+#### Proxy SendGpuWait

 Presents, for every network step, the time the CPU proxy spends waiting for the GPU to provide the data in the staging
 buffer.
@@ -201,31 +164,6 @@ buffer.

 Presents, for every network step, the time the CPU proxy spends waiting for the `isend` to complete

-### Proxy Recv
-
-The `Recv` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following
-info in the args field:
-
- Channel    : id of the channel used by this proxy operation to recv data from the peer
- Peer       : peer rank
- Steps      : number of network steps required to transfer transSize bytes from the peer
- ChunkSize  : chunk size used by NCCL to pipeline data through the proxy thread
- transSize  : bytes transferred across the channel by this proxy operation
- POSTED     : struct containing the number of recvs posted and the time stamp for the last recv posted
- RECEIVED   : struct containing the number of recvs completed and the time stamp for the last recv completed
- TRANSMITTED: struct containing the number of recvs flushed to the GPU memory and the time stamp for the last recv flushed
- DONE       : struct containing the number of flush completed and the time stamp for the last flush completed
-
-The Proxy Recv trace gives a summary of the proxy progress thread activity for the channel. If more details are
-needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`). In which case the trace
-entries below are also reported by the profiler.
-
-
-#### Proxy RecvBufferWait
-
-Presents, for every network step, the time the CPU proxy spends waiting for the staging buffer for the channel to
-become available.
-
 #### Proxy RecvWait

 Presents, for every network step, the time the CPU proxy spends waiting for a posted `irecv` to complete
@@ -234,6 +172,6 @@ Presents, for every network step, the time the CPU proxy spends waiting for a po

 Presents, for every network step, the time the CPU proxy spends waiting for the recv data to be flushed to the GPU

-#### Proxy RecvGPUWait
+#### Proxy RecvGpuWait

 Presents, for every network step, the time the CPU proxy spends waiting for the GPU to consume the recv data
@@ -1,30 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include <stdio.h>
-#include "event.h"
-
-int taskEventQueueEmpty(struct group* g) {
-  return g->eventHead == NULL;
-}
-
-void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event) {
-  event->next = NULL;
-  if (g->eventHead) g->eventTail->next = event;
-  else g->eventHead = event;
-  g->eventTail = event;
-}
-
-struct taskEventBase* taskEventQueueHead(struct group* g) {
-  return g->eventHead;
-}
-
-struct taskEventBase* taskEventQueueDequeue(struct group* g) {
-  struct taskEventBase* tmp = g->eventHead;
-  g->eventHead = g->eventHead->next;
-  if (g->eventHead == NULL) g->eventTail = NULL;
-  return tmp;
-}
@@ -10,10 +10,14 @@
 #include <sys/types.h>
 #include <stdint.h>
 #include <unistd.h>
+#include <cstring>
+#include "err.h"
 #include "profiler.h"
+#include "queue.h"
+#include <cuda_runtime.h>

 #define MAX_CHANNELS                     128 // Match RCCL's MAXCHANNELS
-#define MAX_STEPS                        16
+#define MAX_STEPS                        1024
 #define MAX_OPS                          16 // Up to 64K ranks for PAT
 #define MAX_EVENTS_PER_REQ               (8)

@@ -21,7 +25,7 @@ struct proxyOp;
 struct proxyStep;

 struct netPlugin {
-  uint8_t type;
+  uint64_t type;
  int pluginType;
  int pluginVer;
  uint8_t pluginEvent;
@@ -63,7 +67,7 @@ struct kernelCh {
 #define PROXY_STEP_MAX_STATES 3

 struct proxyStep {
-  uint8_t type;                     // type of event: network transfer
+  uint64_t type;                     // type of event: network transfer
  int state;
  int step;                         // network transfer id in given channel
  int isSend;                       // send/recv channel operation
@@ -76,7 +80,7 @@ struct proxyStep {
 };

 struct proxyOp {
-  uint8_t type;                     // type of event: proxy operation
+  uint64_t type;                     // type of event: proxy operation
  uint8_t channelId;                // channel id for this proxy operation
  pid_t pid;
  int rank;
@@ -97,7 +101,7 @@ struct group;
 struct context;

 struct proxyCtrl {
-  uint8_t type;
+  uint64_t type;
  struct context* ctx;              // profiler context
  double startTs;
  double stopTs;
@@ -107,12 +111,12 @@ struct proxyCtrl {

 // task level event base structure
 struct taskEventBase {
-  uint8_t type;                     // event type: collective/p2p
+  uint64_t type;                     // event type: collective/p2p
  int rank;                         // rank of the operation in NCCL communicator
  const char* func;                 // ncclFunc*
  int refCount;                     // number of references for this operation
-  struct group* parent;             // parent event group
-  struct taskEventBase* next;       // next top level event in group
+  void* parent;                     // parent API event
+  struct taskEventBase* next;       // next top level event
  double startTs;
  double stopTs;
 };
@@ -147,7 +151,7 @@ struct p2p {
 };

 struct group {
-  uint8_t type;
+  uint64_t type;
  struct context* ctx;              // profiler context
  int groupId;
  int refCount;
@@ -158,6 +162,70 @@ struct group {
  struct group* next;               // next group event in queue
 };

+struct collApi {
+  uint64_t type;
+  struct groupApi* parent;
+  struct context* ctx;              // profiler context
+  int collApiId;
+  int refCount;
+  cudaStream_t stream;
+  const char* func;
+  size_t count;
+  const char* datatype;
+  int root;
+  bool graphCaptured;
+  struct taskEventBase* eventHead;  // queue head for task events
+  struct taskEventBase* eventTail;  // queue tail for task events
+  double startTs;
+  double stopTs;
+  struct collApi* next;
+};
+
+struct p2pApi {
+  uint64_t type;
+  struct groupApi* parent;
+  struct context* ctx;              // profiler context
+  int p2pApiId;
+  int refCount;
+  const char* func;
+  cudaStream_t stream;
+  size_t count;
+  const char* datatype;
+  bool graphCaptured;
+  struct taskEventBase* eventHead;  // queue head for task events
+  struct taskEventBase* eventTail;  // queue tail for task events
+  double startTs;
+  double stopTs;
+  struct p2pApi* next;
+};
+
+struct kernelLaunch {
+  uint64_t type;
+  struct groupApi* parent;
+  cudaStream_t stream;
+  int kernelLaunchId;
+  double startTs;
+  double stopTs;
+  struct kernelLaunch* next;
+};
+
+struct groupApi {
+  uint64_t type;
+  struct context* ctx;
+  int groupApiId;
+  int refCount;
+  bool graphCaptured;
+  int groupDepth;
+  struct profilerQueue<struct p2pApi, &p2pApi::next> p2pApiEvents;
+  struct profilerQueue<struct collApi, &collApi::next> collApiEvents;
+  struct profilerQueue<struct kernelLaunch, &kernelLaunch::next> kernelLaunchEvents;
+  double endOfncclGroupStartTs;
+  double startOfncclGroupEndTs;
+  double startTs;
+  double stopTs;
+  struct groupApi* next;
+};
+
 // arrays for different event objects
 struct context {
  const char* commName;
@@ -165,6 +233,26 @@ struct context {
  int nranks;
  int rank;

+  int groupApiPoolSize;
+  int groupApiPoolBase;
+  int groupApiPoolIndex;
+  struct groupApi* groupApiPool;
+
+  int collApiPoolSize;
+  int collApiPoolBase;
+  int collApiPoolIndex;
+  struct collApi* collApiPool;
+
+  int p2pApiPoolSize;
+  int p2pApiPoolBase;
+  int p2pApiPoolIndex;
+  struct p2pApi* p2pApiPool;
+
+  int kernelLaunchPoolSize;
+  int kernelLaunchPoolBase;
+  int kernelLaunchPoolIndex;
+  struct kernelLaunch* kernelLaunchPool;
+
  int groupPoolSize;
  int groupPoolBase;
  int groupPoolIndex;
@@ -186,9 +274,50 @@ struct context {
  struct proxyCtrl* proxyCtrlPool;
 };

-int taskEventQueueEmpty(struct group* g);
-void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event);
-struct taskEventBase* taskEventQueueHead(struct group* g);
-struct taskEventBase* taskEventQueueDequeue(struct group* g);
+template <typename T>
+inline int taskEventQueueEmpty(T *obj) {
+  return obj->eventHead == NULL;
+}
+
+template <typename T>
+inline void taskEventQueueEnqueue(T* obj, struct taskEventBase* event) {
+  event->next = NULL;
+  if (obj->eventHead) obj->eventTail->next = event;
+  else obj->eventHead = event;
+  obj->eventTail = event;
+}
+
+template <typename T>
+inline struct taskEventBase* taskEventQueueHead(T *obj) {
+    return obj->eventHead;
+}
+
+template <typename T>
+inline struct taskEventBase* taskEventQueueDequeue(T* obj) {
+  struct taskEventBase* tmp = obj->eventHead;
+  obj->eventHead = obj->eventHead->next;
+  if (obj->eventHead == NULL) obj->eventTail = NULL;
+  return tmp;
+}
+
+template <typename T>
+inline void resetTaskEvents(T *obj, struct context* ctx) {
+  while (!taskEventQueueEmpty(obj)) {
+    struct taskEventBase* base = taskEventQueueDequeue(obj);
+    if (base->type == ncclProfileColl) {
+      struct collective* c = (struct collective *)base;
+      // reset event proxyOps & proxySteps
+      memset(c->nProxyOps, 0, sizeof(int)*MAX_CHANNELS);
+      // release collective events in the group and return them to the collective pool
+      __atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED);
+    } else if (base->type == ncclProfileP2p) {
+      struct p2p* p = (struct p2p *)base;
+      // reset event proxyOp and proxySteps
+      memset(&p->op, 0, sizeof(struct proxyOp)*MAX_CHANNELS);
+      // release p2p events in the group and return them to the p2p pool
+      __atomic_fetch_add(&ctx->p2pPoolBase, 1, __ATOMIC_RELAXED);
+    }
+  }
+}

 #endif
@@ -11,17 +11,20 @@
 #include <stdlib.h>

 #include "common.h"
-#include "err.h"

 enum {
-  ncclProfileGroup     = (1 << 0),  // group event type
-  ncclProfileColl      = (1 << 1),  // host collective call event type
-  ncclProfileP2p       = (1 << 2),  // host point-to-point call event type
-  ncclProfileProxyOp   = (1 << 3),  // proxy operation event type
-  ncclProfileProxyStep = (1 << 4),  // proxy step event type
-  ncclProfileProxyCtrl = (1 << 5),  // proxy control event type
-  ncclProfileKernelCh  = (1 << 6),  // kernel channel event type
-  ncclProfileNetPlugin = (1 << 7),  // network plugin-defined, events
+  ncclProfileGroup          = (1 << 0),  // group event type
+  ncclProfileColl           = (1 << 1),  // host collective call event type
+  ncclProfileP2p            = (1 << 2),  // host point-to-point call event type
+  ncclProfileProxyOp        = (1 << 3),  // proxy operation event type
+  ncclProfileProxyStep      = (1 << 4),  // proxy step event type
+  ncclProfileProxyCtrl      = (1 << 5),  // proxy control event type
+  ncclProfileKernelCh       = (1 << 6),  // kernel channel event type
+  ncclProfileNetPlugin      = (1 << 7),  // network plugin-defined events
+  ncclProfileGroupApi       = (1 << 8),  // Group API events
+  ncclProfileCollApi        = (1 << 9),  // Collective API events
+  ncclProfileP2pApi         = (1 << 10), // Point-to-Point API events
+  ncclProfileKernelLaunch   = (1 << 11), // Kernel launch events
 };

 typedef enum {
@@ -56,21 +59,27 @@ typedef enum {

  /* Kernel event states */
  ncclProfilerKernelChStop             = 22,
+
+  /* Group API States */
+  ncclProfilerEndGroupApiStart         = 23,
+  ncclProfilerBeginGroupApiEnd         = 24
 } ncclProfilerEventState_t;

 typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
 typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
 typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
 typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t;
+typedef ncclProfilerEventState_t ncclProfilerEventState_v5_t;

+#include "profiler_v5.h"
 #include "profiler_v4.h"
 #include "profiler_v3.h"
 #include "profiler_v2.h"
 #include "profiler_v1.h"
 #include "profiler_net.h"

-typedef ncclProfiler_v4_t ncclProfiler_t;
-typedef ncclProfilerEventDescr_v4_t ncclProfilerEventDescr_t;
-typedef ncclProfilerEventStateArgs_v4_t ncclProfilerEventStateArgs_t;
+typedef ncclProfiler_v5_t ncclProfiler_t;
+typedef ncclProfilerEventDescr_v5_t ncclProfilerEventDescr_t;
+typedef ncclProfilerEventStateArgs_v5_t ncclProfilerEventStateArgs_t;

 #endif // end include guard
@@ -0,0 +1,152 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V5_H_
+#define PROFILER_V5_H_
+#include <stdbool.h>
+
+typedef struct {
+  uint64_t type;                // event type descriptor: ncclProfileGroupApi, ...
+  void* parentObj;              // pointer to the profiler parent object
+  int rank;                     // originating rank
+  union {
+    struct {
+      int graphCaptured;
+      int groupDepth;
+    } groupApi;
+
+    struct {
+      const char* func;
+      size_t count;
+      const char* datatype;
+      int root;
+      void* stream;
+      bool graphCaptured;
+    } collApi;
+
+    struct {
+      const char* func;
+      size_t count;
+      const char* datatype;
+      void* stream;
+      bool graphCaptured;
+    } p2pApi;
+
+    struct {
+      void* stream;
+    } kernelLaunch;
+
+    struct {
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      uint8_t nChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+      void* parentGroup; // for backward compatibility with v4
+    } coll;
+
+    struct {
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+      uint8_t nChannels;
+      void* parentGroup; // for backward compatibility with v4
+    } p2p;
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+
+    struct {
+      uint8_t channelId;
+      uint64_t pTimer;          // start timestamp from GPU globaltimer
+    } kernelCh;
+
+    struct {
+      int64_t id;
+      void* data;
+    } netPlugin;
+  };
+} ncclProfilerEventDescr_v5_t;
+
+typedef union {
+  struct {
+    size_t transSize;
+  } proxyStep;
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+
+  struct {
+    void* data;
+  } netPlugin;
+
+  struct {
+    uint64_t pTimer;
+  } kernelCh;
+} ncclProfilerEventStateArgs_v5_t;
+
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  //  - commId         : communicator id
+  //  - commName       : user assigned communicator name
+  //  - nNodes         : number of nodes in communicator
+  //  - nranks         : number of ranks in communicator
+  //  - rank           : rank identifier in communicator
+  //  - logfn          : logger function
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v5_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v5_t eState, ncclProfilerEventStateArgs_v5_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v5_t;
+
+#endif
@@ -6,7 +6,7 @@

 #include <stdio.h>
 #include <pthread.h>
-#include <string.h>
+#include <cstring>
 #include <linux/limits.h>
 #include <sys/time.h>
 #include <sys/types.h>
@@ -22,12 +22,20 @@ static int initialized;             // initialization counter for profiler
 static double startTime;            // profiler start time

 static const int defaultEActivationMask = ncclProfileColl | ncclProfileP2p;
-static const int defaultGroupPoolSize = 16;
-static const int defaultCollPoolSize = 16;
-static const int defaultP2pPoolSize = 1024;
+static const int defaultGroupApiPoolSize = 256;
+static const int defaultCollApiPoolSize = 256;
+static const int defaultP2pApiPoolSize = 256;
+static const int defaultKernelLaunchPoolSize = 256;
+static const int defaultGroupPoolSize = 256;
+static const int defaultCollPoolSize = 256;
+static const int defaultP2pPoolSize = 256;
 static const int defaultProxyCtrlPoolSize = 16;
-static const int defaultDetachPoolSize = 128;
+static const int defaultDetachPoolSize = 256;

+static int groupApiPoolSize;
+static int collApiPoolSize;
+static int p2pApiPoolSize;
+static int kernelLaunchPoolSize;
 static int groupPoolSize;
 static int collPoolSize;
 static int p2pPoolSize;
@@ -51,7 +59,7 @@ static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
 static pid_t pid;
 static int* eActivationMaskPtr;

-__hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) {
+__hidden ncclResult_t exampleProfilerInit(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) {
  pthread_mutex_lock(&lock);
  if (__atomic_fetch_add(&initialized, 1, __ATOMIC_RELAXED) == 0) {
    // first thread initializes event mask, environment and detach pool
@@ -59,6 +67,18 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask,
    str = getenv("NCCL_PROFILE_EVENT_MASK");
    __atomic_store_n(eActivationMask, str ? atoi(str) : 0, __ATOMIC_RELAXED);

+    str = getenv("NCCL_PROFILE_GROUP_API_POOL_SIZE");
+    groupApiPoolSize = str ? atoi(str) : defaultGroupApiPoolSize;
+
+    str = getenv("NCCL_PROFILE_COLL_API_POOL_SIZE");
+    collApiPoolSize = str ? atoi(str) : defaultCollApiPoolSize;
+
+    str = getenv("NCCL_PROFILE_P2P_API_POOL_SIZE");
+    p2pApiPoolSize = str ? atoi(str) : defaultP2pApiPoolSize;
+
+    str = getenv("NCCL_PROFILE_KERNEL_LAUNCH_POOL_SIZE");
+    kernelLaunchPoolSize = str ? atoi(str) : defaultKernelLaunchPoolSize;
+
    str = getenv("NCCL_PROFILE_GROUP_POOL_SIZE");
    groupPoolSize = str ? atoi(str) : defaultGroupPoolSize;

@@ -95,12 +115,25 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask,

  // pre-allocate memory for event object pools in dedicated profiler context
  struct context* ctx = (struct context *)calloc(1, sizeof(*ctx));
+  if (ctx == nullptr) return ncclSystemError;
  ctx->commName = commName;
-  ctx->commHash = commHash;
+  ctx->commHash = commId;
  ctx->nranks = nranks;
  ctx->rank = rank;
  logFn = logfn;
-  INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d", commName ? commName : "", commHash, nranks, rank);
+  INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d", commName ? commName : "", commId, nranks, rank);
+
+  ctx->groupApiPool = (struct groupApi *)calloc(groupApiPoolSize, sizeof(*ctx->groupApiPool));
+  if (ctx->groupApiPool == NULL) goto fail;
+
+  ctx->collApiPool = (struct collApi *)calloc(collApiPoolSize, sizeof(*ctx->collApiPool));
+  if (ctx->collApiPool == NULL) goto fail;
+
+  ctx->p2pApiPool = (struct p2pApi *)calloc(p2pApiPoolSize, sizeof(*ctx->p2pApiPool));
+  if (ctx->p2pApiPool == NULL) goto fail;
+
+  ctx->kernelLaunchPool = (struct kernelLaunch *)calloc(kernelLaunchPoolSize, sizeof(*ctx->kernelLaunchPool));
+  if (ctx->kernelLaunchPool == NULL) goto fail;

  ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool));
  if (ctx->groupPool == NULL) goto fail;
@@ -130,16 +163,22 @@ fail:
  if (ctx->p2pPool) free(ctx->p2pPool);
  if (ctx->collPool) free(ctx->collPool);
  if (ctx->groupPool) free(ctx->groupPool);
+  if (ctx->collApiPool) free(ctx->collApiPool);
+  if (ctx->p2pApiPool) free(ctx->p2pApiPool);
+  if (ctx->kernelLaunchPool) free(ctx->kernelLaunchPool);
+  if (ctx->groupApiPool) free(ctx->groupApiPool);
  free(ctx);
  if (detachPool) free(detachPool);
  return ncclSystemError;
 }

+static const char* profilerDumpFile;
+
 __hidden ncclResult_t exampleProfilerFinalize(void* context) {
  FILE* fh = NULL;
  char filename[PATH_MAX] = { 0 };
  struct context* ctx = (struct context *)context;
-  const char* dump = getenv("NCCL_PROFILE_DUMP_FILE");
+  const char* dump = profilerDumpFile ? profilerDumpFile : getenv("NCCL_PROFILE_DUMP_FILE");
  if (dump) {
    sprintf(filename, "%s_%lu_%d.json", dump, ctx->commHash, ctx->rank);
    fh = fopen(filename, "w");
@@ -148,10 +187,12 @@ __hidden ncclResult_t exampleProfilerFinalize(void* context) {
  INFO(NCCL_INIT, "PROFILER/Plugin: finalize commName: %s commHash: %lu nranks: %d rank: %d", ctx->commName ? ctx->commName : "", ctx->commHash, ctx->nranks, ctx->rank);

  // print last N groups/collectives/p2ps
-  int start = (ctx->groupPoolIndex - groupPoolSize >= 0) ? ctx->groupPoolIndex - groupPoolSize : 0;
-  int end = ctx->groupPoolIndex;
+  // Note that since the v5 version of the profiler, group API events are now at the top of the hierarchy.
+  // Legacy Group events from v4 are still emitted for compatibility purposes when using the v4 profiler but excluded from this example.
+  int start = (ctx->groupApiPoolIndex - groupApiPoolSize >= 0) ? ctx->groupApiPoolIndex - groupApiPoolSize : 0;
+  int end = ctx->groupApiPoolIndex;
  for (int i = start; i < end; i++) {
-    printEvent(fh, &ctx->groupPool[i%groupPoolSize]);
+    printEvent(fh, &ctx->groupApiPool[i%groupApiPoolSize]);
  }

  start = (ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize >= 0) ? ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize : 0;
@@ -161,6 +202,10 @@ __hidden ncclResult_t exampleProfilerFinalize(void* context) {
  }

  free(ctx->groupPool);
+  free(ctx->collApiPool);
+  free(ctx->p2pApiPool);
+  free(ctx->kernelLaunchPool);
+  free(ctx->groupApiPool);
  free(ctx->collPool);
  free(ctx->p2pPool);
  free(ctx->proxyCtrlPool);
@@ -187,7 +232,113 @@ __hidden void updateEvent(void* handle);
 __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) {
  *eHandle = NULL;
  struct context* ctx = (struct context *)context;
-  if (eDescr->type == ncclProfileGroup) {
+  if (eDescr->type == ncclProfileGroupApi) {
+    struct groupApi* event;
+    int groupApiId = __atomic_fetch_add(&ctx->groupApiPoolIndex, 1, __ATOMIC_RELAXED);
+    if ((groupApiId - __atomic_load_n(&ctx->groupApiPoolBase, __ATOMIC_RELAXED)) < groupApiPoolSize) {
+      // if there are available group API events grab one
+      event = &ctx->groupApiPool[groupApiId%groupApiPoolSize];
+      // Make sure all child events of the picked group API event are cleared
+      while (!profilerQueueEmpty(&event->collApiEvents)) {
+        struct collApi *collApiEvent = profilerQueueDequeue(&event->collApiEvents);
+        resetTaskEvents(collApiEvent, ctx);
+        __atomic_fetch_add(&ctx->collApiPoolBase, 1, __ATOMIC_RELAXED);
+      }
+      while (!profilerQueueEmpty(&event->p2pApiEvents)) {
+        struct p2pApi *p2pApiEvent = profilerQueueDequeue(&event->p2pApiEvents);
+        resetTaskEvents(p2pApiEvent, ctx);
+        __atomic_fetch_add(&ctx->p2pApiPoolBase, 1, __ATOMIC_RELAXED);
+      }
+      while (!profilerQueueEmpty(&event->kernelLaunchEvents)) {
+        profilerQueueDequeue(&event->kernelLaunchEvents);
+        __atomic_fetch_add(&ctx->kernelLaunchPoolBase, 1, __ATOMIC_RELAXED);
+      }
+    } else {
+      // else drop this event
+      __atomic_fetch_sub(&ctx->groupApiPoolIndex, 1, __ATOMIC_RELAXED);
+      return ncclSuccess;
+    }
+    event->type = ncclProfileGroupApi;
+    event->ctx = ctx;
+    event->groupApiId = groupApiId;
+    event->graphCaptured = eDescr->groupApi.graphCaptured;
+    event->groupDepth = eDescr->groupApi.groupDepth;
+    event->startTs = gettime() - startTime;
+    *eHandle = event;
+  } else if (eDescr->type == ncclProfileCollApi) {
+    if (eDescr->parentObj == NULL) return ncclSuccess;
+    struct collApi* event;
+    int collApiId = __atomic_fetch_add(&ctx->collApiPoolIndex, 1, __ATOMIC_RELAXED);
+    if ((collApiId - __atomic_load_n(&ctx->collApiPoolBase, __ATOMIC_RELAXED)) < collApiPoolSize) {
+      // if there are available Coll API events grab one
+      event = &ctx->collApiPool[collApiId%collApiPoolSize];
+      resetTaskEvents(event, ctx);
+    } else {
+      // else drop this event
+      __atomic_fetch_sub(&ctx->collApiPoolIndex, 1, __ATOMIC_RELAXED);
+      return ncclSuccess;
+    }
+    event->type = ncclProfileCollApi;
+    event->collApiId = collApiId;
+    event->ctx = ctx;
+    event->func = eDescr->collApi.func;
+    event->stream = (cudaStream_t) eDescr->collApi.stream;
+    event->count = eDescr->collApi.count;
+    event->datatype = eDescr->collApi.datatype;
+    event->root = eDescr->collApi.root;
+    event->graphCaptured = eDescr->collApi.graphCaptured;
+    struct groupApi* parent = (struct groupApi *) eDescr->parentObj;
+    event->parent = parent;
+    profilerQueueEnqueue(&parent->collApiEvents, event);
+    __atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
+    *eHandle = event;
+  } else if (eDescr->type == ncclProfileP2pApi) {
+    if (eDescr->parentObj == NULL) return ncclSuccess;
+    struct p2pApi* event;
+    int p2pApiId = __atomic_fetch_add(&ctx->p2pApiPoolIndex, 1, __ATOMIC_RELAXED);
+    if ((p2pApiId - __atomic_load_n(&ctx->p2pApiPoolBase, __ATOMIC_RELAXED)) < p2pApiPoolSize) {
+      // if there are available p2p API events grab one
+      event = &ctx->p2pApiPool[p2pApiId%p2pApiPoolSize];
+      resetTaskEvents(event, ctx);
+    } else {
+      // else drop this event
+      __atomic_fetch_sub(&ctx->p2pApiPoolIndex, 1, __ATOMIC_RELAXED);
+      return ncclSuccess;
+    }
+    event->type = ncclProfileP2pApi;
+    event->p2pApiId = p2pApiId;
+    event->ctx = ctx;
+    event->func = eDescr->p2pApi.func;
+    event->stream = (cudaStream_t) eDescr->p2pApi.stream;
+    event->count = eDescr->p2pApi.count;
+    event->datatype = eDescr->p2pApi.datatype;
+    event->graphCaptured = eDescr->p2pApi.graphCaptured;
+    struct groupApi* parent = (struct groupApi *) eDescr->parentObj;
+    event->parent = parent;
+    profilerQueueEnqueue(&parent->p2pApiEvents, event);
+    __atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
+    *eHandle = event;
+  } else if (eDescr->type == ncclProfileKernelLaunch) {
+    if (eDescr->parentObj == NULL) return ncclSuccess;
+    struct kernelLaunch* event;
+    int kernelLaunchId = __atomic_fetch_add(&ctx->kernelLaunchPoolIndex, 1, __ATOMIC_RELAXED);
+    if ((kernelLaunchId - __atomic_load_n(&ctx->kernelLaunchPoolBase, __ATOMIC_RELAXED)) < kernelLaunchPoolSize) {
+      // if there are available kernel launch events grab one
+      event = &ctx->kernelLaunchPool[kernelLaunchId%kernelLaunchPoolSize];
+    } else {
+      // else drop this event
+      __atomic_fetch_sub(&ctx->kernelLaunchPoolIndex, 1, __ATOMIC_RELAXED);
+      return ncclSuccess;
+    }
+    event->type = ncclProfileKernelLaunch;
+    event->stream = (cudaStream_t) eDescr->kernelLaunch.stream;
+    struct groupApi* parent = (struct groupApi *) eDescr->parentObj;
+    event->parent = parent;
+    profilerQueueEnqueue(&parent->kernelLaunchEvents, event);
+    __atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
+    *eHandle = event;
+  } else if (eDescr->type == ncclProfileGroup) {
+    if (eDescr->parentObj == NULL) return ncclSuccess;
    struct group* event;
    int groupId = __atomic_fetch_add(&ctx->groupPoolIndex, 1, __ATOMIC_RELAXED);
    if ((groupId - __atomic_load_n(&ctx->groupPoolBase, __ATOMIC_RELAXED)) < groupPoolSize) {
@@ -222,7 +373,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
    debugEvent(event, "GroupStart");
  } else if (eDescr->type == ncclProfileColl) {
    // the parent might be null if we run out of events
-    struct group* parent = (struct group *)eDescr->parentObj;
+    struct collApi* parent = (struct collApi *)eDescr->parentObj;
    if (parent == NULL) return ncclSuccess;

    struct collective* event;
@@ -253,12 +404,12 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
    event->proto = eDescr->coll.proto;
    *eHandle = event;
    taskEventQueueEnqueue(parent, (struct taskEventBase *)event);
-    // increment the group ref counter so the event will staty open
+    // increment the group ref counter so the event will stay open
    __atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
    debugEvent(event, "CollStart");
  } else if (eDescr->type == ncclProfileP2p) {
    // the parent might be null if we run out of events
-    struct group* parent = (struct group *)eDescr->parentObj;
+    struct p2pApi* parent = (struct p2pApi*) eDescr->parentObj;
    if (parent == NULL) return ncclSuccess;

    struct p2p* event;
@@ -458,8 +609,34 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
 }

 void updateEvent(void* handle) {
-  uint8_t type = *(uint8_t *)handle;
-  if (type == ncclProfileGroup) {
+  uint64_t type = *(uint64_t *)handle;
+  if (type == ncclProfileGroupApi) {
+    struct groupApi* event = (struct groupApi*) handle;
+    if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) {
+      event->stopTs = gettime() - startTime;
+      __atomic_fetch_add(&event->ctx->groupApiPoolBase, 1, __ATOMIC_RELAXED);
+    }
+  } else if (type == ncclProfileCollApi) {
+    struct collApi* event = (struct collApi*) handle;
+    if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) {
+      event->stopTs = gettime() - startTime;
+      __atomic_fetch_add(&event->ctx->collApiPoolBase, 1, __ATOMIC_RELAXED);
+    }
+    updateEvent(event->parent);
+    return;
+  } else if (type == ncclProfileP2pApi) {
+    struct p2pApi* event = (struct p2pApi*) handle;
+    if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) {
+      event->stopTs = gettime() - startTime;
+      __atomic_fetch_add(&event->ctx->p2pApiPoolBase, 1, __ATOMIC_RELAXED);
+    }
+    updateEvent(event->parent);
+    event->stopTs = gettime() - startTime;
+  } else if (type == ncclProfileKernelLaunch) {
+    struct kernelLaunch* event = (struct kernelLaunch*) handle;
+    event->stopTs = gettime() - startTime;
+    updateEvent(event->parent);
+  } else if (type == ncclProfileGroup) {
    struct group* event = (struct group *)handle;
    if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) {
      event->stopTs = gettime() - startTime;
@@ -527,25 +704,35 @@ __hidden ncclResult_t exampleProfilerStopEvent(void* eHandle) {
  // the event handle might be null if we run out of events
  if (eHandle == NULL) return ncclSuccess;

-  uint8_t type = *(uint8_t *)eHandle;
-  if (type == ncclProfileGroup) {
-    // stopping the group event in NCCL core does not
-    // mean the group has completed. It means the group
-    // was submitted/enqueued so we need to keep the event open
+  uint64_t type = *(uint64_t *)eHandle;
+  // Stopping API events, Kernel Launch events, collective/p2p task events
+  // in NCCL core do not mean that they are complete. It means that the
+  // operation was enqueued so we need to keep the events open
+  if (type == ncclProfileGroupApi) {
+    struct groupApi* event = (struct groupApi*) eHandle;
+    event->stopTs = gettime() - startTime;
+    return ncclSuccess;
+  } else if (type == ncclProfileCollApi) {
+    struct collApi* event = (struct collApi*) eHandle;
+    event->stopTs = gettime() - startTime;
+    return ncclSuccess;
+  } else if (type == ncclProfileP2pApi) {
+    struct p2pApi* event = (struct p2pApi*) eHandle;
+    event->stopTs = gettime() - startTime;
+    return ncclSuccess;
+  } else if (type == ncclProfileKernelLaunch) {
+    struct kernelLaunch* event = (struct kernelLaunch*) eHandle;
+    event->stopTs = gettime() - startTime;
+    return ncclSuccess;
+  } else if (type == ncclProfileGroup) {
    struct group* event = (struct group *)eHandle;
    event->stopTs = gettime() - startTime;
    return ncclSuccess;
  } else if (type == ncclProfileColl) {
-    // stopping the collective event in NCCL core does not
-    // mean the collective has completed. It means the collective
-    // was submitted/enqueued so we need to keep the event open
    struct collective* event = (struct collective *)eHandle;
    event->base.stopTs = gettime() - startTime;
    return ncclSuccess;
  } else if (type == ncclProfileP2p) {
-    // stopping the p2p event in NCCL core does not
-    // mean the p2p has completed. It means the p2p
-    // was submitted/enqueued so we need to keep the event open
    struct p2p* event = (struct p2p *)eHandle;
    event->base.stopTs = gettime() - startTime;
    return ncclSuccess;
@@ -559,8 +746,15 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile
  // the event handle might be null if we run out of events
  if (eHandle == NULL) return ncclSuccess;

-  uint8_t type = *(uint8_t *)eHandle;
-  if (type == ncclProfileProxyOp) {
+  uint64_t type = *(uint64_t *)eHandle;
+  if (type == ncclProfileGroupApi) {
+    struct groupApi* event = (struct groupApi*) eHandle;
+    if (eState == ncclProfilerEndGroupApiStart) {
+      event->endOfncclGroupStartTs = gettime() - startTime;
+    } else if (eState == ncclProfilerBeginGroupApiEnd) {
+      event->startOfncclGroupEndTs = gettime() - startTime;
+    }
+  } else if (type == ncclProfileProxyOp) {
    struct proxyOp* event = (struct proxyOp *)eHandle;
    if (eState == ncclProfilerProxyOpInProgress_v4) {
      event->progrTs = gettime() - startTime;
@@ -592,6 +786,8 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile
      case ncclProfilerProxyStepRecvGPUWait:
        event->timestamp[PROXY_STEP_RECV_GPU_WAIT] = gettime() - startTime;
        break;
+      default:
+        break;
    }
  } else if (type == ncclProfileProxyCtrl) {
    struct proxyCtrl* event = (struct proxyCtrl *)eHandle;
@@ -609,7 +805,7 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile
  return ncclSuccess;
 }

-ncclProfiler_t ncclProfiler_v4 = {
+ncclProfiler_t ncclProfiler_v5 = {
  "Example-profiler",
  exampleProfilerInit,
  exampleProfilerStartEvent,
@@ -618,14 +814,15 @@ ncclProfiler_t ncclProfiler_v4 = {
  exampleProfilerFinalize,
 };

-int exampleProfilerStart(int eActivationMask) {
+// Runtime switch to begin profiling: remembers the requested dump file name and, if the
+// plugin has already been initialized, stores eActivationMask through eActivationMaskPtr.
+// Always returns ncclSuccess; when the plugin is not yet initialized only the dump file
+// name is recorded and the mask is left untouched.
+// NOTE(review): `name` is assigned to profilerDumpFile as-is; if that variable is a raw
+// pointer the caller must keep the string alive -- confirm against its declaration.
+__attribute__((visibility("default"))) int exampleProfilerStart(int eActivationMask, const char* name) {
+  profilerDumpFile = name;
  if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) {
    __atomic_store_n(eActivationMaskPtr, eActivationMask, __ATOMIC_RELAXED);
  }
  return ncclSuccess;
 }

-int exampleProfilerStop(void) {
+__attribute__((visibility("default"))) int exampleProfilerStop(void) {
  if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) {
    __atomic_store_n(eActivationMaskPtr, 0, __ATOMIC_RELAXED);
  }
@@ -7,7 +7,8 @@
 #ifndef PLUGIN_H_
 #define PLUGIN_H_

-int exampleProfilerStart(int eActivationMask);
-int exampleProfilerStop(void);
+__attribute__((visibility("default"))) int exampleProfilerStart(int eActivationMask, const char* name);
+__attribute__((visibility("default"))) int exampleProfilerStop(void);
+

 #endif
@@ -5,15 +5,59 @@
 ************************************************************************/

 #include <stdio.h>
+#include "err.h"
 #include "profiler.h"
 #include "event.h"
 #include "print_event.h"
+#include <cuda_runtime.h>

 #define __hidden __attribute__ ((visibility("hidden")))

 // FIXME: chrome tracing asynchronous events (following used) allow event nesting for events that have same id and category
 // It appears that nesting more than three events causes issues. Therefore, every event is given an increasing id and a
-// category that matches the type of event (GROUP, COLL, P2P, PROXY, NET)
+// category that matches the type of event (GROUP API, COLL API, P2P API, GROUP, COLL, P2P, PROXY, NET)
+static __thread int groupApiId;
+// Writes the opening ("ph": "b") trace record for a Group API event to fh. The record id
+// is the current value of the per-thread groupApiId counter; the counter is not advanced
+// here, so the matching trailer can emit the same id.
+__hidden void printGroupApiEventHeader(FILE* fh, struct groupApi* event) {
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP_API\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"groupApiId\": %d, \"groupDepth\":%d}},\n",
+          "Group API", groupApiId, getpid(), 1, event->startTs, event->groupApiId, event->groupDepth);
+}
+
+// Writes the closing ("ph": "e") trace record for a Group API event to fh, stamped with
+// event->stopTs. Post-increments the per-thread groupApiId counter so the next
+// header/trailer pair gets a fresh id.
+__hidden void printGroupApiEventTrailer(FILE* fh, struct groupApi* event) {
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP_API\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+          "Group API", groupApiId++, getpid(), 1, event->stopTs);
+}
+
+static __thread int p2pApiId;
+// Writes the opening ("ph": "b") trace record for a P2P API event to fh. The record id is
+// the current per-thread p2pApiId counter, which is not advanced here.
+// The datatype string and the stream pointer are emitted as quoted JSON strings: a bare
+// %s or %p (e.g. 0x7f... or (nil)) is not a valid JSON value and breaks trace parsing.
+__hidden void printP2pApiEventHeader(FILE* fh, struct p2pApi* event) {
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P_API\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"count\": %lu, \"datatype\": \"%s\", \"GraphCaptured\":%d, \"Stream\": \"%p\"}},\n",
+      event->func, p2pApiId, getpid(), 1, event->startTs, event->count, event->datatype, event->graphCaptured, event->stream);
+}
+
+// Writes the closing ("ph": "e") trace record for a P2P API event to fh, stamped with
+// event->stopTs, then post-increments the per-thread p2pApiId counter.
+__hidden void printP2pApiEventTrailer(FILE* fh, struct p2pApi* event) {
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P_API\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+          event->func, p2pApiId++, getpid(), 1, event->stopTs);
+}
+
+static __thread int collApiId;
+// Writes the opening ("ph": "b") trace record for a collective API event to fh. The record
+// id is the current per-thread collApiId counter, which is not advanced here.
+// The datatype string and the stream pointer are emitted as quoted JSON strings: a bare
+// %s or %p (e.g. 0x7f... or (nil)) is not a valid JSON value and breaks trace parsing.
+__hidden void printCollApiEventHeader(FILE* fh, struct collApi* event) {
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL_API\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"count\": %lu, \"datatype\": \"%s\", \"root\": %d, \"GraphCaptured\":%d, \"Stream\": \"%p\"}},\n",
+      event->func, collApiId, getpid(), 1, event->startTs, event->count, event->datatype, event->root, event->graphCaptured, event->stream);
+}
+
+// Writes the closing ("ph": "e") trace record for a collective API event to fh, stamped
+// with event->stopTs, then post-increments the per-thread collApiId counter.
+__hidden void printCollApiEventTrailer(FILE* fh, struct collApi* event) {
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL_API\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
+      event->func, collApiId++, getpid(), 1, event->stopTs);
+}
+
+static __thread int kernelLaunchId;
+// Writes the opening ("ph": "b") trace record for a kernel launch event to fh. The record
+// id is the current per-thread kernelLaunchId counter, which is not advanced here.
+// The stream pointer is emitted as a quoted JSON string: a bare %p (e.g. 0x7f... or (nil))
+// is not a valid JSON value and breaks trace parsing.
+// NOTE(review): the "groupId" argument carries event->kernelLaunchId; the key name is kept
+// unchanged for consumers of the trace -- confirm whether it should be renamed.
+__hidden void printKernelLaunchEventHeader(FILE* fh, struct kernelLaunch* event) {
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"KERNEL_LAUNCH\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"groupId\": %d, \"Stream\": \"%p\"}},\n", "KernelLaunch", kernelLaunchId, getpid(), 1, event->startTs, event->kernelLaunchId, event->stream);
+}
+
+// Writes the closing ("ph": "e") trace record for a kernel launch event to fh, stamped
+// with event->stopTs, then post-increments the per-thread kernelLaunchId counter.
+__hidden void printKernelLaunchEventTrailer(FILE* fh, struct kernelLaunch* event) {
+  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"KERNEL_LAUNCH\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", "KernelLaunch", kernelLaunchId++, getpid(), 1, event->stopTs);
+}
+
 static __thread int groupId;
 __hidden void printGroupEventHeader(FILE* fh, struct group* event) {
  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"groupId\": %d}},\n",
@@ -28,7 +72,7 @@ __hidden void printGroupEventTrailer(FILE* fh, struct group* event) {
 static __thread int collId;
 __hidden void printCollEventHeader(FILE* fh, struct collective* event) {
  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nChannels\": %d}},\n",
-          event->base.func, collId, getpid(), 1, event->base.startTs, event->seqNumber, event->base.parent->ctx->commHash, event->base.rank, event->count, event->datatype, event->algo, event->proto, event->nChannels);
+          event->base.func, collId, getpid(), 1, event->base.startTs, event->seqNumber, ((struct collApi*)event->base.parent)->ctx->commHash, event->base.rank, event->count, event->datatype, event->algo, event->proto, event->nChannels);
 }

 __hidden void printCollEventTrailer(FILE* fh, struct collective* event) {
@@ -39,7 +83,7 @@ __hidden void printCollEventTrailer(FILE* fh, struct collective* event) {
 static __thread int p2pId;
 __hidden void printP2pEventHeader(FILE* fh, struct p2p* event) {
  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"nChannels\": %d}},\n",
-          event->base.func, p2pId, getpid(), 1, event->base.startTs, event->base.parent->ctx->commHash, event->base.rank, event->peer, event->count, event->datatype, event->nChannels);
+          event->base.func, p2pId, getpid(), 1, event->base.startTs, ((struct p2pApi*)event->base.parent)->ctx->commHash, event->base.rank, event->peer, event->count, event->datatype, event->nChannels);
 }

 __hidden void printP2pEventTrailer(FILE* fh, struct p2p* event) {
@@ -173,7 +217,7 @@ void debugEvent(void* eHandle, const char* tag) {
  char filename[64] = { 0 };
  sprintf(filename, "EventDebug-%d", getpid());
  FILE* fh = fopen(filename, "a+");
-  uint8_t type = *(uint8_t *)eHandle;
+  uint64_t type = *(uint64_t *)eHandle;
  if (type == ncclProfileGroup) {
    struct group* event = (struct group *)eHandle;
    fprintf(fh, "Group event %p tag = %s {\n", event, tag);
@@ -241,8 +285,51 @@ void debugEvent(void* eHandle, const char* tag) {

 void printEvent(FILE* fh, void* handle) {
  if (handle == NULL || fh == NULL) return;
-  uint8_t type = *(uint8_t *)handle;
-  if (type == ncclProfileGroup) {
+  uint64_t type = *(uint64_t *)handle;
+  if (type == ncclProfileGroupApi) {
+    struct groupApi* g = (struct groupApi*) handle;
+    printGroupApiEventHeader(fh, g);
+    struct kernelLaunch* kernelLaunchHead = profilerQueueHead(&g->kernelLaunchEvents);
+    while (kernelLaunchHead != NULL) {
+      printEvent(fh, kernelLaunchHead);
+      kernelLaunchHead = kernelLaunchHead->next;
+    }
+    struct collApi* collApiHead = profilerQueueHead(&g->collApiEvents);
+    while (collApiHead != NULL) {
+      printEvent(fh, collApiHead);
+      collApiHead = collApiHead->next;
+    }
+    struct p2pApi* p2pApiHead = profilerQueueHead(&g->p2pApiEvents);
+    while (p2pApiHead != NULL) {
+      printEvent(fh, p2pApiHead);
+      p2pApiHead = p2pApiHead->next;
+    }
+    printGroupApiEventTrailer(fh, g);
+  } else if (type == ncclProfileCollApi) {
+    struct collApi* collApiEvent = (struct collApi *) handle;
+    printCollApiEventHeader(fh, collApiEvent);
+    struct taskEventBase* base = taskEventQueueHead(collApiEvent);
+    while (base) {
+      struct taskEventBase* next = base->next;
+      printEvent(fh, base);
+      base = next;
+    }
+    printCollApiEventTrailer(fh, collApiEvent);
+  } else if (type == ncclProfileP2pApi) {
+    struct p2pApi* p2pApiEvent = (struct p2pApi *) handle;
+    printP2pApiEventHeader(fh, p2pApiEvent);
+    struct taskEventBase* base = taskEventQueueHead(p2pApiEvent);
+    while (base) {
+      struct taskEventBase* next = base->next;
+      printEvent(fh, base);
+      base = next;
+    }
+    printP2pApiEventTrailer(fh, p2pApiEvent);
+  } else if (type == ncclProfileKernelLaunch) {
+    struct kernelLaunch* kernelLaunchEvent = (struct kernelLaunch *) handle;
+    printKernelLaunchEventHeader(fh, kernelLaunchEvent);
+    printKernelLaunchEventTrailer(fh, kernelLaunchEvent);
+  } else if (type == ncclProfileGroup) {
    struct group* g = (struct group *)handle;
    printGroupEventHeader(fh, g);
    struct taskEventBase* base = taskEventQueueHead(g);
@@ -0,0 +1,50 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef QUEUE_H
+#define QUEUE_H
+
+// Singly-linked FIFO whose link field lives inside the element itself: T is the element
+// type and `next` is the pointer-to-member of T used to chain elements. The queue object
+// only holds the first (head) and last (tail) element pointers.
+template<typename T, T *T::*next>
+struct profilerQueue {
+  T *head, *tail;
+};
+
+// Puts the queue into the empty state by nulling both end pointers.
+template<typename T, T *T::*next>
+inline void profilerQueueConstruct(profilerQueue<T,next> *me) {
+  me->head = me->tail = nullptr;
+}
+
+// Returns true when the queue holds no elements (head is null).
+template<typename T, T *T::*next>
+inline bool profilerQueueEmpty(profilerQueue<T,next> *me) {
+  return !me->head;
+}
+
+// Returns the first element without removing it; nullptr when the queue is empty.
+template<typename T, T *T::*next>
+inline T* profilerQueueHead(profilerQueue<T,next> *me) {
+  T *first = me->head;
+  return first;
+}
+
+// Returns the stored tail pointer without removing the element.
+template<typename T, T *T::*next>
+inline T* profilerQueueTail(profilerQueue<T,next> *me) {
+  T *last = me->tail;
+  return last;
+}
+
+// Appends x at the back of the queue. x's link field is cleared first, so x must not be
+// a member of another list that still relies on it.
+template<typename T, T *T::*next>
+inline void profilerQueueEnqueue(profilerQueue<T,next> *me, T *x) {
+  x->*next = nullptr;
+  if (me->head == nullptr) {
+    // Empty queue: x becomes the first element.
+    me->head = x;
+  } else {
+    // Non-empty queue: chain x behind the current last element.
+    me->tail->*next = x;
+  }
+  me->tail = x;
+}
+
+// Removes and returns the element at the front of the queue, or nullptr when the queue
+// is empty. When the last element is removed the tail pointer is cleared as well, so the
+// queue is left in the same state profilerQueueConstruct produces.
+template<typename T, T *T::*next>
+inline T* profilerQueueDequeue(profilerQueue<T,next> *me) {
+  T *ans = me->head;
+  // Empty queue: nothing to unlink (avoids dereferencing a null head).
+  if (ans == nullptr) return nullptr;
+  me->head = ans->*next;
+  if (me->head == nullptr) me->tail = nullptr;
+  return ans;
+}
+
+#endif
@@ -0,0 +1,22 @@
+# Clones CoMMA next to this Makefile, links the parent directory into the checkout as
+# third_party/nccl/ext-profiler, and builds it with cargo.
+# None of the targets below produce a file of the same name, so all are declared phony.
+.PHONY: all build-CoMMA clone-CoMMA clean delete
+
+all: build-CoMMA
+
+build-CoMMA: clone-CoMMA
+	cd CoMMA && cargo build
+
+# Clone only when no checkout exists yet. $(CURDIR) is make's own notion of the working
+# directory; $(PWD) is merely inherited from the calling shell and is stale under
+# `make -C <dir>`. The symlink is created only if the clone succeeded.
+clone-CoMMA:
+	@if [ ! -d CoMMA ] ; then \
+		git clone https://github.com/google/CoMMA.git && \
+		ln -s $(CURDIR)/.. CoMMA/third_party/nccl/ext-profiler; \
+	fi
+
+# Remove cargo build outputs but keep the checkout.
+clean:
+	@if [ -d CoMMA ] ; then \
+		cd CoMMA && cargo clean; \
+	fi
+
+# Remove the whole checkout.
+delete:
+	@if [ -d CoMMA ] ; then \
+		rm -rf CoMMA; \
+	fi
@@ -0,0 +1,62 @@
+#
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+# Variables
+# NCCL_HOME is assigned with ?= so that a value supplied through the environment (or on
+# the make command line) takes precedence over the in-tree default.
+NCCL_HOME ?= ../../build
+INC := -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
+PLUGIN_SO := libnccl-profiler-inspector.so
+VERSION_FILE := version.cc
+
+# Compiler and flags
+CXX := g++
+CXXFLAGS := -g -O3 -fPIC -shared -march=native -DNDEBUG -Wall -Wextra
+
+# DEBUG=1: extra debug information and frame pointers
+ifeq ($(DEBUG), 1)
+CXXFLAGS += -g2 -ggdb3 -rdynamic -funwind-tables -fno-omit-frame-pointer
+endif
+
+# ASAN=1: AddressSanitizer
+# NOTE(review): LDFLAGS and NVLDFLAGS are extended here and below, but the plugin rule
+# does not pass either of them to the compiler -- confirm whether that is intended.
+ifeq ($(ASAN), 1)
+CXXFLAGS += -fsanitize=address
+LDFLAGS += -fsanitize=address -static-libasan
+NVLDFLAGS += -Xcompiler -fsanitize=address,-static-libasan
+endif
+
+# UBSAN=1: UndefinedBehaviorSanitizer
+ifeq ($(UBSAN), 1)
+CXXFLAGS += -fsanitize=undefined
+LDFLAGS += -fsanitize=undefined -static-libubsan
+NVLDFLAGS += -Xcompiler -fsanitize=undefined,-static-libubsan
+endif
+
+# Source files
+SOURCES := inspector_plugin.cc inspector.cc json.cc
+
+# Default target
+all: $(PLUGIN_SO)
+
+# Rule to build the plugin (compile and link in a single step)
+$(PLUGIN_SO): $(VERSION_FILE) $(SOURCES)
+	@echo "Compiling to create $@ from $^"
+	$(CXX) $(INC) $(CXXFLAGS) -o $@ -Wl,-soname,$(PLUGIN_SO) $^
+
+# Rule to generate version.cc
+# The rule has no prerequisites, so it only runs while version.cc is absent (for example
+# after `make clean`). The file is written to a temporary name first and moved into place
+# only when the content differs; `cmp -s` keeps the comparison quiet, including on the
+# first run when version.cc does not exist yet.
+$(VERSION_FILE):
+	@GIT_INFO=$$(./utils/extract_git_version.sh); \
+	echo '#include "version.h"' > $(VERSION_FILE).tmp; \
+	echo 'const char* get_git_version_info() { return "'$$GIT_INFO'"; }' >> $(VERSION_FILE).tmp; \
+	if ! cmp -s $(VERSION_FILE).tmp $(VERSION_FILE); then \
+		echo "updating ${VERSION_FILE} file -> $$GIT_INFO"; \
+		mv $(VERSION_FILE).tmp $(VERSION_FILE); \
+	else \
+		echo "${VERSION_FILE} up to date -> $$GIT_INFO"; \
+		rm $(VERSION_FILE).tmp; \
+	fi
+
+# Clean target
+clean:
+	rm -f $(VERSION_FILE) $(PLUGIN_SO)
+
+# Phony targets
+.PHONY: all clean
@@ -0,0 +1,216 @@
+# NCCL Inspector Plugin
+
+The NCCL Inspector is a plugin for the NVIDIA Collective Communications Library (NCCL) that provides detailed, per-communicator, per-collective performance and metadata logging. It is designed to help users analyze and debug NCCL collective operations by generating structured JSON output for each operation.
+
+## Related Documentation
+
+- **[Performance Exporter](exporter/example/README.md)** - Tool for analyzing and visualizing NCCL performance data from inspector logs
+
+## Folder Location
+
+The Inspector plugin source is located in:
+
+```
+ext-profiler/inspector/
+```
+
+## Building the Inspector Plugin
+
+To build the Inspector plugin, run:
+
+```bash
+make
+```
+
+The Makefile locates the NCCL headers through `NCCL_HOME` (default: `../../build`) and the CUDA headers through `CUDA_HOME`. If you need to specify custom paths, pass them as make arguments, for example `make NCCL_HOME=/path/to/nccl/build CUDA_HOME=/usr/local/cuda`.
+
+### Build Options
+
+The Makefile supports several build options:
+
+- **DEBUG=1**: Enable debug build with additional debugging information
+- **ASAN=1**: Enable Address Sanitizer for memory error detection
+- **UBSAN=1**: Enable Undefined Behavior Sanitizer
+
+Example debug build:
+```bash
+make DEBUG=1
+```
+
+### Build Output
+
+The build process creates:
+- `libnccl-profiler-inspector.so`: The main inspector plugin library
+- `version.cc`: Auto-generated version information from git
+
+## Using NCCL Inspector
+
+### Key Differences from Normal NCCL Usage
+
+The main difference between running NCCL with the Inspector plugin versus running NCCL normally is the addition of environment variables that enable detailed performance logging:
+
+**Normal NCCL Run:**
+```bash
+# Standard NCCL execution
+./your_nccl_application
+```
+
+**NCCL Inspector Run:**
+```bash
+# NCCL Inspector enabled execution
+export NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so
+export NCCL_INSPECTOR_ENABLE=1
+export NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=500
+./your_nccl_application
+```
+
+### Required Environment Variables
+
+- `NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so`
+  Loads the Inspector plugin into NCCL.
+- `NCCL_INSPECTOR_ENABLE=1`
+  Enables the Inspector plugin.
+- `NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=<interval>`
+  Sets the interval (in microseconds) for the internal dump thread to write output. Example: `500`.
+- `NCCL_INSPECTOR_DUMP_DIR=<output_dir>` (optional)
+  Sets the output directory for logs. If not set, defaults to `nccl-inspector-unknown-jobid` or `nccl-inspector-<slurm_job_id>` if running under SLURM.
+- `NCCL_INSPECTOR_DUMP_VERBOSE=<0|1>` (optional)
+  Enables verbose output including event trace information. Set to `1` to enable, `0` to disable (default).
+
+### Example Usage
+
+**Single Node:**
+```bash
+export NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so
+export NCCL_INSPECTOR_ENABLE=1
+export NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=500
+./build/test/perf/all_reduce_perf -b 8 -e 16G -f 2 -g 8
+```
+
+**Multi-Node (SLURM):**
+```bash
+# Add these environment variables to your SLURM script
+export NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so
+export NCCL_INSPECTOR_ENABLE=1
+export NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=500
+export NCCL_INSPECTOR_DUMP_DIR=/path/to/logs/${SLURM_JOB_ID}/
+
+# Then run your normal NCCL application
+srun your_nccl_application
+```
+
+## Example Scripts
+
+For detailed example scripts showing how to integrate NCCL Inspector with different workloads, see the **[test/examples/](test/examples/)** directory:
+
+- **Single Node Example**: Basic NCCL performance testing with inspector
+- **Multi-Node SLURM Example**: Comprehensive multi-node testing with various collective operations
+- **Training Workload Example**: Integration with distributed training workloads
+
+## Output Example
+
+Each output file contains JSON objects with the following structure:
+
+```json
+{
+  "header": {
+    "id": "0x7f8c496ae9f661",
+    "rank": 2,
+    "n_ranks": 8,
+    "nnodes": 1
+  },
+  "metadata": {
+    "inspector_output_format_version": "v4.0",
+    "git_rev": "",
+    "rec_mechanism": "profiler_plugin",
+    "dump_timestamp_us": 1748030377748202,
+    "hostname": "example-hostname",
+    "pid": 1639453
+  },
+  "coll_perf": {
+    "coll": "AllReduce",
+    "coll_sn": 1407,
+    "coll_msg_size_bytes": 17179869184,
+    "coll_exec_time_us": 61974,
+    "coll_algobw_gbs": 277.210914,
+    "coll_busbw_gbs": 485.119099
+  }
+}
+```
+
+## Output Example Verbose
+
+To enable verbose output with event trace information, set the `NCCL_INSPECTOR_DUMP_VERBOSE=1` environment variable:
+
+```bash
+export NCCL_INSPECTOR_DUMP_VERBOSE=1
+```
+
+This will include additional event trace information in the JSON output, showing the sequence of callbacks and timestamps for each individual event.
+
+```json
+{
+  "header": {
+    "id": "0xe62dedaa97644a",
+    "rank": 4,
+    "n_ranks": 8,
+    "nnodes": 1
+  },
+  "metadata": {
+    "inspector_output_format_version": "v4.0",
+    "git_rev": "9019a1912-dirty",
+    "rec_mechanism": "nccl_profiler_interface",
+    "dump_timestamp_us": 1752867229276385,
+    "hostname": "example-hostname",
+    "pid": 438776
+  },
+  "coll_perf": {
+    "coll": "ReduceScatter",
+    "coll_sn": 1231,
+    "coll_msg_size_bytes": 2147483648,
+    "coll_exec_time_us": 41057,
+    "coll_timing_source": "kernel_gpu",
+    "coll_algobw_gbs": 418.439467,
+    "coll_busbw_gbs": 366.134533,
+    "event_trace_sn": {
+      "coll_start_sn": 1,
+      "coll_stop_sn": 2,
+      "kernel_events": [
+        {
+          "channel_id": 0,
+          "kernel_start_sn": 3,
+          "kernel_stop_sn": 48,
+          "kernel_record_sn": 47
+        }
+      ]
+    },
+    "event_trace_ts": {
+      "coll_start_ts": 1752867229235059,
+      "coll_stop_ts": 1752867229235064,
+      "kernel_events": [
+        {
+          "channel_id": 0,
+          "kernel_start_ts": 1752867229235181,
+          "kernel_stop_ts": 1752867229275811,
+          "kernel_record_ts": 1752867229275811
+        }
+      ]
+    }
+  }
+}
+```
+
+Multiple such JSON objects are written, one per collective operation per communicator.
+
+## Output Directory
+
+- By default, output files are written to:
+  - `nccl-inspector-unknown-jobid` (if no SLURM job ID is present)
+  - `nccl-inspector-<slurm_job_id>` (if running under SLURM)
+- You can override this with the `NCCL_INSPECTOR_DUMP_DIR` environment variable.
+
+## Additional Notes
+
+- The plugin is compatible with standard NCCL workflows and can be used in both single-node and multi-node (SLURM) environments.
+- For more details, see the source code and comments in `ext-profiler/inspector/`.
+
@@ -0,0 +1,151 @@
+# NCCL Inspector Performance Summary Exporter
+
+This tool processes NCCL Inspector log files and generates comprehensive performance analysis reports including visualizations and statistical summaries.
+Similar exporters can be built to integrate with observability systems such as Elastic, Prometheus, or other custom metrics systems.
+
+## Features
+
+- **Performance Analysis**: Generates statistical summaries for collective operations
+- **Communication Type Classification**: Automatically categorizes communication patterns
+- **Visualizations**: Creates scatter plots, histograms, and box plots for performance metrics
+- **Data Export**: Converts logs to Parquet format for efficient processing
+- **Multi-format Log Support**: Processes `.log`, `.log.gz`, `.jsonl`, and `.jsonl.gz` files
+- **Parallel Processing**: Utilizes multi-core processing for faster analysis
+
+## Requirements
+
+- Python 3.7+
+- Access to NCCL Inspector log files
+
+## Installation
+
+### Clone the Repository
+
+```bash
+git clone https://github.com/NVIDIA/nccl.git
+cd nccl/ext-profiler/inspector/exporter/example
+```
+
+Install the required dependencies using the provided `requirements.txt` file:
+
+```bash
+pip install -r requirements.txt
+```
+
+## Usage
+
+The script processes NCCL Inspector log files from a specified directory.
+
+**Note:** To generate NCCL Inspector log files, you need to run your NCCL application with the inspector plugin enabled. The log files are written to the directory specified by the `NCCL_INSPECTOR_DUMP_DIR` environment variable. For detailed setup instructions and environment variable configuration, see the [Inspector README](../../README.md).
+
+### Basic Usage
+
+```bash
+python perf_summary_exporter.py --input_dir /path/to/nccl/inspector/logs
+```
+
+This mode processes all log files in the specified directory and its subdirectories recursively.
+
+### Command Line Arguments
+
+- `--input_dir <path>`: **Required**. Directory containing NCCL Inspector log files (searches recursively in subdirectories)
+- `--output_dir <name>`: **Optional**. Custom output directory name (default: `<input_directory_name>-analysis`)
+
+## Output
+
+The tool generates:
+
+1. **Parquet Files**: One per log file containing processed log data (stored in `parquet_files/` subdirectory)
+2. **Summary Directory**: Contains comprehensive analysis results
+3. **Visualizations**: Scatter plots, histograms, and box plots for each message size
+4. **CSV Files**: Detailed summaries for each message size and collective type
+5. **Log File**: Processing log with detailed information
+
+## Example Output Structure
+
+```
+<output_dir_name>/
+├── output.log
+├── parquet_files/
+│   ├── <filename1>.parquet
+│   ├── <filename2>.parquet
+│   └── ...
+└── summary/
+    ├── scatter_plot_<comm_type>_<coll_type>.png
+    ├── combined_scatter_plot_<comm_type>_<coll_type>.png
+    └── msg_size_<human_readable_size>/
+        ├── histograms/
+        │   └── histogram_<comm_type>_<coll_type>_<size>.png
+        ├── boxplots/
+        │   └── boxplot_<comm_type>_<coll_type>_<size>.png
+        └── summary_<comm_type>_<coll_type>_<size>.csv
+```
+
+## Supported Communicator Types
+
+- `single-rank`
+- `nvlink-only`
+- `hca-only`
+- `mixed`
+
+## Supported Collective Types
+
+- `AllReduce`
+- `AllGather`
+- `ReduceScatter`
+- `Broadcast`
+
+## Log File Formats
+
+### Supported Formats
+
+- `.log` - Plain text JSON lines
+- `.log.gz` - Compressed JSON lines
+- `.jsonl` - JSON lines format
+- `.jsonl.gz` - Compressed JSON lines
+
+### Expected JSON Structure
+
+```json
+{
+  "header": {
+    "id": "0x9e7a479f95a66c",
+    "rank": 31,
+    "n_ranks": 32,
+    "nnodes": 4
+  },
+  "metadata": {
+    "inspector_output_format_version": "v4.0",
+    "git_rev": "75e61acda-dirty",
+    "rec_mechanism": "nccl_profiler_interface",
+    "dump_timestamp_us": 1749490229087081,
+    "hostname": "example-hostname",
+    "pid": 468528
+  },
+  "coll_perf": {
+    "coll": "ReduceScatter",
+    "coll_sn": 129,
+    "coll_msg_size_bytes": 65536,
+    "coll_exec_time_us": 110,
+    "coll_timing_source": "kernel_gpu",
+    "coll_algobw_gbs": 19.065018,
+    "coll_busbw_gbs": 18.469236
+  }
+}
+```
+
+## Troubleshooting
+
+### Common Issues
+
+1. **No log files found**: Ensure the log directory path is correct and contains valid log files
+2. **Missing dependencies**: Ensure all requirements are installed in your virtual environment
+3. **Mixed file formats**: The tool will exit if it detects mixed `.log`, `.log.gz`, `.jsonl`, and `.jsonl.gz` files in the same directory. This is typically indicative of corrupt input directories caused by multiple overlapping NCCL Inspector runs with different output format options. Clean the directory and re-run with consistent settings.
+
+### Log Files
+
+The tool creates detailed logs in the output directory. Check `output.log` for processing information and any error messages.
+
+## Support
+
+Please refer to the GitHub issues page at https://github.com/NVIDIA/nccl/issues. Your question may already have been asked by another user. If not, feel free to create a new issue and mention "inspector plugin" in the title.
@@ -0,0 +1,548 @@
+from pathlib import Path
+import argparse
+import glob
+import gzip
+import sys
+import pandas as pd
+from concurrent.futures import ProcessPoolExecutor
+import json
+from tqdm.auto import tqdm
+import duckdb
+import math
+import matplotlib.pyplot as plt
+import matplotlib.dates
+from matplotlib.gridspec import GridSpec
+import os
+import logging
+import contextlib
+from datetime import datetime
+import numpy as np
+
+def setup_logging(output_dir):
+    """Send root-logger output to ``output.log`` inside ``output_dir``.
+
+    Args:
+        output_dir: Directory object supporting the ``/`` operator
+            (e.g. ``pathlib.Path``) that will contain the log file.
+    """
+    log_settings = {
+        "filename": output_dir / "output.log",
+        "level": logging.INFO,
+        "format": "%(asctime)s - %(levelname)s - %(message)s",
+    }
+    logging.basicConfig(**log_settings)
+
+
+@contextlib.contextmanager
+def smart_open(filename, mode="r"):
+    """Open a plain or gzip-compressed file, choosing the opener by extension.
+
+    Args:
+        filename: ``str`` or ``os.PathLike`` path. Names ending in ``.gz`` are
+            opened with ``gzip.open``; everything else with the builtin ``open``.
+        mode: File mode. For ``.gz`` files a mode without ``b`` or ``t`` is
+            opened as text so both openers yield ``str`` lines for ``"r"``.
+
+    Yields:
+        The opened file object; it is closed when the ``with`` block exits.
+    """
+    # Accept pathlib.Path as well as str; Path has no .endswith().
+    path = os.fspath(filename)
+    if path.endswith(".gz"):
+        # gzip.open treats "r"/"w"/"a" as *binary*, unlike open(); request text
+        # explicitly so callers see the same type regardless of compression.
+        if "b" not in mode and "t" not in mode:
+            mode += "t"
+        opener = gzip.open
+    else:
+        opener = open
+
+    with opener(path, mode) as f:
+        yield f
+
+
+def get_log_files_and_output_dir():
+    """Parse the command line and collect the inspector log files to process.
+
+    ``--input_dir`` is required: without it there is no directory to scan
+    (previously the function crashed with an unbound ``root_dir``).
+
+    Returns:
+        tuple: ``(files, output_dir_name)`` where ``files`` is the list of
+        matching log paths found recursively under the input directory and
+        ``output_dir_name`` is ``--output_dir`` if given, otherwise
+        ``"<input dir name>-analysis"``.
+
+    Raises:
+        FileNotFoundError: If the input directory does not exist.
+        SystemExit: On argument errors, when more than one log format is
+            present, or when no log files are found.
+    """
+    parser = argparse.ArgumentParser(description="Process log files in a directory.")
+    parser.add_argument(
+        "--input_dir",
+        type=str,
+        required=True,
+        help="The directory containing NCCL Inspector log files to process.",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        help="Custom output directory name (default: auto-generated from input directory)."
+    )
+    args = parser.parse_args()
+
+    root_dir = Path(args.input_dir)
+    if not root_dir.exists():
+        raise FileNotFoundError(f"Input directory not found: {root_dir}")
+
+    # One recursive glob per supported format, kept in the original order.
+    patterns = ("*.log", "*.log.gz", "*.jsonl", "*.jsonl.gz")
+    matches_per_format = [
+        list(glob.iglob(str(root_dir / "**" / pattern), recursive=True))
+        for pattern in patterns
+    ]
+
+    if sum(1 for matches in matches_per_format if matches) > 1:
+        ### TODO: we could probably generate some logic to pick the "right" file to load, but for now, bail
+        logging.critical("Appear to have mixed .log/.log.gz/.jsonl/.jsonl.gz; bailing!")
+        sys.exit(1)
+
+    files = [path for matches in matches_per_format for path in matches]
+
+    if not files:
+        print("No inspector logs found")
+        sys.exit(1)
+
+    # Generate output directory name from input directory
+    if args.output_dir:
+        output_dir_name = args.output_dir
+    else:
+        output_dir_name = f"{root_dir.name}-analysis"
+
+    return files, output_dir_name
+
+def bytes_to_human_readable(size_bytes):
+    """
+    Convert bytes to human-readable format using decimal (SI) units.
+
+    Uses powers of 1000 (decimal/SI standard):
+    - 1 KB = 1,000 bytes
+    - 1 MB = 1,000,000 bytes
+    - 1 GB = 1,000,000,000 bytes
+
+    Not binary units (powers of 1024):
+    - Does NOT use KiB, MiB, GiB (1024-based)
+
+    Args:
+        size_bytes: Number of bytes to convert (non-negative)
+
+    Returns:
+        Human-readable string (e.g., "1.50MB", "2.34GB"). Values beyond the
+        largest unit are expressed in that unit (e.g., "1000.00YB"); values
+        between 0 and 1 are expressed in bytes.
+
+    Raises:
+        ValueError: If size_bytes is negative.
+    """
+    if size_bytes == 0:
+        return "0B"
+    if size_bytes < 0:
+        raise ValueError(f"size_bytes must be non-negative, got {size_bytes}")
+    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
+    whole = int(size_bytes)
+    # int() truncates fractions below one byte to 0, for which log10 is undefined.
+    exponent = int(math.log10(whole) / 3) if whole > 0 else 0
+    # Clamp so that sizes past the last unit do not index out of the table.
+    i = min(exponent, len(size_name) - 1)
+    s = round(size_bytes * math.pow(10, -3 * i), 2)
+    return f"{s:.2f}{size_name[i]}"
+
+def timestamp_to_datetime(timestamp_us):
+    """Format a microsecond epoch timestamp as local time with millisecond precision."""
+    seconds = timestamp_us / 1000000
+    formatted = datetime.fromtimestamp(seconds).strftime('%Y-%m-%d %H:%M:%S.%f')
+    # %f yields six digits; dropping the last three leaves milliseconds.
+    return formatted[:-3]
+
+def microseconds_to_human_readable(microseconds):
+    """Render a duration given in microseconds as μs, ms or s with one decimal."""
+    if microseconds < 1000:
+        return f"{microseconds:.1f}μs"
+    if microseconds < 1000000:
+        millis = microseconds / 1000
+        return f"{millis:.1f}ms"
+    secs = microseconds / 1000000
+    return f"{secs:.1f}s"
+
+def get_comm_type(row) -> str:
+    """Classify a communicator from its ``n_ranks`` and ``nnodes`` fields.
+
+    Returns "single-rank" for one rank, "nvlink-only" for one node,
+    "hca-only" when there is exactly one rank per node, otherwise "mixed".
+    """
+    rank_count = row["n_ranks"]
+    if rank_count == 1:
+        return "single-rank"
+
+    node_count = row["nnodes"]
+    if node_count == 1:
+        return "nvlink-only"
+
+    return "hca-only" if rank_count == node_count else "mixed"
+
+def parse_file(filepath: Path, output_dir):
+    """Convert one inspector log file (JSON lines) into a parquet file.
+
+    Each valid line contributes one row built from its ``header``,
+    ``coll_perf`` and ``metadata`` objects plus a derived ``comm_type``.
+    Blank lines are ignored; a malformed record is logged and skipped without
+    discarding the rest of the file.
+
+    Args:
+        filepath: Path (``str`` or ``Path``) of the log file to read.
+        output_dir: ``Path`` of the directory receiving ``<stem>.parquet``.
+
+    Returns:
+        None. Nothing is written when the parquet file is already up to date,
+        the source is empty or unreadable, or no valid record was found.
+    """
+    source = Path(filepath)
+    parquet_file = output_dir / f"{source.stem}.parquet"
+
+    # Check if parquet file exists and is newer than source file
+    if parquet_file.exists():
+        source_mtime = source.stat().st_mtime
+        parquet_mtime = parquet_file.stat().st_mtime
+        if parquet_mtime >= source_mtime:
+            logging.info(f"Parquet file {parquet_file} is up to date. Skipping...")
+            return
+        else:
+            logging.info(f"Source file {filepath} is newer than parquet. Regenerating...")
+
+    # Check if file is empty or too small
+    if source.stat().st_size == 0:
+        logging.warning(f"Skipping empty file: {filepath}")
+        return
+
+    required_keys = ("header", "metadata", "coll_perf")
+    recs = []
+    try:
+        # str() keeps this working when a Path is passed in.
+        with smart_open(str(filepath), "r") as infile:
+            # 1-based numbering so logged positions match what editors show.
+            for lineno, line in enumerate(infile, start=1):
+                # Blank lines (e.g. a trailing newline) are not errors.
+                if not line.strip():
+                    continue
+
+                try:
+                    json_recs = json.loads(line)
+                except (json.JSONDecodeError, UnicodeDecodeError):
+                    logging.error(f"Failed to parse line {filepath}:{lineno}")
+                    continue
+
+                # Validate that required fields exist
+                if not isinstance(json_recs, dict) or not all(
+                    key in json_recs for key in required_keys
+                ):
+                    logging.error(f"Missing required fields in {filepath}:{lineno}")
+                    continue
+
+                try:
+                    header = json_recs["header"]
+                    recs.append(
+                        dict(
+                            **header,
+                            comm_type=get_comm_type(header),
+                            **json_recs["coll_perf"],
+                            **json_recs["metadata"],
+                        )
+                    )
+                except (KeyError, TypeError) as e:
+                    # Missing n_ranks/nnodes, non-object sections or duplicate
+                    # keys: drop this record only, not the whole file.
+                    logging.error(f"Malformed record in {filepath}:{lineno}: {e!r}")
+                    continue
+    except Exception as e:
+        logging.error(f"Error reading file {filepath}: {e}")
+        return
+
+    # Skip files with no valid records
+    if not recs:
+        logging.warning(f"No valid records found in file: {filepath}. Skipping...")
+        return
+
+    df = pd.DataFrame(recs)
+    df.to_parquet(parquet_file)
+    logging.info(f"Created parquet file {parquet_file} with {len(recs)} records")
+
+def create_per_node_parquet_files(files, output_dir):
+    """Convert every log file to parquet in parallel worker processes.
+
+    Args:
+        files: Sequence of log file paths.
+        output_dir: Analysis root; parquet files go to ``<output_dir>/parquet_files``.
+
+    Returns:
+        ``Path`` of the ``parquet_files`` directory.
+    """
+    parquet_dir = Path(output_dir) / "parquet_files"
+    parquet_dir.mkdir(parents=True, exist_ok=True)
+
+    n_files = len(files)
+    worker_count = min(64, n_files, os.cpu_count() or 1)
+    with ProcessPoolExecutor(max_workers=worker_count) as pool:
+        results = pool.map(parse_file, files, [parquet_dir] * n_files)
+        progress = tqdm(
+            results,
+            total=n_files,
+            desc="Processing files",
+            unit="file",
+        )
+        # Drain the iterator so every task finishes and the bar advances.
+        for _ in progress:
+            pass
+    return parquet_dir
+
+def generate_scatter_plot(df, comm_type, coll_type, output_file):
+    """Plot mean bus bandwidth against sequence number, one series per message size.
+
+    Args:
+        df: Summary frame with ``coll_msg_size_bytes``, ``coll_sn`` and
+            ``mean_coll_busbw_gbs`` columns.
+        comm_type: Communicator type shown in the title.
+        coll_type: Collective type shown in the title.
+        output_file: Destination image path.
+    """
+    plt.figure(figsize=(10, 6), dpi=100)
+
+    for size in df["coll_msg_size_bytes"].unique():
+        subset = df[df["coll_msg_size_bytes"] == size]
+        busbw = subset["mean_coll_busbw_gbs"]
+        series_mean = busbw.mean()
+        series_label = (
+            f"MsgSize: {bytes_to_human_readable(size)} (Mean: {series_mean:.2f} GB/s)"
+        )
+        plt.scatter(subset["coll_sn"], busbw, label=series_label, alpha=0.5)
+
+    plt.xlabel("Operation Sequence Number")
+    plt.ylabel("Mean Collective Bus BW (GB/s)")
+    plt.title(f"Comm Type: {comm_type}, Coll Type: {coll_type}")
+    plt.legend(title="Message Size", loc="upper right")
+    plt.tight_layout()
+    plt.savefig(output_file)
+    plt.close()
+    logging.info(f"Scatter plot saved to {output_file}")
+
+def generate_combined_scatter_plot(df, comm_type, coll_type, output_file, max_cols=3):
+    """Draw one scatter subplot per message size on a shared grid figure.
+
+    Args:
+        df: Summary frame with ``coll_msg_size_bytes``, ``coll_sn`` and
+            ``mean_coll_busbw_gbs`` columns.
+        comm_type: Communicator type shown in the figure title.
+        coll_type: Collective type shown in the figure title.
+        output_file: Destination image path.
+        max_cols: Upper bound on the number of grid columns.
+    """
+    sizes = df["coll_msg_size_bytes"].unique()
+    total = len(sizes)
+
+    # Grid shape: at most max_cols columns, rows by ceiling division.
+    n_cols = min(max_cols, total)
+    n_rows = (total + n_cols - 1) // n_cols
+
+    fig = plt.figure(figsize=(5 * n_cols, 5 * n_rows), dpi=100)
+    grid = GridSpec(n_rows, n_cols, figure=fig)
+
+    for idx, size in enumerate(sizes):
+        r, c = divmod(idx, n_cols)
+        axes = fig.add_subplot(grid[r, c])
+
+        subset = df[df["coll_msg_size_bytes"] == size]
+        busbw = subset["mean_coll_busbw_gbs"]
+        series_mean = busbw.mean()
+        axes.scatter(
+            subset["coll_sn"],
+            busbw,
+            label=f"MsgSize: {bytes_to_human_readable(size)} (Mean: {series_mean:.2f} GB/s)",
+            alpha=0.5,
+        )
+        axes.set_xlabel("Op Seq No")
+        axes.set_ylabel("Mean Collective Bus BW (GB/s)")
+        axes.set_title(f"Message Size: {bytes_to_human_readable(size)}({size})")
+        axes.legend(loc="upper right")
+
+    fig.suptitle(f"Comm Type: {comm_type}, Coll Type: {coll_type}", ha="center", y=0.98)
+
+    plt.tight_layout()
+    plt.savefig(output_file)
+    plt.close()
+    logging.info(f"Combined scatter plot saved to {output_file}")
+
+def generate_histogram(df, comm_type, coll_type, output_file, message_size):
+    """Save a histogram of mean bus bandwidth for one message size.
+
+    Args:
+        df: Summary frame with a ``mean_coll_busbw_gbs`` column.
+        comm_type: Communicator type shown in the title.
+        coll_type: Collective type shown in the title.
+        output_file: Destination image path.
+        message_size: Human-readable message size shown in the title.
+    """
+    busbw = df["mean_coll_busbw_gbs"]
+    plt.figure(figsize=(10, 6), dpi=100)
+
+    # Roughly one bin per GB/s of spread, capped at 50 bins.
+    spread = busbw.max() - busbw.min()
+    bin_count = min(50, int(spread) + 1)
+
+    plt.hist(
+        busbw,
+        bins=bin_count,
+        alpha=0.7,
+        color="b",
+        edgecolor="black",
+        linewidth=1.2,
+    )
+    plt.xlabel("Mean Collective Bus BW (GB/s)")
+    plt.ylabel("Frequency")
+    plt.title(
+        f"Comm Type: {comm_type}, Coll Type: {coll_type} Mean Collective Bus BW Histogram\nMsg Size: {message_size}"
+    )
+
+    axes = plt.gca()
+    axes.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f"{y:.0f}"))
+    axes.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f"{x:.2f} GB/s"))
+    axes.xaxis.get_offset_text().set_visible(False)
+
+    plt.tight_layout()
+    plt.savefig(output_file)
+    plt.close()
+    logging.info(f"Histogram saved to {output_file}")
+
+def generate_boxplot(df, comm_type, coll_type, output_file, message_size):
+    """Save a horizontal box plot of mean bus bandwidth, annotated with min/median/max.
+
+    Args:
+        df: Summary frame with a ``mean_coll_busbw_gbs`` column.
+        comm_type: Communicator type shown in the title.
+        coll_type: Collective type shown in the title.
+        output_file: Destination image path.
+        message_size: Human-readable message size shown in the title.
+    """
+    busbw = df["mean_coll_busbw_gbs"]
+    plt.figure(figsize=(10, 6))
+
+    artist_styles = dict(
+        boxprops=dict(linestyle="-", linewidth=2, color="blue"),
+        flierprops=dict(marker="o", color="red", alpha=0.5),
+        medianprops=dict(linestyle="-", linewidth=2.5, color="orange"),
+        whiskerprops=dict(linestyle="--", linewidth=2, color="green"),
+        capprops=dict(linestyle="-", linewidth=2, color="black"),
+    )
+    plt.boxplot(busbw, vert=False, patch_artist=True, **artist_styles)
+
+    plt.xlabel("Mean Coll Bus BW (GB/s)")
+    plt.title(
+        f"Box Plot of Coll Bus BW (CommType: {comm_type} - Coll Type: {coll_type} - Msg Size: {message_size})"
+    )
+
+    # Label min, median and max just above the single box (drawn at y == 1).
+    stats = busbw.describe(percentiles=[0.5])
+    for caption, stat_key in (("Min", "min"), ("Median", "50%"), ("Max", "max")):
+        value = stats[stat_key]
+        plt.annotate(
+            f"{caption}: {value:.2f}",
+            xy=(value, 1),
+            xytext=(value, 1.1),
+            arrowprops=dict(facecolor="black", shrink=0.05),
+        )
+
+    plt.tight_layout()
+    plt.savefig(output_file)
+    plt.close()
+    logging.info(f"Box plot saved to {output_file}")
+
+
+def _quarantine_parquet(pf, invalid_dir):
+    """Move ``pf`` into ``invalid_dir``; tolerate another worker having moved it first."""
+    try:
+        pf.rename(invalid_dir / pf.name)
+    except FileNotFoundError:
+        logging.info(f"Parquet file {pf} was already moved by another worker")
+
+
+def summarize_data_per_comm_coll_type(output_root, comm_type, coll_type, output_dir_name):
+    """Summarize parquet data per communication and collective type using DuckDB.
+
+    Unreadable, zero-byte or zero-row parquet files are first moved to
+    ``parquet_files/invalid``. This function is executed concurrently by
+    several worker processes over the same directory, so a file vanishing
+    between listing and inspection is treated as "already handled".
+
+    Args:
+        output_root: ``Path`` of the analysis root containing ``parquet_files``.
+        comm_type: Communicator type to select (``comm_type`` column).
+        coll_type: Collective type to select (``coll`` column).
+        output_dir_name: Output directory name, used in log messages only.
+
+    Returns:
+        A DataFrame with one row per (id, coll_sn, coll_msg_size_bytes) group,
+        or ``None`` when there is nothing to summarize or the query fails.
+    """
+    logging.info(f"Summarizing data per comm/coll type for {output_dir_name}, {comm_type} and {coll_type}")
+
+    # Check if there are any parquet files
+    parquet_dir = output_root / "parquet_files"
+    parquet_files = list(parquet_dir.glob("*.parquet"))
+    if not parquet_files:
+        logging.warning(f"No parquet files found for {comm_type} and {coll_type}")
+        return None
+
+    # Clean up invalid/empty parquet files by moving them to a separate directory
+    invalid_dir = parquet_dir / "invalid"
+    invalid_dir.mkdir(exist_ok=True)
+
+    # Imported once, outside the per-file try block: a missing pyarrow must
+    # surface as an ImportError rather than flag every file as invalid.
+    import pyarrow.parquet as pq
+
+    invalid_count = 0
+    for pf in parquet_files:
+        try:
+            # Check file size first, then the row count from parquet metadata
+            # (no data pages are read).
+            if pf.stat().st_size == 0:
+                logging.warning(f"Moving zero-byte parquet file {pf} to invalid directory")
+            elif pq.ParquetFile(pf).metadata.num_rows == 0:
+                logging.warning(f"Moving empty parquet file {pf} (0 rows) to invalid directory")
+            else:
+                continue
+        except FileNotFoundError:
+            # A concurrent worker quarantined this file after we listed it.
+            continue
+        except Exception as e:
+            logging.warning(f"Moving invalid parquet file {pf} to invalid directory: {e}")
+        _quarantine_parquet(pf, invalid_dir)
+        invalid_count += 1
+
+    # Check if any valid files remain
+    remaining_files = list(parquet_dir.glob("*.parquet"))
+    if not remaining_files:
+        logging.warning(f"No valid parquet files found for {comm_type} and {coll_type} (moved {invalid_count} invalid files)")
+        return None
+
+    logging.info(f"Found {len(remaining_files)} valid parquet files (moved {invalid_count} invalid files)")
+
+    try:
+        duckdb.execute(
+            f"CREATE OR REPLACE VIEW logs AS SELECT * FROM read_parquet('{parquet_dir}/*.parquet')"
+        )
+        df = duckdb.execute(f"""
+            SELECT
+                id,
+                coll_sn,
+                coll_msg_size_bytes,
+                AVG(coll_busbw_gbs) as mean_coll_busbw_gbs,
+                COUNT(*) as log_count,
+                ARRAY_DISTINCT(LIST(n_ranks)) as n_ranks,
+                ARRAY_DISTINCT(LIST(nnodes)) as nnodes,
+                MIN(dump_timestamp_us) as coll_start_timestamp_us,
+                MAX(dump_timestamp_us) as coll_end_timestamp_us,
+                (MAX(dump_timestamp_us) - MIN(dump_timestamp_us)) as coll_duration_us
+            FROM logs
+            WHERE coll = '{coll_type}' and comm_type = '{comm_type}'
+            GROUP BY id, coll_sn, coll_msg_size_bytes
+            ORDER BY coll_sn
+        """).df()
+    except Exception as e:
+        logging.error(f"Error executing DuckDB query for {comm_type} and {coll_type}: {e}")
+        return None
+
+    if df.empty:
+        logging.info(f"No data for {comm_type} and {coll_type}")
+        return None
+
+    # Add human-readable formatting
+    df["human_readable_coll_msg_size_bytes"] = df["coll_msg_size_bytes"].apply(
+        bytes_to_human_readable
+    )
+
+    # Log the time range of the first row as an example (df is non-empty here).
+    sample_row = df.iloc[0]
+    start_time = timestamp_to_datetime(sample_row['coll_start_timestamp_us'])
+    end_time = timestamp_to_datetime(sample_row['coll_end_timestamp_us'])
+    duration = microseconds_to_human_readable(sample_row['coll_duration_us'])
+    logging.info(f"Example time range - ID: {sample_row['id']}, Coll_SN: {sample_row['coll_sn']}, "
+                 f"Start: {start_time}, End: {end_time}, Duration: {duration}")
+
+    return df
+
+
+def generate_visualizations(df, output_root, comm_type, coll_type):
+    """Write all plots and per-message-size CSV summaries for one comm/coll pair.
+
+    Args:
+        df: Summary frame produced for this comm/coll combination.
+        output_root: ``Path`` of the analysis root; output goes under ``summary``.
+        comm_type: Communicator type used in file names and titles.
+        coll_type: Collective type used in file names and titles.
+    """
+    logging.info(f"Generating visualizations for {comm_type} and {coll_type}")
+
+    summary_dir = output_root / "summary"
+    summary_dir.mkdir(parents=True, exist_ok=True)
+
+    # Overview plots spanning every message size.
+    generate_scatter_plot(
+        df, comm_type, coll_type,
+        summary_dir / f"scatter_plot_{comm_type}_{coll_type}.png",
+    )
+    generate_combined_scatter_plot(
+        df, comm_type, coll_type,
+        summary_dir / f"combined_scatter_plot_{comm_type}_{coll_type}.png",
+    )
+
+    for msg_size in df["coll_msg_size_bytes"].unique():
+        hr_msg_size = bytes_to_human_readable(msg_size)
+
+        # One directory per message size, with sub-directories per plot kind.
+        size_dir = summary_dir / f"msg_size_{msg_size}_{hr_msg_size}"
+        hist_dir = size_dir / "histograms"
+        box_dir = size_dir / "boxplots"
+        for directory in (size_dir, hist_dir, box_dir):
+            directory.mkdir(parents=True, exist_ok=True)
+
+        # Copy so the added columns do not touch the caller's frame.
+        per_size = df[df["coll_msg_size_bytes"] == msg_size].copy()
+        per_size["coll_start_datetime"] = per_size["coll_start_timestamp_us"].apply(timestamp_to_datetime)
+        per_size["coll_end_datetime"] = per_size["coll_end_timestamp_us"].apply(timestamp_to_datetime)
+        per_size["coll_duration_human"] = per_size["coll_duration_us"].apply(microseconds_to_human_readable)
+
+        generate_histogram(
+            per_size,
+            comm_type,
+            coll_type,
+            hist_dir / f"histogram_{comm_type}_{coll_type}_{msg_size}.png",
+            hr_msg_size,
+        )
+        generate_boxplot(
+            per_size,
+            comm_type,
+            coll_type,
+            box_dir / f"boxplot_{comm_type}_{coll_type}_{msg_size}.png",
+            hr_msg_size,
+        )
+
+        csv_path = size_dir / f"summary_{comm_type}_{coll_type}_{msg_size}.csv"
+        per_size.to_csv(csv_path, index=False)
+        logging.info(
+            f"Summary for {comm_type}, {coll_type}, and msg_size {msg_size} written to {csv_path}"
+        )
+
+
+def generate_summary(output_root, comm_type, coll_type, output_dir_name):
+    """Summarize one comm/coll combination and, if data exists, render its visualizations."""
+    logging.info(f"Generating summary for {output_dir_name}, {comm_type} and {coll_type}")
+
+    summary_df = summarize_data_per_comm_coll_type(
+        output_root, comm_type, coll_type, output_dir_name
+    )
+    if summary_df is None:
+        logging.warning(f"No data found for {comm_type} and {coll_type} - skipping visualization generation")
+        return
+
+    generate_visualizations(summary_df, output_root, comm_type, coll_type)
+
+
+def generate_summary_wrapper(args):
+    """Adapter for ``executor.map``: expand one argument tuple into ``generate_summary``."""
+    positional = tuple(args)
+    return generate_summary(*positional)
+
+
+if __name__ == "__main__":
+    # Stage 1: locate the input logs and prepare the output directory and log file.
+    files, output_dir_name = get_log_files_and_output_dir()
+    print(f"Number of log files found: {len(files)}")
+    print(f"Output directory: {output_dir_name}")
+    output_dir = Path(output_dir_name)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    setup_logging(output_dir)
+
+    # Stage 2: convert every log file to parquet.
+    create_per_node_parquet_files(files, output_dir)
+
+    # Stage 3: one summary task per (communicator type, collective type) pair.
+    comm_types = ["single-rank", "nvlink-only", "hca-only", "mixed"]
+    coll_types = ["AllReduce", "AllGather", "ReduceScatter", "Broadcast"]
+    summary_args = []
+    for comm_type in comm_types:
+        for coll_type in coll_types:
+            summary_args.append((output_dir, comm_type, coll_type, output_dir_name))
+
+    max_workers = min(64, len(summary_args), os.cpu_count() or 1)
+    with ProcessPoolExecutor(max_workers=max_workers) as executor:
+        pending = executor.map(generate_summary_wrapper, summary_args)
+        list(
+            tqdm(
+                pending,
+                total=len(summary_args),
+                desc="Generating summaries",
+            )
+        )
+        print("Done!")
@@ -0,0 +1,6 @@
+pandas>=1.3.0
+tqdm>=4.60.0
+duckdb>=0.8.0
+matplotlib>=3.3.0
+pyarrow>=5.0.0
+numpy>=1.21.0
@@ -0,0 +1,198 @@
+#pragma once
+
+#include <pthread.h>
+
+#include "json.h"
+#include "common.h"
+#include "version.h"
+
+// Capacity of the per-channel kernelCh[] arrays in the tracking structs below.
+#define MAX_CHANNELS                     64
+
+/*
+ * Evaluates `call`, stores its inspectorResult_t in `res`, and on any result
+ * other than inspectorSuccess logs the location and error string, then jumps
+ * to `label`.
+ *
+ * The expansion deliberately does NOT end in a semicolon: the caller's own
+ * `;` completes the do/while statement, so the macro can be used safely as
+ * the body of an unbraced if/else.
+ */
+#define INS_CHK_GOTO(call, res, label)                                  \
+  do {                                                                  \
+    (res) = (call);                                                     \
+    if (inspectorSuccess != (res)) {                                    \
+      INFO(NCCL_INSPECTOR, "%s:%d -> error %d: %s", __FILE__, __LINE__, (res), \
+           inspectorErrorString(res));                                  \
+      goto label;                                                       \
+    }                                                                   \
+  } while (0)
+
+
+// Identifiers for NCCL operations, numbered 0..7; ncclNumFuncs (8) is the
+// number of identifiers.
+// NOTE(review): presumably mirrors NCCL's internal ncclFunc_t numbering - confirm.
+typedef enum {
+  ncclFuncBroadcast = 0,
+  ncclFuncReduce = 1,
+  ncclFuncAllGather = 2,
+  ncclFuncReduceScatter = 3,
+  ncclFuncAllReduce = 4,
+  ncclFuncSendRecv = 5,
+  ncclFuncSend = 6,
+  ncclFuncRecv = 7,
+  ncclNumFuncs = 8
+} ncclFunc_t;
+
+// Result codes returned by the inspector functions. inspectorSuccess is 0;
+// the remaining enumerators are numbered implicitly from 1 in declaration
+// order. INS_CHK_GOTO treats every value other than inspectorSuccess as a
+// failure.
+typedef enum {
+  inspectorSuccess = 0,
+  inspectorUninitializedError,
+  inspectorMemoryError,
+  inspectorFileOpenError,
+  inspectorDisabledError,
+  inspectorLockError,
+  inspectorPthreadError,
+  inspectorJsonError,
+  inspectorCudaError,
+  inspectorBadHash,
+  inspectorDeleteUnknownCommError,
+  inspectorAddDuplicateCommError,
+  inspectorNop,
+  inspectorNullTally,
+  inspectorGlobalInitError,
+  inspectorReturn,
+} inspectorResult_t;
+
+// Identifies which measurement a collective's execution time came from.
+// NOTE(review): presumably reported as "coll_timing_source" in the JSON
+// output (e.g. "kernel_gpu") - confirm against the dump code.
+typedef enum {
+  inspectorTimingSourceKernelGpu = 0,
+  inspectorTimingSourceKernelCpu = 1,
+  inspectorTimingSourceCollectiveCpu = 2,
+} inspectorTimingSource_t;
+
+// One recorded trace event.
+struct inspectorEventTraceInfo {
+  uint64_t ts;  // timestamp of the event
+  uint64_t sn;  // sequence number assigned to the event
+};
+
+// Indices into inspectorEventTrkCollInfo::evntTrace; the _NEVT enumerator is
+// the array length, not an event.
+typedef enum {
+  NCCL_INSP_EVT_TRK_COLL_START = 0,
+  NCCL_INSP_EVT_TRK_COLL_STOP = 1,
+  NCCL_INSP_EVT_TRK_COLL_NEVT = 2,
+} inspectorEventTrkColl_t;
+
+// Indices into inspectorEventTrkKernelInfo::evntTrace; the _NEVT enumerator
+// is the array length, not an event.
+typedef enum {
+  NCCL_INSP_EVT_TRK_KERNEL_START = 0,
+  NCCL_INSP_EVT_TRK_KERNEL_STOP = 1,
+  NCCL_INSP_EVT_TRK_KERNEL_RECORD = 2,
+  NCCL_INSP_EVT_TRK_KERNEL_NEVT = 3,
+} inspectorEventTrkKernel_t;
+
+// Trace events of one kernel channel, indexed by inspectorEventTrkKernel_t.
+struct inspectorEventTrkKernelInfo {
+  struct inspectorEventTraceInfo evntTrace[NCCL_INSP_EVT_TRK_KERNEL_NEVT];
+};
+
+// Event-trace state for one collective: collective-level events plus one
+// slot of kernel events per channel.
+struct inspectorEventTrkCollInfo {
+  // Running event counter.
+  // NOTE(review): declared int while inspectorEventTraceInfo::sn is uint64_t.
+  int sn;
+  uint32_t nChannels;
+  struct inspectorEventTraceInfo evntTrace[NCCL_INSP_EVT_TRK_COLL_NEVT];  // indexed by inspectorEventTrkColl_t
+  struct inspectorEventTrkKernelInfo kernelCh[MAX_CHANNELS];
+};
+
+// Performance record of a completed collective.
+// NOTE(review): fields presumably correspond to the "coll_perf" JSON object
+// (coll, coll_sn, coll_msg_size_bytes, ...) - confirm against the dump code.
+struct inspectorCompletedCollInfo {
+  ncclFunc_t func;
+  uint64_t sn;
+  size_t msgSizeBytes;
+  uint64_t execTimeUsecs;
+  inspectorTimingSource_t timingSource;
+  double algoBwGbs;
+  double busBwGbs;
+  // Event trace information
+  struct inspectorEventTrkCollInfo collEvtTrk;
+};
+
+// Size in bytes of the commHashStr buffer in inspectorCommInfo.
+// NOTE(review): presumably 16 hex digits plus the terminating NUL - confirm.
+enum {
+  NCCL_COMM_HASH_LENGTH = 17
+};
+
+// Per-communicator inspector state. Instances carry a `next` pointer, so they
+// can be chained into a singly linked list.
+struct inspectorCommInfo {
+  struct inspectorCommInfo* next;
+
+  const char* commName;
+  uint64_t commHash;
+  char commHashStr[NCCL_COMM_HASH_LENGTH];
+  int rank;
+  int nranks;
+  int nnodes;
+
+  // NOTE(review): presumably set when completedCollInfo holds a record that
+  // still has to be written out - confirm against the dump code.
+  bool dump;
+  struct inspectorCompletedCollInfo completedCollInfo;
+  // NOTE(review): presumably protects the fields of this struct - confirm.
+  pthread_rwlock_t guard;
+};
+
+// State of one kernel channel belonging to a collective.
+struct inspectorKernelChInfo {
+  uint64_t type;
+  int refCount; /*unused*/
+  struct inspectorCollInfo *collInfo;  // collective this channel belongs to
+  uint8_t channelId;
+  uint64_t tsStartUsec;
+  uint64_t tsCompletedUsec;
+  uint64_t startGpuClk;
+  uint64_t stopGpuClk;
+};
+
+// In-flight state of one collective. The object is reference counted via
+// refCount and carries its own rwlock (`guard`).
+struct inspectorCollInfo {
+  uint64_t type;
+  int refCount;
+  struct inspectorCommInfo *commInfo;  // communicator the collective runs on
+  const char* func;
+  uint64_t sn;
+  size_t msgSizeBytes;
+  uint64_t tsStartUsec;
+  uint64_t tsCompletedUsec;
+  uint32_t nChannels;
+  uint32_t nKernelChStarted;
+  uint32_t nKernelChCompleted;
+  pthread_rwlock_t guard;
+  struct inspectorKernelChInfo kernelCh[MAX_CHANNELS];
+  struct inspectorEventTrkCollInfo collEvtTrk;
+};
+
+
+
+// Logger callback used by the macros below; defined in another translation unit.
+extern ncclDebugLogger_t logFn;
+// Logging shorthands that forward to logFn. VERSION and WARN pass __FILE__ as
+// the location argument, INFO passes __func__.
+#define VERSION(...) logFn(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
+#define INFO(FLAGS, ...) logFn(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
+#define WARN(...) logFn(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
+
+// Returns the size in bytes of one element of the given NCCL data type, or -1
+// when the type is not one of the cases listed below.
+inline int ncclTypeSize(ncclDataType_t type) {
+  int elemBytes = -1;
+  switch (type) {
+  case ncclInt8:
+  case ncclUint8:
+  case ncclFloat8e4m3:
+  case ncclFloat8e5m2:
+    elemBytes = 1;
+    break;
+  case ncclFloat16:
+  case ncclBfloat16:
+    elemBytes = 2;
+    break;
+  case ncclInt32:
+  case ncclUint32:
+  case ncclFloat32:
+    elemBytes = 4;
+    break;
+  case ncclInt64:
+  case ncclUint64:
+  case ncclFloat64:
+    elemBytes = 8;
+    break;
+  default:
+    break;
+  }
+  return elemBytes;
+}
+
+// Returns a printable description of a result code (used in INS_CHK_GOTO's log line).
+const char* inspectorErrorString(inspectorResult_t result);
+
+// pthread_rwlock_t helpers; each reports its outcome as an inspectorResult_t.
+inspectorResult_t inspectorLockInit(pthread_rwlock_t* lockRef);
+inspectorResult_t inspectorLockDestroy(pthread_rwlock_t* lockRef);
+inspectorResult_t inspectorLockRd(pthread_rwlock_t* lockRef);
+inspectorResult_t inspectorLockWr(pthread_rwlock_t* lockRef);
+inspectorResult_t inspectorUnlockRWLock(pthread_rwlock_t* lockRef);
+// Process-wide setup and teardown of the inspector.
+inspectorResult_t inspectorGlobalInit(int rank);
+inspectorResult_t inspectorGlobalFinalize();
+// NOTE(review): presumably returns microseconds (results are stored in
+// *Usec fields) - confirm.
+uint64_t inspectorGetTime();
+// Registers / unregisters a communicator; inspectorAddComm returns the new
+// record through *commInfo.
+inspectorResult_t inspectorAddComm(struct inspectorCommInfo **commInfo,
+                                   const char* commName, uint64_t commHash,
+                                   int nNodes, int nranks, int rank);
+inspectorResult_t inspectorDelComm(struct inspectorCommInfo *commInfo);
+
+// NOTE(review): presumably fills completedColl from the in-flight collInfo - confirm.
+void inspectorUpdateCollPerf(struct inspectorCompletedCollInfo *completedColl,
+                             struct inspectorCollInfo *collInfo);
+// Maps a data type name to the ncclDataType_t value.
+ncclDataType_t inspectorStringToDatatype(const char* str);
+
+// NOTE(review): presumably computes algoBwGbs / busBwGbs of completedColl - confirm.
+void inspectorComputeCollBw(struct inspectorCommInfo *commInfo,
+                            struct inspectorCompletedCollInfo *completedColl,
+                            ncclFunc_t collType);
@@ -0,0 +1,493 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <stdio.h>
+#include <pthread.h>
+#include <string.h>
+#include <linux/limits.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include "profiler.h"
+#include "inspector.h"
+
+// Marks a symbol with hidden ELF visibility.
+#define __hidden __attribute__ ((visibility("hidden")))
+
+// Number of successful inspectorPluginInit calls not yet matched by
+// inspectorPluginFinalize; read and written only while holding gLock.
+static int gInitialized;
+
+// Serializes updates of gInitialized and the global init/finalize calls.
+static pthread_mutex_t gLock = PTHREAD_MUTEX_INITIALIZER;
+
+
+/*
+ * Description:
+ *   Stamps one slot of an event trace array with the current time and the
+ *   next value of the collective's event sequence counter.
+ *
+ * Thread Safety:
+ *   Not thread-safe. The caller must already hold whatever lock protects
+ *   collInfo and the trace array.
+ *
+ * Input:
+ *   struct inspectorEventTraceInfo* evtTrace - event trace array
+ *   int eventIndex - slot to fill (must be within the array bounds)
+ *   struct inspectorCollInfo* collInfo - owning collective (must not be NULL)
+ *
+ * Output:
+ *   evtTrace[eventIndex] holds the timestamp and sequence number;
+ *   collInfo->collEvtTrk.sn has been incremented by one.
+ *
+ * Return:
+ *   uint64_t - the sequence number assigned to this event
+ */
+static uint64_t inspectorRecordEventTrace(struct inspectorEventTraceInfo* evtTrace,
+                                          int eventIndex,
+                                          struct inspectorCollInfo* collInfo) {
+  struct inspectorEventTraceInfo* slot = &evtTrace[eventIndex];
+
+  slot->ts = inspectorGetTime();
+  collInfo->collEvtTrk.sn += 1;  // advance the per-collective event counter
+  slot->sn = collInfo->collEvtTrk.sn;
+
+  return slot->sn;
+}
+
+/*
+ * Description:
+ *
+ *   Initializes the NCCL Inspector plugin for a communicator. The first
+ *   successful call also performs the global initialization.
+ *
+ * Thread Safety:
+ *   Thread-safe (the global reference count and global init/finalize are
+ *   serialized by gLock).
+ *
+ * Input:
+ *   void** context - pointer to plugin context.
+ *   uint64_t commHash - communicator hash.
+ *   int* eActivationMask - pointer to activation mask output.
+ *   const char* commName - communicator name (may be NULL).
+ *   int nNodes - number of nodes.
+ *   int nranks - number of ranks.
+ *   int rank - rank.
+ *   ncclDebugLogger_t logfn - logger function pointer.
+ *
+ * Output:
+ *   On success, context is set to the plugin context and eActivationMask
+ *   is set. On failure, eActivationMask is left untouched.
+ *
+ * Return:
+ *   ncclResult_t - ncclSuccess, or ncclInternalError on any failure.
+ *
+ */
+__hidden ncclResult_t inspectorPluginInit(void** context, uint64_t commHash,
+                                          int* eActivationMask,
+                                          const char* commName,
+                                          int nNodes, int nranks, int rank,
+                                          ncclDebugLogger_t logfn) {
+  inspectorResult_t res = inspectorSuccess;
+  *context = nullptr;
+  logFn = logfn;
+
+  pthread_mutex_lock(&gLock);
+  if (++gInitialized == 1) {
+    res = inspectorGlobalInit(rank);
+    if (res != inspectorSuccess) {
+      WARN("Inspector Init Failed %s:%d -> error %d: %s",__FILE__, __LINE__, res,
+           inspectorErrorString(res));
+      gInitialized = 0;
+      pthread_mutex_unlock(&gLock);
+      return ncclInternalError;
+    }
+  }
+  pthread_mutex_unlock(&gLock);
+
+  INS_CHK_GOTO(inspectorAddComm((struct inspectorCommInfo **)context,
+                                commName, commHash,
+                                nNodes, nranks, rank), res, fail);
+  *eActivationMask = ncclProfileColl | ncclProfileKernelCh;
+  INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d",
+       commName ? commName : "", commHash, nranks, rank);
+  return ncclSuccess;
+
+fail:
+  // This communicator was never registered, so inspectorPluginFinalize will
+  // not run for it. Give back the reference taken above; otherwise the count
+  // never returns to zero and inspectorGlobalFinalize is never called.
+  pthread_mutex_lock(&gLock);
+  if (--gInitialized == 0) {
+    inspectorGlobalFinalize();
+  }
+  pthread_mutex_unlock(&gLock);
+  return ncclInternalError;
+}
+
+/*
+ * Description:
+ *
+ *   Tears down the inspector state of one communicator and, when it was
+ *   the last initialized communicator, the global inspector state.
+ *
+ * Thread Safety:
+ *   Thread-safe (the global reference count is updated under gLock).
+ *
+ * Input:
+ *   void* context - plugin context returned by inspectorPluginInit.
+ *
+ * Output:
+ *   The communicator is removed; global state is finalized when the
+ *   reference count reaches zero.
+ *
+ * Return:
+ *   ncclResult_t - always ncclSuccess.
+ *
+ */
+__hidden ncclResult_t inspectorPluginFinalize(void* context) {
+  struct inspectorCommInfo* commInfo = (struct inspectorCommInfo *)context;
+  inspectorDelComm(commInfo);
+
+  pthread_mutex_lock(&gLock);
+  gInitialized -= 1;
+  if (gInitialized == 0) {
+    inspectorGlobalFinalize();
+  }
+  pthread_mutex_unlock(&gLock);
+
+  return ncclSuccess;
+}
+
+// Takes one reference on collInfo. Performs no locking itself; always
+// returns inspectorSuccess.
+inspectorResult_t inspectorPluginCollInfoRef(struct inspectorCollInfo *collInfo) {
+  ++collInfo->refCount;
+  return inspectorSuccess;
+}
+
+// Takes one reference on collInfo while holding its write lock; always
+// returns inspectorSuccess.
+inspectorResult_t inspectorPluginCollInfoRefSafe(struct inspectorCollInfo *collInfo) {
+  pthread_rwlock_t* guard = &collInfo->guard;
+
+  inspectorLockWr(guard);
+  inspectorPluginCollInfoRef(collInfo);
+  inspectorUnlockRWLock(guard);
+
+  return inspectorSuccess;
+}
+
+// Drops one reference on collInfo without locking. When the count reaches
+// zero the lock is destroyed, the object is zeroed and freed, and
+// inspectorReturn is returned; the caller must not touch collInfo afterwards.
+// Otherwise returns inspectorSuccess.
+inspectorResult_t inspectorPluginCollInfoDeRef(struct inspectorCollInfo *collInfo) {
+  collInfo->refCount -= 1;
+  if (collInfo->refCount != 0) {
+    return inspectorSuccess;
+  }
+
+  // Last reference gone: tear the object down.
+  inspectorLockDestroy(&collInfo->guard);
+  memset(collInfo, 0, sizeof(struct inspectorCollInfo));
+  free(collInfo);
+  return inspectorReturn;
+}
+
+// Drops one reference on collInfo while holding its write lock.
+//
+// Returns inspectorReturn when the last reference was dropped; collInfo (and
+// its guard) no longer exist at that point. Returns inspectorSuccess while
+// references remain.
+inspectorResult_t inspectorPluginCollInfoDeRefSafe(struct inspectorCollInfo *collInfo) {
+  inspectorLockWr(&collInfo->guard);
+  inspectorResult_t res = inspectorPluginCollInfoDeRef(collInfo);
+  // When the last reference was released, DeRef destroyed the guard and freed
+  // collInfo; unlocking here would be a use-after-free.
+  if (res != inspectorReturn) {
+    inspectorUnlockRWLock(&collInfo->guard);
+  }
+  return res;
+}
+
+/*
+ * Description:
+ *   Allocates and initializes a new inspectorCollInfo structure for a
+ *   collective event.
+ *
+ * Thread Safety:
+ *   Not thread-safe (allocates and initializes a new collective info
+ *   structure; the guard is only initialized at the very end).
+ *
+ * Input:
+ *
+ *   struct inspectorCollInfo **collInfo - pointer to output
+ *   collective info struct.
+ *   ncclProfilerEventDescr_t *eDescr - event descriptor.
+ *   struct inspectorCommInfo *commInfo - communicator info stored in
+ *   the new structure's commInfo field.
+ *
+ * Output:
+ *   collInfo is set to the new collective info struct, or to nullptr
+ *   when the allocation fails.
+ *
+ * Return:
+ *   None.
+ */
+static void inspectorPluginCollInfoInit(struct inspectorCollInfo **collInfo,
+                                        ncclProfilerEventDescr_t *eDescr,
+                                        struct inspectorCommInfo *commInfo) {
+  // calloc zero-fills, so every counter and timestamp not set below starts at 0.
+  struct inspectorCollInfo *collInfoPtr
+    = (struct inspectorCollInfo*)calloc(1, sizeof(struct inspectorCollInfo));
+  if (collInfoPtr == nullptr) {
+    WARN("Inspector: Failed to allocate memory for collective info structure");
+    *collInfo = nullptr;
+    return;
+  }
+  collInfoPtr->type = ncclProfileColl;
+  collInfoPtr->refCount = 0;
+  inspectorPluginCollInfoRef(collInfoPtr); //self ref; no locks needed
+  collInfoPtr->func = eDescr->coll.func;
+  collInfoPtr->sn = eDescr->coll.seqNumber;
+  collInfoPtr->nChannels = eDescr->coll.nChannels;
+  if (collInfoPtr->nChannels > 0) {
+    inspectorPluginCollInfoRef(collInfoPtr); //extra ref for kernel completion
+  }
+  collInfoPtr->tsStartUsec = inspectorGetTime();
+  // Message size in bytes = element size of the datatype * element count.
+  collInfoPtr->msgSizeBytes =
+    ncclTypeSize(inspectorStringToDatatype(eDescr->coll.datatype)) * eDescr->coll.count;
+
+
+  collInfoPtr->commInfo = commInfo;
+  collInfoPtr->collEvtTrk.sn = 0;
+  collInfoPtr->collEvtTrk.nChannels = collInfoPtr->nChannels;
+  inspectorRecordEventTrace(collInfoPtr->collEvtTrk.evntTrace,
+                            NCCL_INSP_EVT_TRK_COLL_START, collInfoPtr);
+
+  // The guard is initialized last, just before the object is handed out.
+  inspectorLockInit(&collInfoPtr->guard);
+  *collInfo = collInfoPtr;
+}
+
+/*
+ * Description:
+ *
+ *   Initializes the inspectorKernelChInfo slot for a kernel channel
+ *   event. The slot lives inside the parent collective's kernelCh
+ *   array, indexed by the event's channelId; nothing is allocated.
+ *
+ * Thread Safety:
+ *   Takes the parent collective's write lock while updating it.
+ *
+ * Input:
+ *   struct inspectorKernelChInfo **kernelChInfo - pointer to output
+ *   kernel channel info struct.
+ *   ncclProfilerEventDescr_t *eDescr - event descriptor; its parentObj
+ *   must point at an object whose first 8 bytes hold the event type.
+ *
+ * Output:
+ *
+ *   kernelChInfo is set to the kernel channel info slot. It is left
+ *   untouched when there is no parent or the parent is not a
+ *   collective.
+ *
+ * Return:
+ *   None.
+ */
+static void inspectorPluginKernelChInfoInit(struct inspectorKernelChInfo **kernelChInfo,
+                                            ncclProfilerEventDescr_t *eDescr) {
+  if (eDescr->parentObj) {
+    // Read the leading type tag of the parent object to learn what it is.
+    uint64_t parentType=*(uint64_t*)eDescr->parentObj;
+    if (parentType == ncclProfileColl) {
+      struct inspectorCollInfo *collInfo = (struct inspectorCollInfo*)eDescr->parentObj;
+      if (collInfo && collInfo->type == ncclProfileColl) {
+        inspectorLockWr(&collInfo->guard);
+        // NOTE(review): channelId is used as an array index without a bounds
+        // check here - confirm it is always below the kernelCh array size.
+        struct inspectorEventTraceInfo *krnlEvtTrk =
+          collInfo->collEvtTrk.kernelCh[eDescr->kernelCh.channelId].evntTrace;
+        inspectorRecordEventTrace(krnlEvtTrk,
+                                  NCCL_INSP_EVT_TRK_KERNEL_START,
+                                  collInfo);
+        struct inspectorKernelChInfo *kernelChInfoPtr
+          = &collInfo->kernelCh[eDescr->kernelCh.channelId];
+        kernelChInfoPtr->type = ncclProfileKernelCh;
+        kernelChInfoPtr->channelId = eDescr->kernelCh.channelId;
+        kernelChInfoPtr->startGpuClk = eDescr->kernelCh.pTimer;
+        // Only take the "record" reference when no stop clock has been stored
+        // for this channel yet.
+        if (kernelChInfoPtr->stopGpuClk == 0) {
+          inspectorPluginCollInfoRef(collInfo); //Pairs with Record Kernel Stop event
+        }
+        kernelChInfoPtr->tsStartUsec = inspectorGetTime();
+        // The first kernel channel to start overrides the collective's start time.
+        if (collInfo->nKernelChStarted == 0) {
+          collInfo->tsStartUsec = kernelChInfoPtr->tsStartUsec;
+        }
+        collInfo->nKernelChStarted += 1;
+        inspectorPluginCollInfoRef(collInfo); //Pairs with Stop Kernel Event
+        kernelChInfoPtr->collInfo = collInfo;
+
+        *kernelChInfo = kernelChInfoPtr;
+        inspectorUnlockRWLock(&collInfo->guard);
+      }
+    }
+  }
+}
+/*
+ * Description:
+ *
+ *   Starts a profiling event for the NCCL Inspector plugin. Only
+ *   ncclProfileColl and ncclProfileKernelCh events are tracked; every
+ *   other event type is ignored.
+ *
+ * Thread Safety:
+ *   Thread-safe (allocates and initializes event structures).
+ *
+ * Input:
+ *   void* context - plugin context (the communicator info).
+ *   void** eHandle - pointer to event handle output.
+ *   ncclProfilerEventDescr_t* eDescr - event descriptor.
+ *
+ * Output:
+ *   eHandle is set to the new event structure, or to nullptr when no
+ *   event is tracked (unsupported type, missing context/descriptor,
+ *   allocation failure).
+ *
+ * Return:
+ *   ncclResult_t - always ncclSuccess; profiling failures never fail
+ *   the caller.
+ *
+ */
+__hidden ncclResult_t inspectorPluginStartEvent(void* context,
+                                                void** eHandle,
+                                                ncclProfilerEventDescr_t* eDescr) {
+  // Without somewhere to store the handle there is nothing to do.
+  if (eHandle == nullptr) {
+    INFO(NCCL_INIT, "Profiler/Plugin: eHandle NULL for start event %s", __func__);
+    return ncclSuccess;
+  }
+  // Clear the handle up front so that every early return leaves the caller
+  // with a well-defined "no event" value rather than a stale pointer.
+  *eHandle = nullptr;
+  if (context == nullptr || eDescr == nullptr) {
+    INFO(NCCL_INIT, "Profiler/Plugin: context/eDescr NULL for start event %s", __func__);
+    return ncclSuccess;
+  }
+  if (eDescr->type == ncclProfileColl) {
+    struct inspectorCollInfo *collEvent = nullptr;
+    struct inspectorCommInfo *commInfoCtx = (struct inspectorCommInfo*)context;
+    inspectorPluginCollInfoInit(&collEvent, eDescr, commInfoCtx);
+    *eHandle = collEvent;
+  } else if (eDescr->type == ncclProfileKernelCh) {
+    struct inspectorKernelChInfo *kernelChEvent = nullptr;
+    inspectorPluginKernelChInfoInit(&kernelChEvent, eDescr);
+    *eHandle = kernelChEvent;
+  }
+  return ncclSuccess;
+}
+
+/*
+ * Description:
+ *
+ *   Stops a profiling event for the NCCL Inspector plugin. The handle's
+ *   leading 64-bit type tag selects between collective and kernel
+ *   channel handling.
+ *
+ * Thread Safety:
+ *
+ *   Thread-safe (updates event state and performance info under the
+ *   collective's and communicator's write locks).
+ *
+ * Input:
+ *
+ *   void *eHandle - event handle.
+ *
+ * Output:
+ *
+ *   Event is stopped and performance info may be updated. When the
+ *   last kernel channel of a collective completes, the completed
+ *   collective is copied into the communicator and flagged for dump.
+ *
+ * Return:
+ *   ncclResult_t - always ncclSuccess.
+ *
+ */
+__hidden ncclResult_t inspectorPluginStopEvent(void *eHandle) {
+
+  if (eHandle == nullptr) {
+    INFO(NCCL_INIT,
+         "Profiler/Plugin: Event Handle NULL for stop event %s", __func__);
+    return ncclSuccess;
+  }
+  uint64_t type = *(uint64_t *)eHandle;
+  inspectorResult_t res = inspectorSuccess;
+
+  if (type == ncclProfileColl) {
+    struct inspectorCollInfo *collInfo = (struct inspectorCollInfo *)eHandle;
+    // Record collective stop event
+    inspectorLockWr(&collInfo->guard);
+    inspectorRecordEventTrace(collInfo->collEvtTrk.evntTrace,
+                              NCCL_INSP_EVT_TRK_COLL_STOP,
+                              collInfo);
+    res = inspectorPluginCollInfoDeRef(collInfo);
+    if (res == inspectorReturn) {
+      // collInfo (and its guard) were freed by DeRef: do not unlock.
+      // WARN("NCCL Inspector unnatural return: inspectorPluginStopEvent:ncclProfileColl");
+      return ncclSuccess;
+    }
+    inspectorUnlockRWLock(&collInfo->guard);
+    return ncclSuccess;
+  } else if (type == ncclProfileKernelCh) {
+    struct inspectorKernelChInfo *kernelChInfo
+      = (struct inspectorKernelChInfo *)eHandle;
+    struct inspectorCollInfo *collInfo = kernelChInfo->collInfo;
+    if (collInfo && collInfo->type == ncclProfileColl) {
+      inspectorLockWr(&collInfo->guard);
+      struct inspectorEventTraceInfo *krnlEvtTrk =
+        collInfo->collEvtTrk.kernelCh[kernelChInfo->channelId].evntTrace;
+      inspectorRecordEventTrace(krnlEvtTrk,
+                                NCCL_INSP_EVT_TRK_KERNEL_STOP,
+                                collInfo);
+      kernelChInfo->tsCompletedUsec = inspectorGetTime();
+      collInfo->nKernelChCompleted += 1;
+
+      // Release the reference taken when this kernel channel started.
+      res = inspectorPluginCollInfoDeRef(collInfo);
+      if (res == inspectorReturn) {
+        // collInfo is gone (kernelChInfo lived inside it): nothing to unlock.
+        WARN("NCCL Inspector unnatural return: inspectorPluginStopEvent:ncclProfileKernelCh");
+        return ncclSuccess;
+      }
+      // All channels started and completed: the collective is finished.
+      if ((collInfo->nKernelChCompleted == collInfo->nKernelChStarted)
+          && (collInfo->nKernelChCompleted == collInfo->nChannels)) {
+        struct inspectorCompletedCollInfo completedColl;
+        // Capture commInfo now; collInfo may be freed by the DeRef below.
+        struct inspectorCommInfo *commInfo = collInfo->commInfo;
+        collInfo->tsCompletedUsec = kernelChInfo->tsCompletedUsec;
+        inspectorUpdateCollPerf(&completedColl, collInfo);
+
+        // Release the extra "kernel completion" reference.
+        res = inspectorPluginCollInfoDeRef(collInfo);
+        if (res != inspectorReturn) {
+          inspectorUnlockRWLock(&collInfo->guard);
+        }
+        // From here on only the stack copy (completedColl) is used.
+        if (commInfo != nullptr) {
+          inspectorLockWr(&commInfo->guard);
+          inspectorComputeCollBw(commInfo,
+                                 &completedColl,
+                                 completedColl.func);
+          memcpy(&commInfo->completedCollInfo,
+                 &completedColl,
+                 sizeof(struct inspectorCompletedCollInfo));
+          commInfo->dump = true;
+          inspectorUnlockRWLock(&commInfo->guard);
+        }
+        return ncclSuccess;
+      }
+      inspectorUnlockRWLock(&collInfo->guard);
+    }
+    return ncclSuccess;
+  }
+  return ncclSuccess;
+}
+
+/*
+ * Description:
+ *
+ *   Records the state of a profiling event for the NCCL Inspector
+ *   plugin. Only the ncclProfilerKernelChStop state of kernel channel
+ *   events is handled; everything else is ignored.
+ *
+ * Thread Safety:
+ *
+ *   Thread-safe (updates event state under the parent collective's
+ *   write lock).
+ *
+ * Input:
+ *   void* eHandle - event handle.
+ *   ncclProfilerEventState_t eState - event state.
+ *   ncclProfilerEventStateArgs_t* eStateArgs - event state arguments.
+ *
+ * Output:
+ *   Event state is updated as needed.
+ *
+ * Return:
+ *   ncclResult_t - always ncclSuccess.
+ *
+ */
+__hidden ncclResult_t inspectorPluginRecordEventState(void* eHandle,
+                                                      ncclProfilerEventState_t eState,
+                                                      ncclProfilerEventStateArgs_t* eStateArgs) {
+  if (eHandle == nullptr || eStateArgs == nullptr)
+    return ncclSuccess;
+
+  // Every handle produced by this plugin starts with a 64-bit type tag.
+  uint64_t type = *(uint64_t *)eHandle;
+
+  if (type == ncclProfileKernelCh && eState == ncclProfilerKernelChStop) {
+    struct inspectorKernelChInfo *kernelChInfo = (struct inspectorKernelChInfo *)eHandle;
+    struct inspectorCollInfo *collInfo = kernelChInfo->collInfo;
+    inspectorResult_t res = inspectorSuccess;
+    if (collInfo && collInfo->type == ncclProfileColl) {
+      inspectorLockWr(&collInfo->guard);
+      struct inspectorEventTraceInfo *krnlEvtTrk
+        = collInfo->collEvtTrk.kernelCh[kernelChInfo->channelId].evntTrace;
+      inspectorRecordEventTrace(krnlEvtTrk,
+                                NCCL_INSP_EVT_TRK_KERNEL_RECORD,
+                                collInfo);
+      kernelChInfo->stopGpuClk = eStateArgs->kernelCh.pTimer;
+      // Release the reference taken at kernel start, but only if a start
+      // clock was actually recorded for this channel.
+      if (kernelChInfo->startGpuClk != 0) {
+        res = inspectorPluginCollInfoDeRef(collInfo);
+        if (res == inspectorReturn) {
+          // collInfo was freed together with its guard: skip the unlock.
+          WARN("NCCL Inspector unnatural return: inspectorPluginRecordEventState");
+          return ncclSuccess;
+        }
+      }
+      inspectorUnlockRWLock(&collInfo->guard);
+    }
+  }
+  return ncclSuccess;
+}
+
+// Plugin descriptor for the Inspector profiler.
+// Positional initializer: the order must match the field order of
+// ncclProfiler_t (declared in profiler_v5.h, not visible here - verify when
+// changing either side).
+ncclProfiler_t ncclProfiler_v5 = {
+  "Inspector",
+  inspectorPluginInit,
+  inspectorPluginStartEvent,
+  inspectorPluginStopEvent,
+  inspectorPluginRecordEventState,
+  inspectorPluginFinalize,
+};
@@ -0,0 +1,496 @@
+#include "json.h"
+#include <assert.h>
+#include <math.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// Maps a jsonResult_t to its symbolic name; values outside the enum yield
+// "unknown json error".
+const char* jsonErrorString(jsonResult_t res) {
+  const char* name = "unknown json error";
+  switch (res) {
+  case jsonSuccess:                   name = "jsonSuccess"; break;
+  case jsonFileError:                 name = "jsonFileError"; break;
+  case jsonUnknownStateError:         name = "jsonUnknownStateError"; break;
+  case jsonEmptyStateError:           name = "jsonEmptyStateError"; break;
+  case jsonExpectedNonNoneStateError: name = "jsonExpectedNonNoneStateError"; break;
+  case jsonMemoryError:               name = "jsonMemoryError"; break;
+  case jsonStringOverflowError:       name = "jsonStringOverflowError"; break;
+  case jsonStringBadChar:             name = "jsonStringBadChar"; break;
+  case jsonLockError:                 name = "jsonLockError"; break;
+  default:                            break;
+  }
+  return name;
+}
+
+// Per-output writer state: a stack of nesting states describing where in
+// the document we are currently writing, plus the destination file.
+typedef struct jsonFileOutput {
+  jsonState_t* states; // State stack storage (NULL until first push)
+  size_t state_cap; // Allocated stack capacity
+  size_t state_n;   // # of items in the stack.
+  FILE* fp;         // Destination file, opened for writing
+  pthread_mutex_t mutex; // Taken/released by jsonLockOutput/jsonUnlockOutput
+} jsonFileOutput;
+
+// Creates a JSON writer that truncates and writes to 'outfile'.
+//
+// On success *jfo receives the new writer, which must be released with
+// jsonFinalizeFileOutput. On any failure *jfo is set to NULL and nothing is
+// leaked.
+//
+// Returns jsonMemoryError (allocation failed), jsonLockError (mutex could not
+// be created), jsonFileError (file could not be opened) or jsonSuccess.
+jsonResult_t jsonInitFileOutput(jsonFileOutput** jfo, const char* outfile) {
+  // Give the caller a well-defined handle on every failure path.
+  *jfo = NULL;
+
+  jsonFileOutput* new_jfo = (jsonFileOutput*)malloc(sizeof(jsonFileOutput));
+  if (new_jfo == NULL) {
+    return jsonMemoryError;
+  }
+  if (pthread_mutex_init(&new_jfo->mutex, NULL) != 0) {
+    free(new_jfo);
+    return jsonLockError;
+  }
+  new_jfo->states = NULL;
+  new_jfo->state_cap = 0;
+  new_jfo->state_n = 0;
+  new_jfo->fp = fopen(outfile, "w");
+  if (new_jfo->fp == NULL) {
+    // The mutex was already created above; release it before bailing out.
+    pthread_mutex_destroy(&new_jfo->mutex);
+    free(new_jfo);
+    return jsonFileError;
+  }
+  *jfo = new_jfo;
+  return jsonSuccess;
+}
+
+// Emits a raw line break into the output; does not touch the state stack.
+jsonResult_t jsonNewline(jsonFileOutput* jfo) {
+  FILE* const dst = jfo->fp;
+  fprintf(dst, "\n");
+  return jsonSuccess;
+}
+
+// Flushes buffered output to the underlying file. The fflush result is
+// intentionally not inspected; this always reports success.
+jsonResult_t jsonFlushOutput(jsonFileOutput* jfo) {
+  FILE* const dst = jfo->fp;
+  (void)fflush(dst);
+  return jsonSuccess;
+}
+
+// Acquires the writer's mutex; jsonLockError if the lock call fails.
+jsonResult_t jsonLockOutput(jsonFileOutput* jfo) {
+  const int rc = pthread_mutex_lock(&jfo->mutex);
+  return (rc == 0) ? jsonSuccess : jsonLockError;
+}
+
+// Releases the writer's mutex; jsonLockError if the unlock call fails.
+jsonResult_t jsonUnlockOutput(jsonFileOutput* jfo) {
+  const int rc = pthread_mutex_unlock(&jfo->mutex);
+  return (rc == 0) ? jsonSuccess : jsonLockError;
+}
+
+// Releases a writer created by jsonInitFileOutput: destroys its mutex, frees
+// the state stack, closes the file (flushing buffered output) and frees the
+// writer itself. 'jfo' must not be used afterwards.
+//
+// All resources are released even when the mutex cannot be destroyed; in
+// that case jsonLockError is returned, otherwise jsonSuccess.
+jsonResult_t jsonFinalizeFileOutput(jsonFileOutput* jfo) {
+  // Really should probably complain if we aren't in a valid state
+
+  // Remember the outcome but keep going: bailing out here would leak the
+  // state stack and leave the file unflushed and open.
+  const int destroy_rc = pthread_mutex_destroy(&jfo->mutex);
+
+  free(jfo->states); // free(NULL) is a no-op
+  jfo->states = NULL;
+  jfo->state_cap = 0;
+  jfo->state_n = 0;
+  if (jfo->fp) {
+    fclose(jfo->fp);
+    jfo->fp = 0;
+  }
+
+  free(jfo);
+  return (destroy_rc != 0) ? jsonLockError : jsonSuccess;
+}
+
+// Copies one multi-byte (2-4 byte) UTF-8 sequence from 'in' to 'out'.
+// Returns the number of bytes copied, or 0 when the lead byte is not a
+// multi-byte lead, a continuation byte is malformed, or fewer than the
+// required bytes fit in 'out_lim'. No terminator is written.
+static int utf8copy(unsigned char* out, int out_lim, const unsigned char* in) {
+  const unsigned char lead = in[0];
+  int seq_len = 0;
+
+  // Classify the lead byte.
+  if ((lead & 0xE0) == 0xC0) {
+    seq_len = 2;
+  } else if ((lead & 0xF0) == 0xE0) {
+    seq_len = 3;
+  } else if ((lead & 0xF8) == 0xF0) {
+    seq_len = 4;
+  } else {
+    return 0; // Invalid start byte
+  }
+
+  // Validate continuation bytes left to right, stopping at the first bad
+  // one so we never read past an embedded terminator.
+  for (int k = 1; k < seq_len; ++k) {
+    if ((in[k] & 0xC0) != 0x80) {
+      return 0;
+    }
+  }
+  if (out_lim < seq_len) {
+    return 0;
+  }
+
+  int k = 0;
+  while (k < seq_len) {
+    out[k] = in[k];
+    ++k;
+  }
+  return seq_len;
+}
+
+// This tries to sanitize/quote a string from 'in' into 'out',
+// assuming 'out' has room for 'lim' bytes including the terminating NUL.
+// We backslash-escape ", /, \, tab and newline, copy printable ASCII and
+// well-formed multi-byte UTF-8 sequences through unchanged, and bail on
+// any other control character or malformed UTF-8.
+// 'in' should be null-terminated, of course.
+//
+// Returns jsonSuccess when all of 'in' was copied, jsonStringOverflowError
+// when it does not fit, or jsonStringBadChar for unhandled characters.
+// Whenever lim > 0, 'out' is left NUL-terminated with what was copied so far.
+static jsonResult_t sanitizeJson(unsigned char out[], int lim, const unsigned char* in) {
+  if (lim <= 0) {
+    // No room even for the terminator; do not touch 'out'.
+    return jsonStringOverflowError;
+  }
+  int c = 0;
+  while (*in) {
+    // Need room for at least one more byte plus the terminator.
+    if (c + 1 >= lim) {
+      out[c] = 0;
+      return jsonStringOverflowError;
+    }
+    switch (*in) {
+    case '"':
+    case '\\':
+    case '/':
+    case '\t':
+    case '\n':
+      // An escape takes two bytes and must still leave room for the
+      // terminator; '>' here would let the final NUL land at out[lim].
+      if (c + 2 >= lim) {
+        out[c] = 0;
+        return jsonStringOverflowError;
+      }
+
+      out[c++] = '\\';
+      if (*in == '\n') {
+        out[c++] = 'n';
+      } else if (*in == '\t') {
+        out[c++] = 't';
+      } else {
+        out[c++] = *in;
+      }
+      ++in;
+      break;
+    default:
+      if (*in <= 0x1F) {
+        // Remaining control characters are not supported.
+        out[c] = 0;
+        return jsonStringBadChar;
+      } else if (*in <= 0x7F) {
+        out[c++] = *in;
+        ++in;
+      } else {
+        // Multi-byte UTF-8; reserve one byte for the terminator.
+        const int utf8len = utf8copy(out + c, lim - c - 1, in);
+        if (utf8len == 0) {
+          out[c] = 0;
+          return jsonStringBadChar;
+        }
+        c += utf8len;
+        in += utf8len;
+      }
+      break;
+    }
+  }
+  out[c] = 0;
+  return jsonSuccess;
+}
+
+// Returns the larger of two sizes ('a' when they are equal).
+static size_t max(size_t a, size_t b) {
+  return (a < b) ? b : a;
+}
+
+// Push state onto the state stack. Reallocate for extra storage if needed.
+// Because JSON_NONE is a pseudo-state, don't allow it to be pushed.
+//
+// Returns jsonExpectedNonNoneStateError for JSON_NONE, jsonMemoryError when
+// the stack cannot grow (the existing stack is left intact and usable), or
+// jsonSuccess.
+static jsonResult_t jsonPushState(jsonFileOutput* jfo, jsonState_t state) {
+  if (state == JSON_NONE) {
+    return jsonExpectedNonNoneStateError;
+  }
+  if (jfo->state_cap <= (jfo->state_n + 1)) {
+    const size_t new_cap = max((size_t)16, jfo->state_cap * 2);
+    // Grow into a temporary: on failure realloc leaves the old block alive,
+    // so overwriting jfo->states directly would leak it and desync state_cap.
+    jsonState_t* new_states =
+        (jsonState_t*)realloc(jfo->states, sizeof(jsonState_t) * new_cap);
+    if (new_states == NULL) {
+      return jsonMemoryError;
+    }
+    jfo->states = new_states;
+    jfo->state_cap = new_cap;
+  }
+  jfo->states[jfo->state_n++] = state;
+  return jsonSuccess;
+}
+
+// Peek at the top of the state stack; an empty stack reads as JSON_NONE.
+static jsonState_t jsonCurrState(const jsonFileOutput* jfo) {
+  const size_t depth = jfo->state_n;
+  return (depth > 0) ? jfo->states[depth - 1] : JSON_NONE;
+}
+
+// Overwrite the top-of-stack entry with 'state' (a pop followed by a push).
+// JSON_NONE is rejected first; an empty stack is then reported as
+// jsonEmptyStateError.
+static jsonResult_t jsonReplaceState(jsonFileOutput* jfo, jsonState_t state) {
+  jsonResult_t outcome = jsonSuccess;
+  if (state == JSON_NONE) {
+    outcome = jsonExpectedNonNoneStateError;
+  } else if (jfo->state_n == 0) {
+    outcome = jsonEmptyStateError;
+  } else {
+    jfo->states[jfo->state_n - 1] = state;
+  }
+  return outcome;
+}
+
+// Remove and return the top state; JSON_NONE signals an empty stack.
+static jsonState_t jsonPopState(jsonFileOutput* jfo) {
+  if (jfo->state_n > 0) {
+    jfo->state_n -= 1;
+    return jfo->states[jfo->state_n];
+  }
+  return JSON_NONE;
+}
+
+// Emit a key and separator. Sanitize the key.
+// This is only acceptable if the top state is an object.
+// Emit a ',' separator if we aren't the first item.
+//
+// Nothing is written and the state stack is left untouched when the current
+// state is not an object (jsonUnknownStateError) or the key cannot be
+// sanitized (jsonStringOverflowError / jsonStringBadChar).
+jsonResult_t jsonKey(jsonFileOutput* jfo, const char* name) {
+  const jsonState_t state = jsonCurrState(jfo);
+  if (state != JSON_OBJECT_EMPTY && state != JSON_OBJECT_SOME) {
+    return jsonUnknownStateError;
+  }
+
+  // Sanitize before emitting anything so a bad key cannot leave a dangling
+  // ',' (or a flipped EMPTY->SOME state) behind.
+  unsigned char tmp[2048];
+  const jsonResult_t res = sanitizeJson(tmp, sizeof(tmp), (const unsigned char*)name);
+  if (res != jsonSuccess) {
+    return res;
+  }
+
+  if (state == JSON_OBJECT_EMPTY) {
+    jsonReplaceState(jfo, JSON_OBJECT_SOME);
+  } else {
+    fprintf(jfo->fp, ",");
+  }
+  fprintf(jfo->fp, "\"%s\":", tmp);
+  // Propagate a failed push (e.g. out of memory) instead of hiding it.
+  return jsonPushState(jfo, JSON_KEY);
+}
+
+// Shared prologue for every value writer.
+// A value may appear after a key, at top level, or inside a list.
+// Inside a non-empty list a preceding ',' is emitted; a pending key is
+// consumed; any other state is rejected with jsonUnknownStateError.
+static jsonResult_t jsonValHelper(jsonFileOutput* jfo) {
+  const jsonState_t where = jsonCurrState(jfo);
+
+  if (where == JSON_LIST_EMPTY) {
+    jsonReplaceState(jfo, JSON_LIST_SOME);
+  } else if (where == JSON_LIST_SOME) {
+    fprintf(jfo->fp, ",");
+  } else if (where == JSON_KEY) {
+    jsonPopState(jfo);
+  } else if (where != JSON_NONE) {
+    return jsonUnknownStateError;
+  }
+  return jsonSuccess;
+}
+
+// Open an object ('{') at the current value position.
+jsonResult_t jsonStartObject(jsonFileOutput* jfo) {
+  const jsonResult_t prep = jsonValHelper(jfo);
+  if (prep == jsonSuccess) {
+    fprintf(jfo->fp, "{");
+    return jsonPushState(jfo, JSON_OBJECT_EMPTY);
+  }
+  return prep;
+}
+
+// Close the innermost object ('}'). The top state is popped regardless;
+// if it was not an object state, jsonUnknownStateError is returned.
+jsonResult_t jsonFinishObject(jsonFileOutput* jfo) {
+  const jsonState_t closed = jsonPopState(jfo);
+  if (closed != JSON_OBJECT_EMPTY && closed != JSON_OBJECT_SOME) {
+    return jsonUnknownStateError;
+  }
+  fprintf(jfo->fp, "}");
+  return jsonSuccess;
+}
+
+// Open a list ('[') at the current value position.
+jsonResult_t jsonStartList(jsonFileOutput* jfo) {
+  const jsonResult_t prep = jsonValHelper(jfo);
+  if (prep == jsonSuccess) {
+    fprintf(jfo->fp, "[");
+    return jsonPushState(jfo, JSON_LIST_EMPTY);
+  }
+  return prep;
+}
+
+// Close the innermost list (']'). The top state is popped regardless;
+// if it was not a list state, jsonUnknownStateError is returned.
+jsonResult_t jsonFinishList(jsonFileOutput* jfo) {
+  const jsonState_t closed = jsonPopState(jfo);
+  if (closed != JSON_LIST_EMPTY && closed != JSON_LIST_SOME) {
+    return jsonUnknownStateError;
+  }
+  fprintf(jfo->fp, "]");
+  return jsonSuccess;
+}
+
+// Emit the JSON literal null at the current value position.
+jsonResult_t jsonNull(jsonFileOutput* jfo) {
+  const jsonResult_t prep = jsonValHelper(jfo);
+  if (prep == jsonSuccess) {
+    fprintf(jfo->fp, "null");
+  }
+  return prep;
+}
+
+// Write a (sanitized) string value. A NULL 'str' is written as JSON null.
+//
+// Returns the sanitizer's error (jsonStringOverflowError /
+// jsonStringBadChar) without writing anything when 'str' cannot be encoded,
+// jsonUnknownStateError when a value is not allowed here, else jsonSuccess.
+jsonResult_t jsonStr(jsonFileOutput* jfo, const char* str) {
+  if (str == NULL) {
+    // Propagate the result: writing null can fail on an invalid state too.
+    return jsonNull(jfo);
+  }
+  // Sanitize first: jsonValHelper may emit a ',' and mutate the state stack,
+  // which must not happen if the string then turns out to be unwritable.
+  unsigned char tmp[2048];
+  const jsonResult_t san_res = sanitizeJson(tmp, sizeof(tmp), (const unsigned char*)str);
+  if (san_res != jsonSuccess) {
+    return san_res;
+  }
+  const jsonResult_t res = jsonValHelper(jfo);
+  if (res != jsonSuccess) {
+    return res;
+  }
+  fprintf(jfo->fp, "\"%s\"", tmp);
+  return jsonSuccess;
+}
+
+// Booleans are emitted as the quoted strings "true" / "false" (not as bare
+// JSON literals), by delegating to jsonStr.
+jsonResult_t jsonBool(jsonFileOutput* jfo, bool val) {
+  const char* text = "false";
+  if (val) {
+    text = "true";
+  }
+  return jsonStr(jfo, text);
+}
+
+// Emit a signed int in decimal at the current value position.
+jsonResult_t jsonInt(jsonFileOutput* jfo, const int val) {
+  const jsonResult_t prep = jsonValHelper(jfo);
+  if (prep == jsonSuccess) {
+    fprintf(jfo->fp, "%d", val);
+  }
+  return prep;
+}
+
+// Emit an unsigned 32-bit integer in decimal at the current value position.
+jsonResult_t jsonUint32(jsonFileOutput* jfo, const uint32_t val) {
+  const jsonResult_t prep = jsonValHelper(jfo);
+  if (prep == jsonSuccess) {
+    fprintf(jfo->fp, "%u", val);
+  }
+  return prep;
+}
+
+
+// Write an unsigned 64-bit integer value in decimal.
+jsonResult_t jsonUint64(jsonFileOutput* jfo, const uint64_t val) {
+  const jsonResult_t res = jsonValHelper(jfo);
+  if (res != jsonSuccess) {
+    return res;
+  }
+  // "%lu" only matches uint64_t where long is 64 bits wide; unsigned long
+  // long is at least 64 bits everywhere, so cast and use "%llu".
+  fprintf(jfo->fp, "%llu", (unsigned long long)val);
+  return jsonSuccess;
+}
+
+// Emit a size_t in decimal at the current value position.
+jsonResult_t jsonSize_t(jsonFileOutput* jfo, const size_t val) {
+  const jsonResult_t prep = jsonValHelper(jfo);
+  if (prep == jsonSuccess) {
+    fprintf(jfo->fp, "%zu", val);
+  }
+  return prep;
+}
+
+// Write a double value with six fractional digits.
+// JSON has no representation for NaN or infinities, so those are written as
+// the quoted strings "nan", "inf" and "-inf" to keep the document parseable.
+jsonResult_t jsonDouble(jsonFileOutput* jfo, const double val) {
+  const jsonResult_t res = jsonValHelper(jfo);
+  if (res != jsonSuccess) {
+    return res;
+  }
+  if (val != val) {
+    // NaN is the only value that compares unequal to itself.
+    fprintf(jfo->fp, "\"nan\"");
+  } else if (val == HUGE_VAL) {
+    fprintf(jfo->fp, "\"inf\"");
+  } else if (val == -HUGE_VAL) {
+    fprintf(jfo->fp, "\"-inf\"");
+  } else {
+    fprintf(jfo->fp, "%lf", val);
+  }
+  return jsonSuccess;
+}
+
+#ifdef DO_JSON_TEST
+// compile with
+// gcc json.cc -Iinclude/ -DDO_JSON_TEST -o json_test
+// run with:
+// ./json_test
+// if something fails, it will print out the error
+// if it all works, print out "output matches reference"
+// Evaluates 'expr' once; on any result other than jsonSuccess prints the
+// error name to stderr and terminates the test program with exit status 1.
+#define JSONCHECK(expr)                                                                            \
+  do {                                                                                             \
+    const jsonResult_t res = (expr);                                                               \
+    if (res != jsonSuccess) {                                                                      \
+      fprintf(stderr, "jsonError: %s\n", jsonErrorString(res));                                    \
+      exit(1);                                                                                     \
+    }                                                                                              \
+  } while (0)
+
+// Self-test: writes a small document to test.json, reads it back and
+// compares it byte-for-byte with the expected serialization.
+// Returns 0 on a match, 1 on any mismatch or I/O failure.
+int main() {
+
+  const char refstr[] =
+      "{\"number\":123,\"utfstring\":\"∮ E⋅da = Q,  n → ∞, ∑ f(i) = ∏ g(i), ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ "
+      "¬β = ¬(¬α ∨ β),\",\"list\":[\"true\",null,9423812381231,3123111,0.694234]}";
+
+  jsonFileOutput* jfo;
+  JSONCHECK(jsonInitFileOutput(&jfo, "test.json"));
+  JSONCHECK(jsonStartObject(jfo));
+  JSONCHECK(jsonKey(jfo, "number"));
+  JSONCHECK(jsonInt(jfo, 123));
+  JSONCHECK(jsonKey(jfo, "utfstring"));
+  JSONCHECK(
+      jsonStr(jfo, "∮ E⋅da = Q,  n → ∞, ∑ f(i) = ∏ g(i), ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β),"));
+  JSONCHECK(jsonKey(jfo, "list"));
+  JSONCHECK(jsonStartList(jfo));
+  JSONCHECK(jsonBool(jfo, true));
+  JSONCHECK(jsonNull(jfo));
+  JSONCHECK(jsonUint64(jfo, 9423812381231ULL));
+  JSONCHECK(jsonSize_t(jfo, 3123111));
+  JSONCHECK(jsonDouble(jfo, 0.69423413));
+  JSONCHECK(jsonFinishList(jfo));
+  JSONCHECK(jsonFinishObject(jfo));
+  JSONCHECK(jsonFinalizeFileOutput(jfo));
+
+  FILE* fp = fopen("test.json", "r");
+  if (fp == NULL) {
+    fprintf(stderr, "could not reopen test.json\n");
+    return 1;
+  }
+
+  // sizeof(refstr) counts the terminating NUL, which is never written to
+  // the file, so the expected file length is one byte less.
+  const size_t reflen = sizeof(refstr) - 1;
+
+  // One spare byte so an over-long file is detected, plus one that stays
+  // zero so the buffer can always be printed with %s.
+  char buffer[sizeof(refstr) + 1];
+  memset(buffer, 0, sizeof(buffer));
+
+  const size_t nread = fread(buffer, sizeof(char), reflen + 1, fp);
+
+  fclose(fp);
+
+  if (nread == reflen && memcmp(buffer, refstr, reflen) == 0) {
+    printf("output matches reference\n");
+  } else {
+    printf("output    %s\nreference %s\n", buffer, refstr);
+    return 1;
+  }
+
+  return 0;
+}
+
+#endif
@@ -0,0 +1,83 @@
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stddef.h>
+
+// Writer nesting states kept on the jsonFileOutput state stack.
+typedef enum {
+  JSON_NONE, // A pseudo-state meaning that the document is empty
+  JSON_KEY,          // A key was just written; a value must follow
+  JSON_OBJECT_EMPTY, // Inside an object with no members yet
+  JSON_OBJECT_SOME,  // Inside an object that already has a member
+  JSON_LIST_EMPTY,   // Inside a list with no items yet
+  JSON_LIST_SOME,    // Inside a list that already has an item
+} jsonState_t;
+
+// Result codes returned by every json* function; see jsonErrorString for
+// the printable names.
+typedef enum {
+  jsonSuccess,
+  jsonFileError,                 // Output file could not be opened
+  jsonUnknownStateError,         // Operation not valid in the current state
+  jsonEmptyStateError,           // State stack unexpectedly empty
+  jsonExpectedNonNoneStateError, // JSON_NONE passed where a real state is required
+  jsonStringOverflowError,       // Sanitized string does not fit its buffer
+  jsonStringBadChar,             // Unsupported control char or malformed UTF-8
+  jsonMemoryError,               // Allocation failed
+  jsonLockError,                 // pthread mutex operation failed
+} jsonResult_t;
+
+const char *jsonErrorString(jsonResult_t res);
+
+typedef struct jsonFileOutput jsonFileOutput;
+
+jsonResult_t jsonLockOutput(jsonFileOutput *jfo);
+
+jsonResult_t jsonUnlockOutput(jsonFileOutput *jfo);
+
+jsonResult_t jsonInitFileOutput(jsonFileOutput **jfo,
+                                const char *outfile);
+
+jsonResult_t jsonFinalizeFileOutput(jsonFileOutput *jfo);
+
+jsonResult_t jsonNewline(jsonFileOutput *jfo);
+jsonResult_t jsonFlushOutput(jsonFileOutput *jfo);
+
+// Emit a key and separator. Sanitize the key.
+// This is only acceptable if the top state is an object.
+// Emit a ',' separator if we aren't the first item.
+jsonResult_t jsonKey(jsonFileOutput *jfo, const char *name);
+
+// Start an object
+jsonResult_t jsonStartObject(jsonFileOutput *jfo);
+
+// Close an object
+jsonResult_t jsonFinishObject(jsonFileOutput *jfo);
+
+// Start a list
+jsonResult_t jsonStartList(jsonFileOutput *jfo);
+
+// Close a list
+jsonResult_t jsonFinishList(jsonFileOutput *jfo);
+
+// Emit a null value
+jsonResult_t jsonNull(jsonFileOutput *jfo);
+
+// Write a (sanitized) string
+jsonResult_t jsonStr(jsonFileOutput *jfo, const char *str);
+
+// Write a bool as "true" or "false" strings.
+jsonResult_t jsonBool(jsonFileOutput *jfo, bool val);
+
+// Write an integer value
+jsonResult_t jsonInt(jsonFileOutput *jfo, const int val);
+
+//Write an unsigned int value
+jsonResult_t jsonUint32(jsonFileOutput *jfo, const uint32_t val);
+
+// Write an unsigned 64-bit integer value
+jsonResult_t jsonUint64(jsonFileOutput *jfo, const uint64_t val);
+
+// Write a size_t value
+jsonResult_t jsonSize_t(jsonFileOutput *jfo, const size_t val);
+
+// Write a double value
+jsonResult_t jsonDouble(jsonFileOutput *jfo, const double val);
@@ -0,0 +1,73 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef COMMON_H_
+#define COMMON_H_
+
+/* typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; */
+/* typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; */
+
+/* Data types */
+/* Several names are aliases sharing one value (e.g. ncclInt8/ncclChar);
+ * ncclNumTypes is the count of distinct values, not a data type itself. */
+typedef enum { ncclInt8       = 0, ncclChar       = 0,
+               ncclUint8      = 1,
+               ncclInt32      = 2, ncclInt        = 2,
+               ncclUint32     = 3,
+               ncclInt64      = 4,
+               ncclUint64     = 5,
+               ncclFloat16    = 6, ncclHalf       = 6,
+               ncclFloat32    = 7, ncclFloat      = 7,
+               ncclFloat64    = 8, ncclDouble     = 8,
+               ncclBfloat16   = 9,
+               ncclFloat8e4m3 = 10,
+               ncclFloat8e5m2 = 11,
+               ncclNumTypes   = 12
+} ncclDataType_t;
+
+/* Log levels passed as the first argument of ncclDebugLogger_t. */
+typedef enum {
+  NCCL_LOG_NONE = 0,
+  NCCL_LOG_VERSION = 1,
+  NCCL_LOG_WARN = 2,
+  NCCL_LOG_INFO = 3,
+  NCCL_LOG_ABORT = 4,
+  NCCL_LOG_TRACE = 5
+} ncclDebugLogLevel;
+
+/* Result codes; ncclSuccess is 0 and ncclNumResults is the number of
+ * codes, not a result itself. */
+typedef enum { ncclSuccess                 =  0,
+               ncclUnhandledCudaError      =  1,
+               ncclSystemError             =  2,
+               ncclInternalError           =  3,
+               ncclInvalidArgument         =  4,
+               ncclInvalidUsage            =  5,
+               ncclRemoteError             =  6,
+               ncclInProgress              =  7,
+               ncclNumResults              =  8 } ncclResult_t;
+
+
+/* Logging subsystems. Each entry is a distinct single bit so values can be
+ * OR-ed into a mask; NCCL_ALL has every bit set. */
+typedef enum {
+  NCCL_INIT = 0x1,
+  NCCL_COLL = 0x2,
+  NCCL_P2P = 0x4,
+  NCCL_SHM = 0x8,
+  NCCL_NET = 0x10,
+  NCCL_GRAPH = 0x20,
+  NCCL_TUNING = 0x40,
+  NCCL_ENV = 0x80,
+  NCCL_ALLOC = 0x100,
+  NCCL_CALL = 0x200,
+  NCCL_PROXY = 0x400,
+  NCCL_NVLS = 0x800,
+  NCCL_BOOTSTRAP = 0x1000,
+  NCCL_REG = 0x2000,
+  NCCL_PROFILE = 0x4000,
+  NCCL_RAS = 0x8000,
+  NCCL_INSPECTOR = 0x100000, // big number to avoid short-term conflicts
+  NCCL_ALL = ~0
+} ncclDebugLogSubSys;
+
+
+/* printf-style logging callback: level, subsystem flags, source file and
+ * line of the call site, then the format string and its arguments. */
+typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
+
+#endif
@@ -0,0 +1,85 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_H_
+#define PROFILER_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "common.h"
+
+// Profiler event types. Each type occupies its own bit, so several types
+// can be combined into a single mask with bitwise OR.
+enum {
+  ncclProfileGroup          = (1 << 0),  // group event type
+  ncclProfileColl           = (1 << 1),  // host collective call event type
+  ncclProfileP2p            = (1 << 2),  // host point-to-point call event type
+  ncclProfileProxyOp        = (1 << 3),  // proxy operation event type
+  ncclProfileProxyStep      = (1 << 4),  // proxy step event type
+  ncclProfileProxyCtrl      = (1 << 5),  // proxy control event type
+  ncclProfileKernelCh       = (1 << 6),  // kernel channel event type
+  ncclProfileNetPlugin      = (1 << 7),  // network plugin-defined, events
+  ncclProfileGroupApi       = (1 << 8),  // Group API events
+  ncclProfileCollApi        = (1 << 9),  // Collective API events
+  ncclProfileP2pApi         = (1 << 10), // Point-to-Point API events
+  ncclProfileKernelLaunch   = (1 << 11), // Kernel launch events
+};
+
+// Event states reported through recordEventState.
+// Note the enumerators are grouped by topic, not listed in numeric order:
+// the _v4 additions (19, 20) sit next to the states they relate to.
+typedef enum {
+  ncclProfilerProxyOpSendPosted        = 0,  // deprecated in v4
+  ncclProfilerProxyOpSendRemFifoWait   = 1,  // deprecated in v4
+  ncclProfilerProxyOpSendTransmitted   = 2,  // deprecated in v4
+  ncclProfilerProxyOpSendDone          = 3,  // deprecated in v4
+  ncclProfilerProxyOpRecvPosted        = 4,  // deprecated in v4
+  ncclProfilerProxyOpRecvReceived      = 5,  // deprecated in v4
+  ncclProfilerProxyOpRecvTransmitted   = 6,  // deprecated in v4
+  ncclProfilerProxyOpRecvDone          = 7,  // deprecated in v4
+  ncclProfilerProxyOpInProgress_v4     = 19,
+
+  /* Legacy proxy profiler states */
+  ncclProfilerProxyStepSendGPUWait     = 8,
+  ncclProfilerProxyStepSendPeerWait_v4 = 20,
+  ncclProfilerProxyStepSendWait        = 9,
+  ncclProfilerProxyStepRecvWait        = 10,
+  ncclProfilerProxyStepRecvFlushWait   = 11,
+  ncclProfilerProxyStepRecvGPUWait     = 12,
+
+  /* Legacy proxy control states */
+  ncclProfilerProxyCtrlIdle            = 13,
+  ncclProfilerProxyCtrlActive          = 14,
+  ncclProfilerProxyCtrlSleep           = 15,
+  ncclProfilerProxyCtrlWakeup          = 16,
+  ncclProfilerProxyCtrlAppend          = 17,
+  ncclProfilerProxyCtrlAppendEnd       = 18,
+
+  /* Network defined events states */
+  ncclProfilerNetPluginUpdate          = 21,
+
+  /* Kernel event states */
+  ncclProfilerKernelChStop             = 22,
+
+  /* Group API States */
+  ncclProfilerEndGroupApiStart         = 23,
+  ncclProfilerBeginGroupApiEnd         = 24
+} ncclProfilerEventState_t;
+
+// Every versioned profiler interface shares the same event-state enum.
+typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
+typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
+typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
+typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t;
+typedef ncclProfilerEventState_t ncclProfilerEventState_v5_t;
+
+#include "profiler_v5.h"
+#include "profiler_v4.h"
+#include "profiler_v3.h"
+#include "profiler_v2.h"
+#include "profiler_v1.h"
+#include "profiler_net.h"
+
+// Unversioned names resolve to the v5 interface.
+typedef ncclProfiler_v5_t ncclProfiler_t;
+typedef ncclProfilerEventDescr_v5_t ncclProfilerEventDescr_t;
+typedef ncclProfilerEventStateArgs_v5_t ncclProfilerEventStateArgs_t;
+
+#endif // end include guard
@@ -0,0 +1,19 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_NET_H_
+#define PROFILER_NET_H_
+
+#define NCCL_PROFILER_NET_VER_BITS  (16)  // width of the low bit-field; presumably a version number (per the name) - confirm
+#define NCCL_PROFILER_NET_VER_MASK  (~0U >> NCCL_PROFILER_NET_VER_BITS)  // all-ones shifted right: 0x0000FFFF when unsigned is 32 bits
+#define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS)  // all-ones shifted left: clears the low 16 bits
+
+typedef enum {  // type identifiers are placed above the low NCCL_PROFILER_NET_VER_BITS bits
+  NCCL_PROFILER_NET_TYPE_IB   = (1U << NCCL_PROFILER_NET_VER_BITS),  // 1 << 16 == 0x10000
+  NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS),  // 2 << 16 == 0x20000
+} ncclProfilerNetType;
+
+#endif
@@ -0,0 +1,112 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V1_H_
+#define PROFILER_V1_H_
+
+#include <stdint.h>
+#include <stddef.h>
+#include <sys/types.h>
+
+
+typedef struct {                // v1 event descriptor; startEvent() receives a pointer to one as eDescr
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
+  int rank;                     // originating rank
+  union {                       // anonymous union: the payload structs below share storage
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      uint8_t func;             // v1 holds func/datatype/algo/proto as small integers, not strings
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      uint8_t datatype;
+      uint32_t op;
+      size_t trafficBytes;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      uint8_t algo;
+      uint8_t proto;
+      int isCollnet;            // NOTE(review): int used as a flag, presumably 0/1 - confirm
+      int isNvls;               // NOTE(review): int used as a flag, presumably 0/1 - confirm
+    } coll;                     // collective payload (assumed to pair with ncclProfileColl - confirm)
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint8_t func;
+      void* buff;
+      uint8_t datatype;
+      size_t count;
+      int peer;
+    } p2p;                      // point-to-point payload (assumed from the member name - confirm)
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;               // NOTE(review): presumably non-zero for send, zero for recv - confirm
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+  };
+} ncclProfilerEventDescr_v1_t;
+
+typedef union {                 // optional arguments for recordEventState() (its eStateArgs parameter)
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;                    // NOTE(review): presumably used with the ncclProfilerProxyOp* states - confirm
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;                  // NOTE(review): presumably used with the ncclProfilerProxyCtrl* states - confirm
+} ncclProfilerEventStateArgs_v1_t;
+
+typedef struct {                // v1 profiler plugin interface: a name plus five callbacks
+  const char* name;             // NOTE(review): presumably the plugin's name - confirm
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_v1_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v1_t;
+
+#endif
@@ -0,0 +1,108 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V2_H_
+#define PROFILER_V2_H_
+
+#include <stdint.h>
+#include <stddef.h>
+#include <sys/types.h>
+
+typedef struct {                // v2 event descriptor; startEvent() receives a pointer to one as eDescr
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
+  int rank;                     // originating rank
+  union {                       // anonymous union: the payload structs below share storage
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      const char* func;         // v2 carries func/datatype/algo/proto as C strings
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      size_t trafficBytes;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+    } coll;                     // collective payload (assumed to pair with ncclProfileColl - confirm)
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+    } p2p;                      // point-to-point payload (assumed from the member name - confirm)
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;               // NOTE(review): presumably non-zero for send, zero for recv - confirm
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+  };
+} ncclProfilerEventDescr_v2_t;
+
+typedef union {                 // optional arguments for recordEventState() (its eStateArgs parameter)
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;                    // NOTE(review): presumably used with the ncclProfilerProxyOp* states - confirm
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;                  // NOTE(review): presumably used with the ncclProfilerProxyCtrl* states - confirm
+} ncclProfilerEventStateArgs_v2_t;
+
+typedef struct {                // v2 profiler plugin interface: a name plus five callbacks
+  const char* name;             // NOTE(review): presumably the plugin's name - confirm
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_v2_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v2_t;
+
+#endif
@@ -0,0 +1,116 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V3_H_
+#define PROFILER_V3_H_
+
+#include <stdint.h>
+#include <stddef.h>
+#include <sys/types.h>
+
+typedef struct {                // v3 event descriptor; startEvent() receives a pointer to one as eDescr
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
+  int rank;                     // originating rank
+  union {                       // anonymous union: the payload structs below share storage
+    struct {
+      const char* name;
+      uint64_t commHash;
+      uint64_t seqNumber;
+      const char* func;         // func/datatype/algo/proto are C strings in this version
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      uint8_t nMaxChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+    } coll;                     // collective payload (assumed to pair with ncclProfileColl - confirm)
+
+    struct {
+      const char* name;
+      uint64_t commHash;
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+    } p2p;                      // point-to-point payload (assumed from the member name - confirm)
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;               // NOTE(review): presumably non-zero for send, zero for recv - confirm
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+
+    struct {
+      uint8_t channelId;
+    } kernelCh;                 // payload member not present in the v1/v2 descriptors
+
+    struct {
+      int64_t id;
+      void* data;
+    } netPlugin;                // payload member not present in the v1/v2 descriptors
+  };
+} ncclProfilerEventDescr_v3_t;
+
+typedef union {                 // optional arguments for recordEventState() (its eStateArgs parameter)
+  struct {
+    size_t transSize;
+    int steps;
+  } proxyOp;                    // NOTE(review): presumably used with the ncclProfilerProxyOp* states - confirm
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;                  // NOTE(review): presumably used with the ncclProfilerProxyCtrl* states - confirm
+} ncclProfilerEventStateArgs_v3_t;
+
+typedef struct {                // v3 profiler plugin interface: a name plus five callbacks
+  const char* name;             // NOTE(review): presumably the plugin's name - confirm
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_v3_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v3_t;
+
+#endif
@@ -0,0 +1,127 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V4_H_
+#define PROFILER_V4_H_
+
+#include <stdint.h>
+#include <stddef.h>
+#include <sys/types.h>
+
+typedef struct {                // v4 event descriptor; startEvent() receives a pointer to one as eDescr
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
+  int rank;                     // originating rank
+  union {                       // anonymous union: the payload structs below share storage
+    struct {                    // no name/commHash members here; the v4 init() takes commName and commHash
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      uint8_t nChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+    } coll;                     // collective payload (assumed to pair with ncclProfileColl - confirm)
+
+    struct {
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+      uint8_t nChannels;
+    } p2p;                      // point-to-point payload (assumed from the member name - confirm)
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;               // NOTE(review): presumably non-zero for send, zero for recv - confirm
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+
+    struct {
+      uint8_t channelId;
+      uint64_t pTimer;          // start timestamp from GPU globaltimer
+    } kernelCh;
+
+    struct {
+      int64_t id;
+      void* data;
+    } netPlugin;
+  };
+} ncclProfilerEventDescr_v4_t;
+
+typedef union {                 // optional arguments for recordEventState() (its eStateArgs parameter)
+  struct {
+    size_t transSize;
+  } proxyStep;                  // v4 has proxyStep{transSize} where v1-v3 had proxyOp{transSize, steps}
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+
+  struct {
+    void* data;
+  } netPlugin;
+
+  struct {
+    uint64_t pTimer;            // NOTE(review): presumably a GPU globaltimer value like the descriptor's pTimer - confirm
+  } kernelCh;
+} ncclProfilerEventStateArgs_v4_t;
+
+typedef struct {                // v4 profiler plugin interface: a name plus five callbacks
+  const char* name;             // NOTE(review): presumably the plugin's name - confirm
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  //  - commName       : user assigned communicator name
+  //  - commHash       : communicator id
+  //  - nNodes         : number of nodes in communicator
+  //  - nranks         : number of ranks in communicator
+  //  - rank           : rank identifier in communicator
+  //  - logfn          : logger function
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_v4_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v4_t;
+
+#endif
@@ -0,0 +1,151 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V5_H_
+#define PROFILER_V5_H_
+
+typedef struct {                // v5 event descriptor. NOTE(review): this header has no #include of its own; bool, pid_t, size_t and the fixed-width ints must come from the includer
+  uint64_t type;                // event type descriptor: ncclProfileColl, ... (uint64_t here; uint8_t in v1-v4)
+  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
+  int rank;                     // originating rank
+  union {                       // anonymous union: the payload structs below share storage
+    struct {
+      bool graphCaptured;
+      int groupDepth;
+    } groupApi;                 // payload member not present in the v4 descriptor
+
+    struct {
+      const char* func;
+      size_t count;
+      const char* datatype;
+      int root;
+      void* stream;
+      bool graphCaptured;
+    } collApi;                  // payload member not present in the v4 descriptor
+
+    struct {
+      const char* func;
+      size_t count;
+      const char* datatype;
+      void* stream;
+      bool graphCaptured;
+    } p2pApi;                   // payload member not present in the v4 descriptor
+
+    struct {
+      void* stream;
+    } kernelLaunch;             // payload member not present in the v4 descriptor
+
+    struct {
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      uint8_t nChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+      void* parentGroup; // for backward compatibility with v4
+    } coll;                     // same members as the v4 coll payload, plus parentGroup
+
+    struct {
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+      uint8_t nChannels;
+      void* parentGroup; // for backward compatibility with v4
+    } p2p;                      // same members as the v4 p2p payload, plus parentGroup
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;               // NOTE(review): presumably non-zero for send, zero for recv - confirm
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+
+    struct {
+      uint8_t channelId;
+      uint64_t pTimer;          // start timestamp from GPU globaltimer
+    } kernelCh;
+
+    struct {
+      int64_t id;
+      void* data;
+    } netPlugin;
+  };
+} ncclProfilerEventDescr_v5_t;
+
+typedef union {                 // optional arguments for recordEventState(); same members as the v4 union
+  struct {
+    size_t transSize;
+  } proxyStep;
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+
+  struct {
+    void* data;
+  } netPlugin;
+
+  struct {
+    uint64_t pTimer;            // NOTE(review): presumably a GPU globaltimer value like the descriptor's pTimer - confirm
+  } kernelCh;
+} ncclProfilerEventStateArgs_v5_t;
+
+typedef struct {                // v5 profiler plugin interface: a name plus five callbacks
+  const char* name;             // NOTE(review): presumably the plugin's name - confirm
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  //  - commId         : communicator id (second argument here; v4 passed commHash after commName)
+  //  - commName       : user assigned communicator name
+  //  - nNodes         : number of nodes in communicator
+  //  - nranks         : number of ranks in communicator
+  //  - rank           : rank identifier in communicator
+  //  - logfn          : logger function
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_v5_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v5_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v5_t eState, ncclProfilerEventStateArgs_v5_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v5_t;
+
+#endif
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NCCL_TYPES_H_
+#define NCCL_TYPES_H_
+
+/* Data types */
+typedef enum { ncclInt8       = 0, ncclChar       = 0,  // two names on one line share one value
+               ncclUint8      = 1,
+               ncclInt32      = 2, ncclInt        = 2,
+               ncclUint32     = 3,
+               ncclInt64      = 4,
+               ncclUint64     = 5,
+               ncclFloat16    = 6, ncclHalf       = 6,
+               ncclFloat32    = 7, ncclFloat      = 7,
+               ncclFloat64    = 8, ncclDouble     = 8,
+               ncclBfloat16   = 9,  // last value defined in this header
+} ncclDataType_t;
+
+#endif
@@ -0,0 +1,12 @@
+#ifndef VERSION_H
+#define VERSION_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+const char* get_git_version_info(void);  /* returns a C string; (void) makes this a real prototype when compiled as C */
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif // VERSION_H
@@ -0,0 +1,803 @@
+diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc
+index 9bfd8dcf..4d3f0a08 100644
+--- a/src/transport/net_ib.cc
+++ b/src/transport/net_ib.cc
+@@ -29,6 +29,7 @@
+ 
+ #include "ibvwrap.h"
+ #include "mlx5/mlx5dvwrap.h"
+#include "ionic/ionicdvwrap.h"
+ #include "graph/xml.h"
+ 
+ #define MAXSUFFIXSIZE 16
+@@ -110,16 +111,38 @@ struct ncclIbMergedDev ncclIbMergedDevs[MAX_IB_VDEVS];
+ struct ncclIbDev ncclIbDevs[MAX_IB_DEVS];
+ static std::mutex ncclIbMutex;
+ static int ncclIbRelaxedOrderingEnabled = 0;
+static bool rcclAinicRoce = 0;  // set in ncclIbInit: true iff rcclParamAinicRoce() == 1
+static bool rcclCtsInlineData = 0;  // assigned only when rcclAinicRoce: false iff rcclParamCtsInlineData() == 0
+static bool rcclCtsOffloadEnabled = 0;  // assigned only when rcclAinicRoce: false iff rcclParamCtsOffloadEnabled() == 0
+static bool ncclIbUseInline = 0;  // cached ncclParamIbUseInline(); forced to true when rcclAinicRoce
+static int ncclIbGdrFlushDisable = 0;  // cached ncclParamIbGdrFlushDisable(); forced to 1 when rcclAinicRoce
+
+enum ncclIbChannelType {  // second index of nccl_channel_ud_map / nccl_channel_last_ud
+  ncclIbChannelTypeCts  = 0,  // selected by ncclIbCreateQp when data_qp is false
+  ncclIbChannelTypeData = 1,  // selected by ncclIbCreateQp when data_qp is true
+  ncclIbChannelTypeMax  = 2  // number of channel types; used as an array bound
+};
+
+struct ncclChannelToUd {  // one record per (channel id, channel type) slot of nccl_channel_ud_map
+    int channelId;  // NOTE(review): not read or written in the visible part of this patch - confirm it is used
+    bool udId;  // ncclIbCreateQp: true -> IONIC_UDMA_MASK_HIGH, false -> IONIC_UDMA_MASK_LOW
+    bool udAllocated;  // set to true once udId has been assigned, so the slot is assigned only once
+};
+
+static ncclChannelToUd nccl_channel_ud_map[MAXCHANNELS][ncclIbChannelTypeMax];  // indexed [channel_id][channel_type] in ncclIbCreateQp; NOTE(review): channel_id is not range-checked there
+static bool nccl_channel_last_ud[MAX_IB_DEVS][ncclIbChannelTypeMax];  // indexed [ibDevN][channel_type]; copied into the next unassigned slot's udId, then toggled
+ 
+ // With ncclNet_v11_t the NCCL core initializes the network plugin per-communicator
+ // rather than once for all communicators. However, the internal plugin implementation
+ // still assumes the plugin is initialized only once across all communicators. The ref
+ // counter makes sure the plugin internally initializes only once. When per communicator
+ // context support is added to the plugin the ref counter can be removed.
+ static int netRefCount;
+ 
+ #define NCCL_IB_LLSTR(ll) (((ll) == IBV_LINK_LAYER_INFINIBAND) ? "IB" : (((ll) == IBV_LINK_LAYER_ETHERNET) ? "RoCE" : "UNSPECIFIED"))
+ 
+#define NCCL_CTS_QP_SLOT_INVALID 0xFF  // passed as cts_qp_slot (an int8_t parameter) for the QPs created in ncclIbConnect
+
+ #define NCCL_IB_SL_DEFAULT 0
+ #define NCCL_IB_TC_DEFAULT 0
+ 
+@@ -141,6 +164,13 @@ NCCL_PARAM(IbEceEnable,"IB_ECE_ENABLE",1);
+ NCCL_PARAM(IbDataDirect,"IB_DATA_DIRECT",1);
+ NCCL_PARAM(IbQpsPerConn, "IB_QPS_PER_CONNECTION", 1);
+ RCCL_PARAM(IbQpsPerP2p, "IB_QPS_PER_P2P", 0);
+NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0);
+
+// AMD AINIC
+RCCL_PARAM(CtsInlineData, "CTS_INLINE_DATA", -1);
+RCCL_PARAM(CtsOffloadEnabled, "CTS_OFFLOAD_ENABLED", -1);
+
+extern int64_t rcclParamAinicRoce();
+ 
+ static ncclResult_t ncclIbStatsInit(struct ncclIbStats* stat) {
+   __atomic_store_n(&stat->fatalErrorCount, 0, __ATOMIC_RELAXED);
+@@ -779,6 +809,10 @@ ncclResult_t ncclIbInit(void** ctx, uint64_t commId, ncclNetCommConfig_t* config
+   static int shownIbHcaEnv = 0;
+   if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
+   if(wrap_mlx5dv_symbols() != ncclSuccess) { INFO(NCCL_NET, "NET/IB : Failed to open mlx5dv symbols. Advance features like CX-8 Direct-NIC will be disabled."); }
+  if(wrap_ionicdv_symbols() != ncclSuccess) {
+    WARN("NET/IB : Failed to open ionicdv symbols. Advance features like AINIC UD load balancing will be disabled.");
+    return ncclInternalError;
+  }
+ 
+   // Detect IB cards
+   int nIbDevs = 0;
+@@ -944,6 +978,23 @@ ncclResult_t ncclIbInit(void** ctx, uint64_t commId, ncclNetCommConfig_t* config
+     INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? "[RO]" : "",
+           ncclIbIfName, ncclSocketToString(&ncclIbIfAddr, addrline));
+ 
+    ncclIbUseInline = ncclParamIbUseInline();
+    ncclIbGdrFlushDisable = ncclParamIbGdrFlushDisable();
+
+    rcclAinicRoce = ((rcclParamAinicRoce() == 1) ? true : false);
+    if (rcclAinicRoce) {
+      // for AINIC, these params are defaulted to enabled unless user forces it to disable(0).
+      rcclCtsInlineData = ((rcclParamCtsInlineData() == 0) ? false : true);
+      rcclCtsOffloadEnabled = ((rcclParamCtsOffloadEnabled() == 0) ? false : true);
+      // for AINIC IbUseInline is enabled by default always
+      ncclIbUseInline = true;
+      // for AINIC GDR flush is disabled by default
+      ncclIbGdrFlushDisable = 1;
+
+      INFO(NCCL_INIT|NCCL_NET, "NET/IB : AINIC RoCEv2 optimizations enabled: CTS Inline Data: %s; CTS Offload: %s; "
+           "IB Use Inline: enabled; GDR Flush: disabled", rcclCtsInlineData ? "Enabled": "Disabled",
+           rcclCtsOffloadEnabled ? "Enabled": "Disabled");
+    }
+   }
+ exit:
+   ibContext.trafficClass = config->trafficClass;
+@@ -1271,6 +1322,8 @@ struct ncclIbListenComm {
+   struct ncclIbCommStage stage;
+ };
+ 
+#define MAX_INLINE_DATA_SIZE 24
+
+ struct alignas(64) ncclIbSendFifo {
+   uint64_t addr;
+   uint64_t size;
+@@ -1281,10 +1334,21 @@ struct alignas(64) ncclIbSendFifo {
+   char padding[16];
+ };
+ 
+struct alignas(32) ncclIbSendFifoCtsInline {  // FIFO element registered and advertised instead of ncclIbSendFifo when rcclCtsInlineData is set
+  uint64_t addr;
+  uint32_t rkeys[1];  // a single rkey slot
+  int size;
+  uint8_t nreqs;
+  uint16_t tag;
+  uint32_t idx;
+  char padding[9];  // members above total 8+4+4+1+2+4 = 23 bytes (4-byte int); +9 gives 32, as the static_assert requires
+} __attribute__((packed));  // packed: no implicit padding is inserted between members
+
+ struct ncclIbQp {
+   struct ibv_qp* qp;
+   int devIndex;
+   int remDevIdx;
+  int8_t ctsQpSlot;  // written by ncclIbCreateQp only when rcclAinicRoce. NOTE(review): NCCL_CTS_QP_SLOT_INVALID is 0xFF (255), which does not fit in int8_t; it is typically stored as -1, so `ctsQpSlot == NCCL_CTS_QP_SLOT_INVALID` would be false - verify the comparisons
+ };
+ 
+ struct ncclIbRemSizesFifo {
+@@ -1331,6 +1395,7 @@ struct ncclIbSendComm {
+   struct ncclIbNetCommBase base;
+   // Start with fifo and ibv structs as they have alignment restrictions
+   struct ncclIbSendFifo fifo[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
+  struct ncclIbSendFifoCtsInline fifo_inline[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
+   struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS];
+   struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS + 1];
+   // Each dev correlates to a mergedIbDev
+@@ -1346,6 +1411,7 @@ struct ncclIbSendComm {
+ static_assert((sizeof(struct ncclIbNetCommBase) % 32) == 0, "ncclIbNetCommBase size must be 32-byte multiple to ensure fifo is at proper offset");
+ static_assert((offsetof(struct ncclIbSendComm, fifo) % 32) == 0, "ncclIbSendComm fifo must be 32-byte aligned");
+ static_assert((sizeof(struct ncclIbSendFifo) % 32) == 0, "ncclIbSendFifo element size must be 32-byte multiples");
+static_assert((sizeof(struct ncclIbSendFifoCtsInline) % 32) == 0, "ncclIbSendFifoCtsInline element size must be 32-byte multiples");
+ static_assert((offsetof(struct ncclIbSendComm, sges) % 32) == 0, "sges must be 32-byte aligned");
+ static_assert((offsetof(struct ncclIbSendComm, wrs) % 32) == 0, "wrs must be 32-byte aligned");
+ 
+@@ -1360,6 +1426,7 @@ struct ncclIbGpuFlush {
+ 
+ struct ncclIbRemFifo {
+   struct ncclIbSendFifo elems[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
+  struct ncclIbSendFifoCtsInline elems_cts_inline[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
+   uint64_t fifoTail;
+   uint64_t addr;
+   uint32_t flags;
+@@ -1415,20 +1482,59 @@ ncclResult_t ncclIbDestroyBase(struct ncclIbNetCommDevBase* base) {
+   return ncclSuccess;
+ }
+ 
+-ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, int access_flags, void* qp_context, struct ncclIbQp* qp) {
+ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base,
+                            int access_flags, void* qp_context, struct ncclIbQp* qp,
+                            int channel_id, bool data_qp, int8_t cts_qp_slot) {
+   struct ibv_qp_init_attr qpInitAttr;
+  enum ncclIbChannelType channel_type = (data_qp ? ncclIbChannelTypeData : ncclIbChannelTypeCts);
+   memset(&qpInitAttr, 0, sizeof(struct ibv_qp_init_attr));
+   qpInitAttr.qp_context = qp_context;
+   qpInitAttr.send_cq = base->cq;
+   qpInitAttr.recv_cq = base->cq;
+   qpInitAttr.qp_type = IBV_QPT_RC;
+
+  if (rcclAinicRoce) {
+    if (!nccl_channel_ud_map[channel_id][channel_type].udAllocated) {
+      bool lud = nccl_channel_last_ud[base->ibDevN][channel_type];
+      nccl_channel_ud_map[channel_id][channel_type].udId = lud;
+      nccl_channel_ud_map[channel_id][channel_type].udAllocated = true;
+      nccl_channel_last_ud[base->ibDevN][channel_type] =
+          !(nccl_channel_last_ud[base->ibDevN][channel_type]);
+    }
+    if (nccl_channel_ud_map[channel_id][channel_type].udId) {
+        wrap_ionicdv_pd_set_udma_mask(base->pd, IONIC_UDMA_MASK_HIGH);
+    } else {
+        wrap_ionicdv_pd_set_udma_mask(base->pd, IONIC_UDMA_MASK_LOW);
+    }
+    qpInitAttr.sq_sig_all |= (1 << 16);
+    if (data_qp) {
+      qpInitAttr.sq_sig_all |= (1 << 17);
+    } else {
+      qpInitAttr.sq_sig_all &= (~(1 << 17));
+    }
+    qpInitAttr.sq_sig_all |= (1 << 18);
+
+    if (rcclCtsOffloadEnabled) {
+      qpInitAttr.sq_sig_all |= (1 << 19);
+    } else {
+      qpInitAttr.sq_sig_all &= (~(1 << 19));
+    }
+  }
+
+   // We might send 2 messages per send (RDMA and RDMA_WITH_IMM)
+   qpInitAttr.cap.max_send_wr = 2*MAX_REQUESTS;
+   qpInitAttr.cap.max_recv_wr = MAX_REQUESTS;
+   qpInitAttr.cap.max_send_sge = 1;
+   qpInitAttr.cap.max_recv_sge = 1;
+-  qpInitAttr.cap.max_inline_data = ncclParamIbUseInline() ? sizeof(struct ncclIbSendFifo) : 0;
+  if (rcclCtsInlineData) {
+    qpInitAttr.cap.max_inline_data = MAX_INLINE_DATA_SIZE;
+  } else {
+    qpInitAttr.cap.max_inline_data = ncclIbUseInline ? sizeof(struct ncclIbSendFifo) : 0;
+  }
+   NCCLCHECK(wrap_ibv_create_qp(&qp->qp, base->pd, &qpInitAttr));
+  if (rcclAinicRoce) {
+    NCCLCHECK(wrap_ionicdv_qp_set_gda(qp->qp, false, true));
+  }
+   struct ibv_qp_attr qpAttr;
+   memset(&qpAttr, 0, sizeof(struct ibv_qp_attr));
+   qpAttr.qp_state = IBV_QPS_INIT;
+@@ -1438,6 +1544,9 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base,
+   NCCLCHECK(wrap_ibv_modify_qp(qp->qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS));
+   TRACE(NCCL_NET, "NET/IB : ncclIbCreateQp port=%d dev=%d devName=%s ndevs=%d nmdevs=%d qpn=%u pkey=%u pd=%p",
+     ib_port, base->ibDevN, ncclIbDevs[base->ibDevN].devName, ncclNIbDevs, ncclNMergedIbDevs, qp->qp->qp_num, qpAttr.pkey_index, base->pd);
+  if (rcclAinicRoce) {
+    qp->ctsQpSlot = cts_qp_slot;
+  }
+   return ncclSuccess;
+ }
+ 
+@@ -1521,7 +1630,7 @@ fail:
+   goto exit;
+ }
+ 
+-ncclResult_t ncclIbConnect(void* ctx, int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
+ncclResult_t ncclIbConnect(void* ctx, int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
+   ncclResult_t ret = ncclSuccess;
+   struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
+   struct ncclIbCommStage* stage = &handle->stage;
+@@ -1529,8 +1638,13 @@ ncclResult_t ncclIbConnect(void* ctx, int dev, void* opaqueHandle, void** sendCo
+   int ready;
+   uint8_t link_layer = IBV_LINK_LAYER_UNSPECIFIED;
+   int isP2p = 0; 
+  int channel_id = 0;
+   *sendComm = NULL;
+ 
+  if (rcclAinicRoce) {
+    channel_id = ((ncclNet_ctxt_t *)sendDevComm)->chId;
+  }
+
+   if (stage->state == ncclIbCommStateConnect)      goto ib_connect_check;
+   if (stage->state == ncclIbCommStateSendDevList)  goto ib_send_dev_list;
+   if (stage->state == ncclIbCommStateRecvDevList)  goto ib_recv_dev_list;
+@@ -1612,7 +1726,7 @@ ib_recv_dev_list:
+   for (int q = 0; q < comm->base.nqps; q++) {
+     ncclIbSendCommDev* commDev = comm->devs + devIndex;
+     ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN;
+-    NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &commDev->base, IBV_ACCESS_REMOTE_WRITE, &comm->base.stats, comm->base.qps + q), ret, fail);
+    NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &commDev->base, IBV_ACCESS_REMOTE_WRITE, &comm->base.stats, comm->base.qps + q, channel_id, true, NCCL_CTS_QP_SLOT_INVALID), ret, fail);
+     comm->base.qps[q].devIndex = devIndex;
+     meta.qpInfo[q].qpn      = comm->base.qps[q].qp->qp_num;
+     meta.qpInfo[q].devIndex = comm->base.qps[q].devIndex;
+@@ -1637,7 +1751,11 @@ ib_recv_dev_list:
+     devInfo->lid           = ibDev->portAttr.lid;
+     devInfo->ibv_dev_index = commDev->base.ibDevN;
+     // Prepare my fifo
+-    NCCLCHECKGOTO(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
+    if (rcclCtsInlineData) {
+      NCCLCHECKGOTO(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo_inline, sizeof(struct ncclIbSendFifoCtsInline)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
+    } else {
+      NCCLCHECKGOTO(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
+    }
+     devInfo->fifoRkey = commDev->fifoMr->rkey;
+ 
+     // Pack local GID info
+@@ -1680,7 +1798,11 @@ ib_recv_dev_list:
+     }
+   }
+   config = (ncclNetCommConfig_t*)ctx;
+-  meta.fifoAddr = (uint64_t)comm->fifo;
+  if (rcclCtsInlineData) {
+    meta.fifoAddr = (uint64_t)comm->fifo_inline;
+  } else {
+    meta.fifoAddr = (uint64_t)comm->fifo;
+  }
+   meta.sl = (ncclParamIbSl() != -1) ? ncclParamIbSl() : (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF) ? config->trafficClass : NCCL_IB_SL_DEFAULT;
+   meta.tc = (ncclParamIbTc() != -1) ? ncclParamIbTc() : (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF) ? config->trafficClass : NCCL_IB_TC_DEFAULT;
+   strncpy(meta.devName, mergedDev->devName, MAX_MERGED_DEV_NAME);
+@@ -1825,18 +1947,22 @@ ncclResult_t ncclIbCheckVProps(ncclNetVDeviceProps_t* vProps1, ncclNetVDevicePro
+   return ncclSuccess;
+ }
+ 
+-NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0);
+ RCCL_PARAM(IbGdrFlushGpuMemNoRelaxedOrdering, "GDR_FLUSH_GPU_MEM_NO_RELAXED_ORDERING", 1);
+ 
+-ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
+ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** recvDevComm) {
+   ncclResult_t ret = ncclSuccess;
+   struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm;
+   struct ncclIbCommStage* stage = &lComm->stage;
+   struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm;
+   int ready;
+   int link_layer = IBV_LINK_LAYER_UNSPECIFIED;
+  int channel_id = 0;
+   *recvComm = NULL;
+ 
+  if (rcclAinicRoce) {
+    channel_id = ((ncclNet_ctxt_t *) recvDevComm)->chId;
+  }
+
+   if (stage->state == ncclIbCommStateAccept)   goto ib_accept_check;
+   if (stage->state == ncclIbCommStateRecvDevList) goto ib_recv_dev_list;
+   if (stage->state == ncclIbCommStateSendDevList) goto ib_send_dev_list;
+@@ -1966,7 +2092,7 @@ ib_recv:
+     // Local ibDevN
+     ibDevN = rComm->devs[devIndex].base.ibDevN;
+     ibDev = ncclIbDevs + ibDevN;
+-    NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, qp), ret, fail);
+    NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, qp, channel_id, false, q), ret, fail);
+     qp->devIndex = devIndex;
+     devIndex = (devIndex + 1) % rComm->base.vProps.ndevs;
+ 
+@@ -1992,16 +2118,22 @@ ib_recv:
+ 
+   useDmaBuf  = (ncclIbDmaBufSupport(lComm->dev) == ncclSuccess);
+   rComm->flushEnabled = ((ncclIbGdrSupport() == ncclSuccess || useDmaBuf)
+-                            && (ncclParamIbGdrFlushDisable() == 0)) ? 1 : 0;              
+                            && (ncclIbGdrFlushDisable == 0)) ? 1 : 0;
+   for (int i = 0; i < rComm->base.vProps.ndevs; i++) {
+     rCommDev = rComm->devs + i;
+     ibDev = ncclIbDevs + rCommDev->base.ibDevN;
+ 
+     // Retain remote fifo info and prepare my RDMA ops
+     rComm->remFifo.addr = remMeta.fifoAddr;
+-    NCCLCHECKGOTO(wrap_ibv_reg_mr(&rCommDev->fifoMr, rCommDev->base.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
+    if (rcclCtsInlineData) {
+      NCCLCHECKGOTO(wrap_ibv_reg_mr(&rCommDev->fifoMr, rCommDev->base.pd, &rComm->remFifo.elems_cts_inline,
+                                    sizeof(struct ncclIbSendFifoCtsInline)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS,
+                                    IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
+    } else {
+      NCCLCHECKGOTO(wrap_ibv_reg_mr(&rCommDev->fifoMr, rCommDev->base.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
+    }
+     rCommDev->fifoSge.lkey = rCommDev->fifoMr->lkey;
+-    if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE;
+    if (ncclIbUseInline) rComm->remFifo.flags = IBV_SEND_INLINE;
+ 
+     // Allocate Flush dummy buffer for GPU Direct RDMA
+     if (rComm->flushEnabled) {
+@@ -2039,7 +2171,7 @@ ib_recv:
+       rCommDev->gpuFlush.sge.addr = (uint64_t)&rComm->gpuFlushHostMem;
+       rCommDev->gpuFlush.sge.length = 1;
+       rCommDev->gpuFlush.sge.lkey = rCommDev->gpuFlush.hostMr->lkey;
+-      NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, &rCommDev->gpuFlush.qp), ret, fail);
+      NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, &rCommDev->gpuFlush.qp, channel_id, true, NCCL_CTS_QP_SLOT_INVALID), ret, fail);
+       struct ncclIbDevInfo devInfo;
+       devInfo.lid         = ibDev->portAttr.lid;
+       devInfo.link_layer  = ibDev->portAttr.link_layer;
+@@ -2257,10 +2389,15 @@ ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) {
+ 
+ NCCL_PARAM(IbSplitDataOnQps, "IB_SPLIT_DATA_ON_QPS", 0);
+ 
+-ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
+ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot, bool use_write_op) {
+   struct ncclIbRequest** reqs = comm->fifoReqs[slot];
+   volatile struct ncclIbSendFifo* slots = comm->fifo[slot];
+-  int nreqs = slots[0].nreqs;
+  int nreqs;
+  if (rcclCtsOffloadEnabled) {
+    nreqs = 1;
+  } else {
+    nreqs = slots[0].nreqs;
+  }
+   if (nreqs > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
+ 
+   uint64_t wr_id = 0ULL;
+@@ -2272,7 +2409,11 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
+     sge->addr=(uintptr_t)reqs[r]->send.data;
+     wr->opcode = IBV_WR_RDMA_WRITE;
+     wr->send_flags = 0;
+-    wr->wr.rdma.remote_addr = slots[r].addr;
+    if (rcclCtsOffloadEnabled) {
+      wr->wr.rdma.remote_addr = 0xdeadbeef;
+    } else {
+      wr->wr.rdma.remote_addr = slots[r].addr;
+    }
+     wr->next = wr + 1;
+     wr_id += (reqs[r] - comm->base.reqs) << (r*8);
+ #ifdef NCCL_ENABLE_NET_PROFILING
+@@ -2283,7 +2424,7 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
+   // Write size as immediate data. In the case of multi-send, only write
+   // 0 or 1 as size to indicate whether there was data sent or received.
+   uint32_t immData = 0;
+-  if (nreqs == 1) {
+  if ((nreqs == 1) && (use_write_op == false)) {
+     immData = reqs[0]->send.size;
+   } else {
+     int* sizes = comm->remSizesFifo.elems[slot];
+@@ -2293,22 +2434,24 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
+   }
+ 
+   struct ibv_send_wr* lastWr = comm->wrs+nreqs-1;
+-  if (nreqs > 1 || (comm->ar && reqs[0]->send.size > ncclParamIbArThreshold())) {
+-    // When using ADAPTIVE_ROUTING, send the bulk of the data first as an
+-    // RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote
+-    // completion.
+-    lastWr++;
+-    memset(lastWr, 0, sizeof(struct ibv_send_wr));
+-    if (nreqs > 1) {
+-      // Write remote sizes Fifo
+-      lastWr->wr.rdma.remote_addr = comm->remSizesFifo.addr + slot*NCCL_NET_IB_MAX_RECVS*sizeof(int);
+-      lastWr->num_sge = 1;
+-      lastWr->sg_list = &comm->remSizesFifo.sge;
+  if (use_write_op == false) {
+    if (nreqs > 1 || (comm->ar && reqs[0]->send.size > ncclParamIbArThreshold())) {
+      // When using ADAPTIVE_ROUTING, send the bulk of the data first as an
+      // RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote
+      // completion.
+      lastWr++;
+      memset(lastWr, 0, sizeof(struct ibv_send_wr));
+      if (nreqs > 1) {
+        // Write remote sizes Fifo
+        lastWr->wr.rdma.remote_addr = comm->remSizesFifo.addr + slot*NCCL_NET_IB_MAX_RECVS*sizeof(int);
+        lastWr->num_sge = 1;
+        lastWr->sg_list = &comm->remSizesFifo.sge;
+      }
+     }
+    lastWr->opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
+    lastWr->imm_data = immData;
+   }
+   lastWr->wr_id = wr_id;
+-  lastWr->opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
+-  lastWr->imm_data = immData;
+   lastWr->next = NULL;
+   lastWr->send_flags = IBV_SEND_SIGNALED;
+ 
+@@ -2324,7 +2467,11 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
+       //ncclIbAddEvent(reqs[r], devIndex, &comm->devs[devIndex].base);
+ 
+       // Select proper rkey (needed even for 0-size send)
+-      comm->wrs[r].wr.rdma.rkey = slots[r].rkeys[qp->remDevIdx];
+      if (rcclCtsOffloadEnabled) {
+        comm->wrs[r].wr.rdma.rkey = 0xbade;
+      } else {
+        comm->wrs[r].wr.rdma.rkey = slots[r].rkeys[qp->remDevIdx];
+      }
+ 
+       int chunkSize = DIVUP(DIVUP(reqs[r]->send.size, nqps), align) * align;
+       int length = std::min(reqs[r]->send.size-reqs[r]->send.offset, chunkSize);
+@@ -2340,7 +2487,7 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
+       }
+     }
+ 
+-    if (nreqs > 1) {
+    if ((use_write_op == false) && (nreqs > 1)) {
+       // Also make sure lastWr writes remote sizes using the right lkey
+       comm->remSizesFifo.sge.lkey = comm->remSizesFifo.mrs[devIndex]->lkey;
+       lastWr->wr.rdma.rkey = comm->remSizesFifo.rkeys[devIndex];
+@@ -2398,32 +2545,46 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void*
+   NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__));
+ 
+   struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandle;
+  bool use_write_op = false;
+  if (rcclAinicRoce) {
+      use_write_op = (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) ? true : false;
+  }
+ 
+   // Wait for the receiver to have posted the corresponding receive
+   int nreqs = 0;
+   volatile struct ncclIbSendFifo* slots;
+ 
+  if (rcclCtsOffloadEnabled) {
+      nreqs = 1;
+  }
+
+   int slot = (comm->fifoHead) % MAX_REQUESTS;
+   struct ncclIbRequest** reqs = comm->fifoReqs[slot];
+-  slots = comm->fifo[slot];
+-  uint64_t idx = comm->fifoHead+1;
+-  if (slots[0].idx != idx) { *request = NULL; return ncclSuccess; }
+-  nreqs = slots[0].nreqs;
+-  // Wait until all data has arrived
+-  for (int r=1; r<nreqs; r++) while(slots[r].idx != idx);
+-  __sync_synchronize(); // order the nreqsPtr load against tag/rkey/addr loads below
+  if (!rcclCtsOffloadEnabled) {
+    slots = comm->fifo[slot];
+    uint64_t idx = comm->fifoHead+1;
+    if (slots[0].idx != idx) { *request = NULL; return ncclSuccess; }
+    nreqs = slots[0].nreqs;
+    // Wait until all data has arrived
+    for (int r=1; r<nreqs; r++) while(slots[r].idx != idx);
+    __sync_synchronize(); // order the nreqsPtr load against tag/rkey/addr loads below
+  }
+   for (int r=0; r<nreqs; r++) {
+-    if (reqs[r] != NULL || slots[r].tag != tag) continue;
+-
+-    if (size > slots[r].size) size = slots[r].size;
+-    // Sanity checks
+-    if (slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkeys[0] == 0) {
+-      char line[SOCKET_NAME_MAXLEN + 1];
+-      union ncclSocketAddress addr;
+-      ncclSocketGetAddr(&comm->base.sock, &addr);
+-      WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %ld addr %lx rkeys[0]=%x",
+-        r, nreqs, tag, ncclSocketToString(&addr, line), slots[r].size, slots[r].addr, slots[r].rkeys[0]);
+-      return ncclInternalError;
+    if (!rcclCtsOffloadEnabled) {
+      if (reqs[r] != NULL || slots[r].tag != tag) continue;
+
+      if (size > slots[r].size) size = slots[r].size;
+      // Sanity checks
+      if (slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkeys[0] == 0) {
+        char line[SOCKET_NAME_MAXLEN + 1];
+        union ncclSocketAddress addr;
+        ncclSocketGetAddr(&comm->base.sock, &addr);
+        WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %ld addr %lx rkeys[0]=%x",
+             r, nreqs, tag, ncclSocketToString(&addr, line), slots[r].size, slots[r].addr, slots[r].rkeys[0]);
+        return ncclInternalError;
+      }
+    } else{
+      if (reqs[r] != NULL) continue;
+     }
+ 
+     struct ncclIbRequest* req;
+@@ -2467,10 +2628,12 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void*
+     }
+ 
+     TIME_START(0);
+-    NCCLCHECK(ncclIbMultiSend(comm, slot));
+    NCCLCHECK(ncclIbMultiSend(comm, slot, use_write_op));
+ 
+     // Clear slots[0]->nreqs, as well as other fields to help debugging and sanity checks
+-    memset((void*)slots, 0, sizeof(struct ncclIbSendFifo));
+    if (!rcclCtsOffloadEnabled) {
+      memset((void*)slots, 0, sizeof(struct ncclIbSendFifo));
+    }
+     memset(reqs, 0, NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbRequest*));
+     comm->fifoHead++;
+     TIME_STOP(0);
+@@ -2483,30 +2646,60 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void*
+ 
+ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, size_t* sizes, int* tags, void** mhandles, struct ncclIbRequest* req) {
+   struct ibv_send_wr wr;
+  struct ncclIbSendFifo* localElem = NULL;
+  struct ncclIbSendFifoCtsInline* localElemCtsInline = NULL;
+  uint64_t localElemRef;
+  int qpIndex = 0;
+  ncclIbQp* ctsQp = NULL;
+   memset(&wr, 0, sizeof(wr));
+ 
+   int slot = comm->remFifo.fifoTail%MAX_REQUESTS;
+   req->recv.sizes = comm->sizesFifo[slot];
+   for (int i=0; i<n; i++) req->recv.sizes[i] = 0;
+-  struct ncclIbSendFifo* localElem = comm->remFifo.elems[slot];
+  if (rcclCtsInlineData) {
+    localElemCtsInline = comm->remFifo.elems_cts_inline[slot];
+  } else {
+    localElem = comm->remFifo.elems[slot];
+  }
+ 
+-  // Select the next devIndex (local) and QP to use for posting this CTS message
+-  // Since QPs are initialized by striping across devIndex, we can simply assign this to the same value
+-  ncclIbQp* ctsQp = comm->base.qps + comm->base.devIndex;
+-  comm->base.devIndex = (comm->base.devIndex + 1) % comm->base.vProps.ndevs;
+  if (rcclAinicRoce) {
+    qpIndex = comm->base.qpIndex;
+    ctsQp = comm->base.qps + qpIndex;
+  } else {
+    // Select the next devIndex (local) and QP to use for posting this CTS message
+    // Since QPs are initialized by striping across devIndex, we can simply assign this to the same value
+    ctsQp = comm->base.qps + comm->base.devIndex;
+    comm->base.devIndex = (comm->base.devIndex + 1) % comm->base.vProps.ndevs;
+  }
+ 
+   for (int i=0; i<n; i++) {
+-    localElem[i].addr = (uint64_t)data[i];
+     struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandles[i];
+    if (rcclCtsInlineData) {
+      localElemCtsInline[i].addr = (uint64_t)data[i];
+
+      // Send all applicable rkeys
+      for (int j = 0; j < comm->base.vProps.ndevs; j++)
+        localElemCtsInline[i].rkeys[j] = mhandleWrapper->mrs[j]->rkey;
+
+      localElemCtsInline[i].nreqs = n;
+      localElemCtsInline[i].size = sizes[i]; // Sanity/Debugging
+      localElemCtsInline[i].tag = tags[i];
+      localElemCtsInline[i].idx = comm->remFifo.fifoTail+1;
+      localElemRef = (uint64_t)localElemCtsInline;
+
+    } else {
+      localElem[i].addr = (uint64_t)data[i];
+ 
+-    // Send all applicable rkeys
+-    for (int j = 0; j < comm->base.vProps.ndevs; j++)
+-      localElem[i].rkeys[j] = mhandleWrapper->mrs[j]->rkey;
+      // Send all applicable rkeys
+      for (int j = 0; j < comm->base.vProps.ndevs; j++)
+        localElem[i].rkeys[j] = mhandleWrapper->mrs[j]->rkey;
+ 
+-    localElem[i].nreqs = n;
+-    localElem[i].size = sizes[i]; // Sanity/Debugging
+-    localElem[i].tag = tags[i];
+-    localElem[i].idx = comm->remFifo.fifoTail+1;
+      localElem[i].nreqs = n;
+      localElem[i].size = sizes[i]; // Sanity/Debugging
+      localElem[i].tag = tags[i];
+      localElem[i].idx = comm->remFifo.fifoTail+1;
+      localElemRef = (uint64_t)localElem;
+    }
+   }
+   wr.wr.rdma.remote_addr = comm->remFifo.addr + slot*NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbSendFifo);
+ 
+@@ -2514,8 +2707,12 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz
+   wr.wr.rdma.rkey = comm->base.remDevs[ctsQp->remDevIdx].fifoRkey;
+ 
+   // Set the correct sge properties
+-  comm->devs[ctsQp->devIndex].fifoSge.addr   = (uint64_t)localElem;
+-  comm->devs[ctsQp->devIndex].fifoSge.length = n*sizeof(struct ncclIbSendFifo);
+  comm->devs[ctsQp->devIndex].fifoSge.addr   = localElemRef;
+  if (rcclCtsInlineData) {
+    comm->devs[ctsQp->devIndex].fifoSge.length = MAX_INLINE_DATA_SIZE;
+  } else {
+    comm->devs[ctsQp->devIndex].fifoSge.length = n*sizeof(struct ncclIbSendFifo);
+  }
+   wr.sg_list = &comm->devs[ctsQp->devIndex].fifoSge;
+   wr.num_sge = 1;
+ 
+@@ -2545,7 +2742,13 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz
+   //
+   // slot == devIndex - When writing to fifo slot N, and this QP lives on device index N, it should send signalled.
+   // This works out that each fifo posting QP gets drained
+-  if (slot == ctsQp->devIndex) {
+  if (rcclAinicRoce) {
+    if (slot == ctsQp->ctsQpSlot) {
+      wr.send_flags |= IBV_SEND_SIGNALED;
+      wr.wr_id = req - comm->base.reqs;
+      ncclIbAddEvent(req, ctsQp->devIndex, &comm->devs[ctsQp->devIndex].base);
+    }
+  } else if (slot == ctsQp->devIndex) {
+     wr.send_flags |= IBV_SEND_SIGNALED;
+     wr.wr_id = req - comm->base.reqs;
+     ncclIbAddEvent(req, ctsQp->devIndex, &comm->devs[ctsQp->devIndex].base);
+@@ -2560,10 +2763,16 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz
+ 
+   comm->remFifo.fifoTail++;
+ 
+  if (rcclAinicRoce) {
+    // Select the next qpIndex
+    comm->base.qpIndex = (comm->base.qpIndex+1) % comm->base.nqps;
+  }
+   return ncclSuccess;
+ }
+ 
+ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) {
+  ncclResult_t res = ncclSuccess;
+  bool netOptRecvCompletionEnabled = false;
+   struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
+   if (comm->base.ready == 0) {
+     WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0");
+@@ -2573,6 +2782,11 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int*
+   if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
+   NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__));
+ 
+  if (rcclAinicRoce) {
+    if (*request == (void *) NCCL_NET_OPTIONAL_RECV_COMPLETION) {
+        netOptRecvCompletionEnabled = true;
+    }
+  }
+   struct ncclIbRequest* req;
+   NCCLCHECK(ncclIbGetRequest(&comm->base, &req));
+   req->type = NCCL_NET_IB_REQ_RECV;
+@@ -2586,50 +2800,64 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int*
+     req->devBases[i] = &comm->devs[i].base;
+   }
+ 
+-  struct ibv_recv_wr wr;
+-  memset(&wr, 0, sizeof(wr));
+-  wr.wr_id = req - comm->base.reqs;
+-  wr.sg_list = NULL;
+-  wr.num_sge = 0;
+  if (!netOptRecvCompletionEnabled) {
+    struct ibv_recv_wr wr;
+    memset(&wr, 0, sizeof(wr));
+    wr.wr_id = req - comm->base.reqs;
+    wr.sg_list = NULL;
+    wr.num_sge = 0;
+ 
+-  TIME_START(1);
+-  // Select either all QPs, or one qp per-device
+-  const int nqps = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.nDataQps;
+    TIME_START(1);
+    // Select either all QPs, or one qp per-device
+    const int nqps = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.nDataQps;
+ 
+-  // Post recvs
+-  struct ibv_recv_wr* bad_wr;
+-  for (int i = 0; i < nqps; i++) {
+-    struct ncclIbQp* qp = comm->base.qps + comm->base.qpIndex;
+-    ncclIbAddEvent(req, qp->devIndex, &comm->devs[qp->devIndex].base);
+    // Post recvs
+    struct ibv_recv_wr* bad_wr;
+    int qpIndex = comm->base.qpIndex;
+    for (int i = 0; i < nqps; i++) {
+      struct ncclIbQp* qp = comm->base.qps + comm->base.qpIndex;
+      ncclIbAddEvent(req, qp->devIndex, &comm->devs[qp->devIndex].base);
+ #ifdef NCCL_ENABLE_NET_PROFILING
+-    // Start a QP event for every request in the multirecv and every qp
+-    for (int r = 0; r < n; r++) {
+-      int nEventHandles = req->pInfo[r].nEventHandles;
+-      assert(nEventHandles < MAX_QPS_PER_REQ);
+-      req->pInfo[r].qpIndex[nEventHandles] = comm->base.qpIndex;
+-      // Store info for profiler
+-      int64_t pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER;
+-      req->pInfo[r].data.type = ncclProfileQp;
+-      req->pInfo[r].data.qp.device = qp->devIndex;
+-      req->pInfo[r].data.qp.wr_id = wr.wr_id;
+-      req->pInfo[r].data.qp.qpNum = qp->qp->qp_num;
+-      NCCLCHECK(ncclProfilerFunction(&req->pInfo[r].qpEventHandles[nEventHandles], ncclProfilerNetEventStart, phandles[r], pluginId, &req->pInfo[r].data));
+-      req->pInfo[r].nEventHandles++;
+-    }
+      // Start a QP event for every request in the multirecv and every qp
+      for (int r = 0; r < n; r++) {
+        int nEventHandles = req->pInfo[r].nEventHandles;
+        assert(nEventHandles < MAX_QPS_PER_REQ);
+        req->pInfo[r].qpIndex[nEventHandles] = comm->base.qpIndex;
+        // Store info for profiler
+        int64_t pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER;
+        req->pInfo[r].data.type = ncclProfileQp;
+        req->pInfo[r].data.qp.device = qp->devIndex;
+        req->pInfo[r].data.qp.wr_id = wr.wr_id;
+        req->pInfo[r].data.qp.qpNum = qp->qp->qp_num;
+        NCCLCHECK(ncclProfilerFunction(&req->pInfo[r].qpEventHandles[nEventHandles], ncclProfilerNetEventStart, phandles[r], pluginId, &req->pInfo[r].data));
+        req->pInfo[r].nEventHandles++;
+      }
+ #endif
+-    NCCLCHECK(wrap_ibv_post_recv(qp->qp, &wr, &bad_wr));
+-    comm->base.qpIndex = (comm->base.qpIndex+1)%comm->base.nqps;
+-  }
+      NCCLCHECKGOTO(wrap_ibv_post_recv(qp->qp, &wr, &bad_wr), res, err);
+      // Don't update comm->base.qpIndex yet, we need to run through this same set of QPs
+      // inside ncclIbPostFifo()
+      if (rcclAinicRoce) {
+        qpIndex = (qpIndex+1)%comm->base.nqps;
+      } else {
+        comm->base.qpIndex = (comm->base.qpIndex+1)%comm->base.nqps;
+      }
+    }
+ 
+-  TIME_STOP(1);
+    TIME_STOP(1);
+  } // if (!netOptRecvCompletionEnabled)
+ 
+   // Post to FIFO to notify sender
+   TIME_START(2);
+-  NCCLCHECK(ncclIbPostFifo(comm, n, data, sizes, tags, mhandles, req));
+  NCCLCHECKGOTO(ncclIbPostFifo(comm, n, data, sizes, tags, mhandles, req), res, err);
+   TIME_STOP(2);
+ 
+   *request = req;
+   return ncclSuccess;
+err:
+  if (req) {
+      ncclIbFreeRequest(req);
+  }
+  return res;
+ }
+ 
+ ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
+@@ -2698,6 +2926,8 @@ static int getReqQpIndex(struct ncclIbRequest* req, int request, int qpNumber) {
+ }
+ #endif
+ 
+#define NCCL_CQ_POLL_MAX_EVENT        16
+
+ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
+   struct ncclIbRequest *r = (struct ncclIbRequest*)request;
+   *done = 0;
+@@ -2731,13 +2961,18 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
+ 
+     int totalWrDone = 0;
+     int wrDone = 0;
+-    struct ibv_wc wcs[4];
+    struct ibv_wc wcs[NCCL_CQ_POLL_MAX_EVENT];
+    int cqMaxPollEvent = 4;
+    if (rcclAinicRoce) {
+        cqMaxPollEvent = NCCL_CQ_POLL_MAX_EVENT;
+    }
+ 
+     for (int i = 0; i < NCCL_IB_MAX_DEVS_PER_NIC; i++) {
+       TIME_START(3);
+       // If we expect any completions from this device's CQ
+       if (r->events[i]) {
+-        NCCLCHECK(wrap_ibv_poll_cq(r->devBases[i]->cq, 4, wcs, &wrDone));
+        NCCLCHECK(wrap_ibv_poll_cq(r->devBases[i]->cq, cqMaxPollEvent,
+                                   wcs, &wrDone));
+         totalWrDone += wrDone;
+         if (wrDone == 0) { TIME_CANCEL(3); } else { TIME_STOP(3); }
+         if (wrDone == 0) continue;
+@@ -2889,7 +3124,7 @@ ncclResult_t rcclNetP2pPolicy(void* handle, int isP2p) {
+ }
+ 
+ ncclNet_t ncclNetIb = {
+-  "IB",
+  "ROCM-IB",
+   ncclIbInit,
+   ncclIbDevices,
+   ncclIbGetProperties,
@@ -179,4 +179,4 @@ When developing new tuner plugins:
 - [NCCL Documentation](https://docs.nvidia.com/deeplearning/nccl/)
 - Example plugin implementations in this directory

-For questions and support, refer to the NCCL community resources and documentation.
+For questions and support, refer to the NCCL community resources and documentation.
@@ -0,0 +1,49 @@
+# Compiled shared objects and binaries
+*.so
+*.o
+*.a
+*.out
+*.exe
+*.dll
+*.dylib
+*.bin
+*.elf
+
+# Python cache
+__pycache__/
+*.pyc
+*.pyo
+
+# Build and test artifacts
+/build/
+*.log
+*.tmp
+*.swp
+
+# Ignore all CSV files except scripts/sample_performance_data.csv
+*.csv
+!scripts/sample_performance_data.csv
+
+# Ignore all .conf files except nccl_tuner.conf
+*.conf
+!nccl_tuner.conf
+
+my_configs
+
+# Ignore test binary
+test/test_plugin
+
+# Editor/OS files
+.DS_Store
+Thumbs.db
+
+# Backup files
+*~
+*.bak
+
+# Ignore by convention
+*.old
+*.orig
+
+# Git
+.git/
@@ -0,0 +1,26 @@
+# Source files of the example tuner plugin (listed explicitly, no globbing)
+set(SRC_FILES
+    ${CMAKE_CURRENT_SOURCE_DIR}/plugin.c
+)
+
+# Create shared library
+add_library(nccl-tuner-example SHARED ${SRC_FILES})
+
+# Set include directories
+target_include_directories(nccl-tuner-example PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/nccl
+)
+
+# Match the Makefile's output name; the library is emitted under test/unit/plugins
+set_target_properties(nccl-tuner-example PROPERTIES
+    OUTPUT_NAME "nccl-tuner-example"
+    PREFIX "lib"
+    POSITION_INDEPENDENT_CODE ON
+    LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/test/unit/plugins
+)
+
+# Clean target (equivalent to Makefile clean); path must match LIBRARY_OUTPUT_DIRECTORY above
+add_custom_target(clean-tuner-lib
+    COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_BINARY_DIR}/test/unit/plugins/libnccl-tuner-example.so
+    COMMENT "Cleaning libnccl-tuner-example.so"
+)
@@ -45,6 +45,40 @@ typedef enum {

 #define NCCL_ALGO_PROTO_IGNORE -1.0

+#define NCCL_HW_NVLINK 0
+#define NCCL_HW_PCI 1
+#define NCCL_HW_NET 2
+#define NCCL_NUM_HW_LINKS 3
+
+#define NCCL_VOLTA_COMPCAP_IDX 0
+#define NCCL_AMPERE_COMPCAP_IDX 1
+#define NCCL_HOPPER_COMPCAP_IDX 2
+#define NCCL_BLACKWELL_COMPCAP_IDX 3
+#define NCCL_NUM_COMPCAPS 4
+
+#define NCCL_TUNING_SCALE_1NODE 0
+#define NCCL_TUNING_SCALE_2NODES 1
+#define NCCL_TUNING_SCALE_4NODES 2
+#define NCCL_NUM_TUNING_SCALES 3
+
+typedef struct {  // NVLink-domain summary handed to the tuner's init() as nvlDomainInfo
+  int nNvlDomains;                    // number of NVLink domains
+  int minRanksPerNvlDomain;           // minimum ranks across all NVLink domains
+  int maxRanksPerNvlDomain;           // maximum ranks across all NVLink domains
+} ncclNvlDomainInfo_v5_t;
+
+typedef struct {  // Tuning-model constants handed to the tuner's init() as the in/out `constants` argument
+  double baseLatencies [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];  // indexed [algorithm][protocol]
+  double hwLatencies [NCCL_NUM_HW_LINKS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];  // indexed [NCCL_HW_* link][algorithm][protocol]
+
+  double llMaxBws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES];  // indexed [NCCL_*_COMPCAP_IDX][NCCL_TUNING_SCALE_*]
+  double perChMaxRingLL128Bws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES];  // same indexing as llMaxBws
+  double perChMaxTreeLL128Bws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES];  // same indexing as llMaxBws
+  double perChMaxTreeBws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES];  // same indexing as llMaxBws
+
+
+} ncclTunerConstants_v5_t;
+
 // API to be implemented by external tuner
 typedef struct {
  // Name of the tuner
@@ -52,12 +86,17 @@ typedef struct {

  // Initializes tuner states.
  // Inputs:
+  //   - commId: communicator identifier
  //   - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner.
  //   - nNodes: number of nodes in current communicator.
  //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+  //   - nvlDomainInfo: NVL domain information struct
  // Outputs:
  //   - context: tuner context object
-  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
+  // Input/Output:
+  //   - constants: NCCL tuning-model constants; the plugin may modify them in place
+  ncclResult_t (*init)(void** ctx, uint64_t commId, size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction,
+                      ncclNvlDomainInfo_v5_t* nvlDomainInfo, ncclTunerConstants_v5_t* constants);

  // Gets info (algo, protocol, number of ctas and threads) for a given collective.
  // Inputs:
@@ -87,11 +126,13 @@ typedef struct {

  // Terminates the plugin and cleans up any resources that the plugin allocated.
  // context: tuner context object
-  ncclResult_t (*destroy)(void* context);
-} ncclTuner_v4_t;
+  ncclResult_t (*finalize)(void* context);
+} ncclTuner_v5_t;

-typedef ncclTuner_v4_t ncclTuner_t;
+typedef ncclTuner_v5_t ncclTuner_t;
+typedef ncclNvlDomainInfo_v5_t ncclNvlDomainInfo_t;
+typedef ncclTunerConstants_v5_t ncclTunerConstants_t;

-#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4"
+#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v5"

 #endif
@@ -51,6 +51,7 @@ typedef struct {
  size_t nRanks;
  size_t nNodes;
  ncclDebugLogger_t logFunction;
+  ncclNvlDomainInfo_v5_t nvlDomainInfo;
 } TunerContext;

 // Parse collective type from string
@@ -289,7 +290,25 @@ static ncclResult_t loadConfig(TunerContext* ctx, const char* filename) {
  return ncclSuccess;
 }

-__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) {
+__hidden ncclResult_t pluginInit(void** context, uint64_t commId, size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction,
+                                 ncclNvlDomainInfo_v5_t* nvlDomainInfo, ncclTunerConstants_v5_t* constants) {
+
+  if (NULL != constants) {
+    // NCCL constants tuning
+    // Tune NCCL's internal tuning model to improve base algo/proto selection.
+    // Note: Example numbers are for reference only.
+    //       Actual numbers may vary depending on the hardware and network topology.
+    //       These numbers are not guaranteed to be optimal for all cases.
+    // Limit the per-channel tree bandwidth to 15GB/s
+    constants->perChMaxTreeBws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] = 15.0;
+
+    // Limit the per-channel ring LL128 bandwidth to 20GB/s
+    constants->perChMaxRingLL128Bws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] = 20.0;
+
+    // Set NVLS base network latency to 24us (NOTE(review): comment said NVLSTree but the index is NCCL_ALGO_NVLS; confirm which is intended)
+    constants->hwLatencies[NCCL_HW_NET][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] = 24.0;
+  }
+  
  TunerContext* ctx = (TunerContext*)malloc(sizeof(TunerContext));
  if (!ctx) return ncclSystemError;

@@ -299,10 +318,16 @@ __hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t
  ctx->nRanks = nRanks;
  ctx->nNodes = nNodes;
  ctx->logFunction = logFunction;
+  if (nvlDomainInfo) {
+    ctx->nvlDomainInfo = *nvlDomainInfo;
+  } else {
+    memset(&ctx->nvlDomainInfo, 0, sizeof(ncclNvlDomainInfo_v5_t));
+  }

  if (logFunction) {
    logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
-                "TUNER/ExamplePlugin: Initializing tuner for %zu nodes, %zu ranks", nNodes, nRanks);
+                "TUNER/ExamplePlugin: Initializing tuner for %zu nodes, %zu ranks, %d NVL domains",
+                nNodes, nRanks, ctx->nvlDomainInfo.nNvlDomains);
  }

  // Try to load config file from environment variable or default location
@@ -435,7 +460,7 @@ __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size
  return ncclSuccess;
 }

-__hidden ncclResult_t pluginDestroy(void* context) {
+__hidden ncclResult_t pluginFinalize(void* context) {
  if (context) {
    TunerContext* ctx = (TunerContext*)context;
    if (ctx->configs) {
@@ -446,11 +471,12 @@ __hidden ncclResult_t pluginDestroy(void* context) {
  return ncclSuccess;
 }

+
 #define PLUGIN_NAME "Example"

-const ncclTuner_v4_t ncclTunerPlugin_v4 = {
+const ncclTuner_v5_t ncclTunerPlugin_v5 = {
  .name = PLUGIN_NAME,
  .init = pluginInit,
  .getCollInfo = pluginGetCollInfo,
-  .destroy = pluginDestroy
+  .finalize = pluginFinalize
 };
@@ -0,0 +1,53 @@
+# NCCL Tuner Configuration File (CSV Format)
+# Format: collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
+#
+# Collective types: broadcast, reduce, allgather, reducescatter, allreduce
+# Algorithms: tree, ring, collnet_direct, collnet_chain, nvls, nvls_tree, pat
+# Protocols: ll, ll128, simple
+# Channels: number of channels to use, or -1 to keep default
+# nNodes: number of nodes to match, or -1 for any number of nodes
+# nRanks: number of ranks to match, or -1 for any number of ranks
+# numPipeOps: number of pipeline operations to match, or -1 for any number (optional)
+# regBuff: whether user buffer can be registered (0=no, 1=yes, -1=any) (optional)
+#
+# Note: numPipeOps and regBuff parameters are optional - configurations without them will match any value
+#
+#AR 4PPN
+allreduce,33554432,4294967296,ring,simple,16,2,8,-1,-1
+allreduce,33554432,4294967296,ring,simple,16,4,16,-1,-1
+allreduce,67108864,4294967296,ring,simple,16,8,32,-1,-1
+#AR 2PPN
+allreduce,2097152,4294967296,ring,simple,4,2,4,-1,-1
+allreduce,16777216,4294967296,ring,simple,4,4,8,-1,-1
+allreduce,33554432,4294967296,ring,simple,4,8,16,-1,-1
+#AR 1PPN
+allreduce,134217728,4294967296,ring,simple,4,4,4,-1,-1
+allreduce,67108864,4294967296,ring,simple,4,8,8,-1,-1
+
+
+#AG 4PPN
+allgather,8388608,4294967296,ring,simple,16,2,8,-1,-1
+allgather,16777216,4294967296,ring,simple,16,4,16,-1,-1
+allgather,16777216,4294967296,ring,simple,16,8,32,-1,-1
+#AG 2PPN
+allgather,262144,4294967296,ring,simple,4,2,4,-1,-1
+allgather,16777216,4294967296,ring,simple,4,4,8,-1,-1
+allgather,33554432,4294967296,ring,simple,4,8,16,-1,-1
+#AG 1PPN
+allgather,262144,2097152,ring,simple,4,2,2,-1,-1
+allgather,262144,8388608,ring,simple,4,4,4,-1,-1
+allgather,67108864,4294967296,ring,simple,4,8,8,-1,-1
+
+#RS 4PPN
+reducescatter,1048576,4294967296,ring,simple,16,2,8,-1,-1
+reducescatter,1048576,4294967296,ring,simple,16,4,16,-1,-1
+reducescatter,1048576,4294967296,ring,simple,16,8,32,-1,-1
+#RS 2PPN
+reducescatter,262144,33554432,ring,simple,4,2,4,-1,-1
+reducescatter,262144,4294967296,ring,simple,4,4,8,-1,-1
+reducescatter,262144,4294967296,ring,simple,4,8,16,-1,-1
+#RS 1PPN
+reducescatter,131072,262144,ring,simple,4,2,2,-1,-1
+reducescatter,1048576,2097152,ring,simple,4,2,2,-1,-1
+reducescatter,131072,4194304,ring,simple,4,4,4,-1,-1
+reducescatter,262144,8388608,ring,simple,4,8,8,-1,-1
@@ -98,12 +98,12 @@ int test_plugin_init() {
  void* context = NULL;

  // Test successful initialization
-  ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
+  ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, NULL);
  TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed");
  TEST_ASSERT(context != NULL, "Context should be allocated");

  // Clean up
-  pluginDestroy(context);
+  pluginFinalize(context);
  TEST_PASS();
 }

@@ -123,11 +123,11 @@ int test_config_parsing_valid() {
  setenv("NCCL_TUNER_CONFIG_FILE", "test_valid.conf", 1);

  void* context = NULL;
-  ncclResult_t result = pluginInit(16, 2, mock_logger, &context);
+  ncclResult_t result = pluginInit(&context, 0, 16, 2, mock_logger, NULL, NULL);
  TEST_ASSERT(result == ncclSuccess, "Plugin init with valid config should succeed");

  // Clean up
-  pluginDestroy(context);
+  pluginFinalize(context);
  unlink("test_valid.conf");
  unsetenv("NCCL_TUNER_CONFIG_FILE");
  TEST_PASS();
@@ -144,12 +144,12 @@ int test_config_parsing_invalid() {
  setenv("NCCL_TUNER_CONFIG_FILE", "test_invalid.conf", 1);

  void* context = NULL;
-  ncclResult_t result = pluginInit(8, 1, mock_logger, &context);
+  ncclResult_t result = pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);
  // Should still succeed but with no valid configs loaded
  TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed even with invalid config");

  // Clean up
-  pluginDestroy(context);
+  pluginFinalize(context);
  unlink("test_invalid.conf");
  unsetenv("NCCL_TUNER_CONFIG_FILE");
  TEST_PASS();
@@ -165,7 +165,7 @@ int test_collective_matching() {
  setenv("NCCL_TUNER_CONFIG_FILE", "test_match.conf", 1);

  void* context = NULL;
-  pluginInit(8, 1, mock_logger, &context);
+  pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);

  // Create mock cost table
  float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
@@ -209,7 +209,7 @@ int test_collective_matching() {
  TEST_ASSERT(nChannels == 4, "Should set 4 channels");

  // Clean up
-  pluginDestroy(context);
+  pluginFinalize(context);
  unlink("test_match.conf");
  unsetenv("NCCL_TUNER_CONFIG_FILE");
  TEST_PASS();
@@ -226,7 +226,7 @@ int test_size_matching() {
  setenv("NCCL_TUNER_CONFIG_FILE", "test_size.conf", 1);

  void* context = NULL;
-  pluginInit(8, 1, mock_logger, &context);
+  pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);

  float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
@@ -280,7 +280,7 @@ int test_size_matching() {
  TEST_ASSERT(nChannels == 8, "Large: Should set 8 channels");

  // Clean up
-  pluginDestroy(context);
+  pluginFinalize(context);
  unlink("test_size.conf");
  unsetenv("NCCL_TUNER_CONFIG_FILE");
  TEST_PASS();
@@ -298,7 +298,7 @@ int test_topology_matching() {

  // Test with single node setup
  void* context1 = NULL;
-  pluginInit(8, 1, mock_logger, &context1);  // 8 ranks, 1 node
+  pluginInit(&context1, 0, 8, 1, mock_logger, NULL, NULL);  // 8 ranks, 1 node

  float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
@@ -316,11 +316,11 @@ int test_topology_matching() {
  TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Single node: Should match tree config");
  TEST_ASSERT(nChannels == 2, "Single node: Should set 2 channels");

-  pluginDestroy(context1);
+  pluginFinalize(context1);

  // Test with 4 nodes, 32 ranks setup
  void* context2 = NULL;
-  pluginInit(32, 4, mock_logger, &context2);  // 32 ranks, 4 nodes
+  pluginInit(&context2, 0, 32, 4, mock_logger, NULL, NULL);  // 32 ranks, 4 nodes

  for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
    for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
@@ -349,7 +349,7 @@ int test_default_channels() {
  setenv("NCCL_TUNER_CONFIG_FILE", "test_default.conf", 1);

  void* context = NULL;
-  pluginInit(8, 1, mock_logger, &context);
+  pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);

  float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
@@ -369,7 +369,7 @@ int test_default_channels() {
  TEST_ASSERT(nChannels == 1, "Should keep default channels (1) when config has -1");

  // Clean up
-  pluginDestroy(context);
+  pluginFinalize(context);
  unlink("test_default.conf");
  unsetenv("NCCL_TUNER_CONFIG_FILE");
  TEST_PASS();
@@ -386,7 +386,7 @@ int test_regbuff_matching() {
  setenv("NCCL_TUNER_CONFIG_FILE", "test_regbuff.conf", 1);

  void* context = NULL;
-  pluginInit(8, 1, mock_logger, &context);
+  pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);

  float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
@@ -437,7 +437,7 @@ int test_regbuff_matching() {
  TEST_ASSERT(nChannels == 8, "Any regBuff: Should set 8 channels");

  // Clean up
-  pluginDestroy(context);
+  pluginFinalize(context);
  unlink("test_regbuff.conf");
  unsetenv("NCCL_TUNER_CONFIG_FILE");
  TEST_PASS();
@@ -454,7 +454,7 @@ int test_pipeops_matching() {
  setenv("NCCL_TUNER_CONFIG_FILE", "test_pipeops.conf", 1);

  void* context = NULL;
-  pluginInit(8, 1, mock_logger, &context);
+  pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);

  float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
@@ -504,7 +504,7 @@ int test_pipeops_matching() {
  TEST_ASSERT(nChannels == 8, "Any pipeOps: Should set 8 channels");

  // Clean up
-  pluginDestroy(context);
+  pluginFinalize(context);
  unlink("test_pipeops.conf");
  unsetenv("NCCL_TUNER_CONFIG_FILE");
  TEST_PASS();
@@ -519,7 +519,7 @@ int test_no_match_fallback() {
  setenv("NCCL_TUNER_CONFIG_FILE", "test_fallback.conf", 1);

  void* context = NULL;
-  pluginInit(8, 1, mock_logger, &context);
+  pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);

  float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
@@ -543,7 +543,7 @@ int test_no_match_fallback() {
  TEST_ASSERT(nChannels == 1, "Should use default channels");

  // Clean up
-  pluginDestroy(context);
+  pluginFinalize(context);
  unlink("test_fallback.conf");
  unsetenv("NCCL_TUNER_CONFIG_FILE");
  TEST_PASS();
@@ -593,7 +593,7 @@ int test_large_config() {

  // Initialize plugin with large config
  void* context = NULL;
-  ncclResult_t result = pluginInit(16, 4, mock_logger, &context);
+  ncclResult_t result = pluginInit(&context, 0, 16, 4, mock_logger, NULL, NULL);
  TEST_ASSERT(result == ncclSuccess, "Plugin init with large config should succeed");
  TEST_ASSERT(context != NULL, "Context should be allocated");

@@ -652,7 +652,7 @@ int test_large_config() {
  TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with large config set");

  // Clean up
-  pluginDestroy(context);
+  pluginFinalize(context);
  unlink(large_config_file);
  unsetenv("NCCL_TUNER_CONFIG_FILE");

@@ -684,7 +684,7 @@ int test_very_large_config_stress() {

  // Test initialization with stress config
  void* context = NULL;
-  ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
+  ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, NULL);
  TEST_ASSERT(result == ncclSuccess, "Plugin should handle very large config files");

  TunerContext* ctx = (TunerContext*)context;
@@ -705,7 +705,7 @@ int test_very_large_config_stress() {
  }

  // Clean up
-  pluginDestroy(context);
+  pluginFinalize(context);
  unlink(stress_config_file);
  unsetenv("NCCL_TUNER_CONFIG_FILE");

@@ -726,7 +726,7 @@ int test_empty_config() {
  setenv("NCCL_TUNER_CONFIG_FILE", empty_config_file, 1);

  void* context = NULL;
-  ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
+  ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, NULL);
  TEST_ASSERT(result == ncclSuccess, "Plugin should handle empty config files");

  TunerContext* ctx = (TunerContext*)context;
@@ -751,13 +751,134 @@ int test_empty_config() {
  TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with empty config");

  // Clean up
-  pluginDestroy(context);
+  pluginFinalize(context);
  unlink(empty_config_file);
  unsetenv("NCCL_TUNER_CONFIG_FILE");

  TEST_PASS();
 }

+// Test NVLink domain info handling: pluginInit must copy the caller's NVL domain info into its context
+int test_nvl_domain_info() {
+  printf("Testing NVLink domain info handling...\n");
+
+  // Test NVLink domain structure with min/max ranks per domain
+  ncclNvlDomainInfo_v5_t nvl_domain = {
+    .nNvlDomains = 2, // 2 nodes = 2 domains
+    .minRanksPerNvlDomain = 3, // minimum ranks across all domains (bottleneck)
+    .maxRanksPerNvlDomain = 5  // maximum ranks across all domains (capacity)
+  };
+
+  void* context = NULL;
+  ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, &nvl_domain, NULL);
+  TEST_ASSERT(result == ncclSuccess && context != NULL, "Plugin init with NVLink domains should succeed");
+  TunerContext* ctx = (TunerContext*)context;
+  // Validate the copy stored in the tuner context; asserting on the local input struct would always pass
+  TEST_ASSERT(ctx->nvlDomainInfo.nNvlDomains == 2, "Should have 2 domains (nodes)");
+  TEST_ASSERT(ctx->nvlDomainInfo.minRanksPerNvlDomain == 3, "Should have minimum 3 ranks per domain");
+  TEST_ASSERT(ctx->nvlDomainInfo.maxRanksPerNvlDomain == 5, "Should have maximum 5 ranks per domain");
+
+  // Clean up
+  pluginFinalize(context);
+  printf("NVLink domain info test passed!\n");
+  TEST_PASS();
+}
+
+int test_tuner_constants() {
+  // Verifies pluginInit writes its tuning values into a caller-supplied constants struct; every entry starts at the sentinel -1.0
+  ncclTunerConstants_v5_t constants = {
+    // Base latencies: [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]
+    .baseLatencies = {
+      {-1.0, -1.0, -1.0},    // NCCL_ALGO_TREE: LL, LL128, Simple
+      {-1.0, -1.0, -1.0},    // NCCL_ALGO_RING: LL, LL128, Simple
+      {-1.0, -1.0, -1.0},   // NCCL_ALGO_COLLNET_DIRECT
+      {-1.0, -1.0, -1.0},   // NCCL_ALGO_COLLNET_CHAIN
+      {-1.0, -1.0, -1.0},    // NCCL_ALGO_NVLS
+      {-1.0, -1.0, -1.0},    // NCCL_ALGO_NVLS_TREE
+      {-1.0, -1.0, -1.0}     // NCCL_ALGO_PAT
+    },
+
+    // Hardware latencies: [NCCL_NUM_HW_LINKS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]
+    .hwLatencies = {
+      // NCCL_HW_NVLINK
+      {
+        {-1.0, -1.0, -1.0},    // TREE
+        {-1.0, -1.0, -1.0},    // RING
+        {-1.0, -1.0, -1.0},    // COLLNET_DIRECT
+        {-1.0, -1.0, -1.0},    // COLLNET_CHAIN
+        {-1.0, -1.0, -1.0},    // NVLS
+        {-1.0, -1.0, -1.0},    // NVLS_TREE
+        {-1.0, -1.0, -1.0}     // PAT
+      },
+      // NCCL_HW_PCI
+      {
+        {-1.0, -1.0, -1.0},   // TREE
+        {-1.0, -1.0, -1.0},    // RING
+        {-1.0, -1.0, -1.0},  // COLLNET_DIRECT
+        {-1.0, -1.0, -1.0},  // COLLNET_CHAIN
+        {-1.0, -1.0, -1.0},     // NVLS
+        {-1.0, -1.0, -1.0},   // NVLS_TREE
+        {-1.0, -1.0, -1.0}   // PAT
+      },
+      // NCCL_HW_NET
+      {
+        {-1.0, -1.0, -1.0},  // TREE
+        {-1.0, -1.0, -1.0},  // RING
+        {-1.0, -1.0, -1.0},  // COLLNET_DIRECT
+        {-1.0, -1.0, -1.0},  // COLLNET_CHAIN
+        {-1.0, -1.0, -1.0},  // NVLS
+        {-1.0, -1.0, -1.0},  // NVLS_TREE
+        {-1.0, -1.0, -1.0}   // PAT
+      }
+    },
+
+    // LL maximum bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]
+    .llMaxBws = {
+      {-1.0, -1.0, -1.0},  // Volta: 1node, 2nodes, 4nodes
+      {-1.0, -1.0, -1.0},  // Ampere: 1node, 2nodes, 4nodes
+      {-1.0, -1.0, -1.0},  // Hopper: 1node, 2nodes, 4nodes
+      {-1.0, -1.0, -1.0}   // Blackwell: 1node, 2nodes, 4nodes
+    },
+
+    // Per-channel maximum Ring LL128 bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]
+    .perChMaxRingLL128Bws = {
+      {-1.0, -1.0, -1.0},   // Volta: 1node, 2nodes, 4nodes
+      {-1.0, -1.0, -1.0},  // Ampere: 1node, 2nodes, 4nodes
+      {-1.0, -1.0, -1.0},  // Hopper: 1node, 2nodes, 4nodes
+      {-1.0, -1.0, -1.0}   // Blackwell: 1node, 2nodes, 4nodes
+    },
+
+    // Per-channel maximum Tree LL128 bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]
+    .perChMaxTreeLL128Bws = {
+      {-1.0, -1.0, -1.0},    // Volta: 1node, 2nodes, 4nodes
+      {-1.0, -1.0, -1.0},   // Ampere: 1node, 2nodes, 4nodes
+      {-1.0, -1.0, -1.0},  // Hopper: 1node, 2nodes, 4nodes
+      {-1.0, -1.0, -1.0}   // Blackwell: 1node, 2nodes, 4nodes
+    },
+
+    // Per-channel maximum Tree bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]
+    .perChMaxTreeBws = {
+      {-1.0, -1.0, -1.0},  // Volta: 1node, 2nodes, 4nodes
+      {-1.0, -1.0, -1.0},  // Ampere: 1node, 2nodes, 4nodes
+      {-1.0, -1.0, -1.0},  // Hopper: 1node, 2nodes, 4nodes
+      {-1.0, -1.0, -1.0}   // Blackwell: 1node, 2nodes, 4nodes
+    }
+  };
+
+  void* context = NULL;
+  ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, &constants);
+  TEST_ASSERT(result == ncclSuccess, "Plugin init with constants should succeed");
+
+  // Check the three entries pluginInit is expected to overwrite. NOTE(review): the last message says "NVLSTree" but the index checked is NCCL_ALGO_NVLS - confirm which is intended
+  TEST_ASSERT(constants.perChMaxTreeBws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] == 15.0, "Tree bandwidth should be 15GB/s");
+  TEST_ASSERT(constants.perChMaxRingLL128Bws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] == 20.0, "Ring bandwidth should be 20GB/s");
+  TEST_ASSERT(constants.hwLatencies[NCCL_HW_NET][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] == 24.0, "NVLSTree base network latency should be 24us");
+
+  // Clean up
+  pluginFinalize(context);
+  TEST_PASS();
+}
+
 // Test runner function pointer type
 typedef int (*TestFunction)(void);

@@ -783,6 +904,8 @@ TestCase test_cases[] = {
  {"large-config", test_large_config, "Large configuration files (dynamic allocation)"},
  {"stress-config", test_very_large_config_stress, "Very large configuration stress test"},
  {"empty-config", test_empty_config, "Empty configuration file handling"},
+  {"nvl-domain", test_nvl_domain_info, "NVL domain info handling"},
+  {"constants", test_tuner_constants, "Tuner constants initialization"},
  {NULL, NULL, NULL} // End marker
 };

@@ -826,6 +949,7 @@ int main(int argc, char* argv[]) {
  if (argc == 1) {
    // No arguments - run all tests
    for (int i = 0; test_cases[i].name != NULL; i++) {
+      printf("Running test: %s\n", test_cases[i].name);
      total++;
      passed += test_cases[i].func();
    }
@@ -26,7 +26,7 @@ install_dependencies=false
 install_library=false
 install_prefix="${ROCM_PATH}"
 log_trace=false
-msccl_kernel_enabled=true
+msccl_kernel_enabled=false
 mscclpp_enabled=false
 enable_mscclpp_clip=false
 num_parallel_jobs=$(nproc)
@@ -39,7 +39,9 @@ run_tests_all=false
 time_trace=false
 force_reduce_pipeline=false
 generate_sym_kernels=false
+warp_speed_enabled=true # note that this flag will be overridden to false for non MI350/MI300 platforms
 quiet_warnings=false
+build_rocshmem_support=false

 # #################################################
 # helper functions
@@ -54,7 +56,7 @@ function display_help()
    echo "       --debug                 Build debug library"
    echo "       --enable_backtrace      Build with custom backtrace support"
    echo "       --disable-colltrace     Build without collective trace"
-    echo "       --disable-msccl-kernel  Build without MSCCL kernels"
+    echo "       --enable-msccl-kernel   Build with MSCCL kernels"
    echo "       --dump-asm              Disassemble code and dump assembly with inline code"
    echo "       --enable-mscclpp        Build with MSCCL++ support"
    echo "       --enable-mscclpp-clip   Build MSCCL++ with clip wrapper on bfloat16 and half addition routines"
@@ -81,6 +83,7 @@ function display_help()
    echo "       --force-reduce-pipeline Force reduce_copy sw pipeline to be used for every reduce-based collectives and datatypes"
    echo "       --generate-sym-kernels  Generate symmetric memory kernels"
    echo "    -q|--quiet-warnings        Suppress majority of compiler warnings (not recommended)"
+    echo "       --rocshmem              Build with rocSHMEM support"
 }

 # #################################################
@@ -90,7 +93,7 @@ function display_help()
 # check if we have a modern version of getopt that can handle whitespace and long parameters
 getopt -T
 if [[ "$?" -eq 4 ]]; then
-    GETOPT_PARSE=$(getopt --name "${0}" --options cdfhij:lprtq --longoptions address-sanitizer,dependencies,debug,dump-asm,enable-code-coverage,enable_backtrace,disable-colltrace,disable-msccl-kernel,enable-mscclpp,fast,help,install,jobs:,kernel-resource-use,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,log-trace,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,force-reduce-pipeline,generate-sym-kernels,quiet-warnings,verbose -- "$@")
+    GETOPT_PARSE=$(getopt --name "${0}" --options cdfhij:lprtq --longoptions address-sanitizer,dependencies,debug,dump-asm,enable-code-coverage,enable_backtrace,disable-colltrace,disable-msccl-kernel,enable-mscclpp,fast,help,install,jobs:,kernel-resource-use,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,log-trace,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,force-reduce-pipeline,generate-sym-kernels,quiet-warnings,disable-warp-speed,verbose,rocshmem -- "$@")
 else
    echo "Need a new version of getopt"
    exit 1
@@ -137,7 +140,9 @@ while true; do
         --verbose)                  build_verbose=true;                                                                               shift ;;
         --force-reduce-pipeline)    force_reduce_pipeline=true;                                                                       shift ;;
         --generate-sym-kernels)     generate_sym_kernels=true;                                                                        shift ;;
+         --disable-warp-speed)       warp_speed_enabled=false;                                                                         shift ;;
    -q | --quiet-warnings)           quiet_warnings=true;                                                                              shift ;;
+         --rocshmem)                 build_rocshmem_support=true;                                                                      shift ;;
    --) shift ; break ;;
    *)  echo "Unexpected command line parameter received; aborting";
        exit 1
@@ -316,12 +321,25 @@ if [[ "${npkit_enabled}" == true ]]; then
    cmake_common_options="${cmake_common_options} -DENABLE_NPKIT=ON"
 fi

+# Enable WARP_SPEED unless disabled (--disable-warp-speed, or overridden for non MI350/MI300 platforms)
+if [[ "${warp_speed_enabled}" == true ]]; then
+    cmake_common_options="${cmake_common_options} -DENABLE_WARP_SPEED=ON"
+fi
+
 # Suppress Warnings
 if [[ "${quiet_warnings}" == true ]]; then
    cmake_common_options="${cmake_common_options} -DQUIET_WARNINGS=ON"
 fi


+# Enable rocSHMEM support
+if [[ "${build_rocshmem_support}" == true ]]; then
+    cmake_common_options="${cmake_common_options} -DENABLE_ROCSHMEM=ON"
+    cmake_common_options="${cmake_common_options} -DROCSHMEM_INSTALL_DIR=${ROCSHMEM_INSTALL_DIR}"
+else
+    cmake_common_options="${cmake_common_options} -DENABLE_ROCSHMEM=OFF"
+fi
+
 check_exit_code "$?"

 # Enable ninja build for time tracing
@@ -32,13 +32,8 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)

 # You should define NVCC_GENCODE in your environment to the minimal set
 # of archs to reduce compile time.
-CUDA8_GENCODE = -gencode=arch=compute_50,code=sm_50 \
-                -gencode=arch=compute_60,code=sm_60 \
+CUDA8_GENCODE = -gencode=arch=compute_60,code=sm_60 \
                -gencode=arch=compute_61,code=sm_61
-ifeq ($(shell test "0$(CUDA_MAJOR)" -lt 12; echo $$?),0)
-# SM35 is deprecated from CUDA12.0 onwards
-CUDA8_GENCODE += -gencode=arch=compute_35,code=sm_35
-endif
 CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
 CUDA10_GENCODE = -gencode=arch=compute_75,code=sm_75
 CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
-NCCL_MINOR   := 27
-NCCL_PATCH   := 7
+NCCL_MINOR   := 28
+NCCL_PATCH   := 3
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
@@ -10,7 +10,7 @@ build : debian.build txz.build

 BUILDDIR ?= $(abspath ../build)
 ABSBUILDDIR := $(abspath $(BUILDDIR))
-TARGETS := debian txz
+TARGETS := debian txz doc
 all:   ${TARGETS:%=%.build}
 prep:  ${TARGETS:%=%.prep}
 build: ${TARGETS:%=%.build}
@@ -1,4 +1,4 @@
 bin/ncclras /usr/bin
-include/nccl.h /usr/include
+include/* /usr/include
 lib/libnccl.so /usr/lib/${pkg:MultiArch}
 lib/libnccl_static.a /usr/lib/${pkg:MultiArch}
@@ -47,8 +47,8 @@ ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_li
 # devel
 install -m 755 -d $RPM_BUILD_ROOT/%{_bindir}
 install -m 755 -d $RPM_BUILD_ROOT/%{_includedir}
+cp -a include/* $RPM_BUILD_ROOT/%{_includedir}/
 install -m 755 bin/ncclras $RPM_BUILD_ROOT/%{_bindir}
-install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir}
 ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so

 # static
@@ -67,7 +67,7 @@ rm -rf $RPM_BUILD_ROOT
 %doc LICENSE.txt
 %defattr(-,root,root,-)
 %{_bindir}/ncclras
-%{_includedir}/nccl.h
+%{_includedir}/*
 %{_libdir}/libnccl.so

 %files static
@@ -22,7 +22,7 @@ prep: $(TXZTARGETS)
 build: prep
 	$(MAKE) -C ../../src clean
 	@printf "Building source tar.xz package\n"
-	(cd $(BUILDDIR); bash srctxz/create_srctxz.sh)
+	(cd $(BUILDDIR); SRCTXZ_APITESTS=$(SRCTXZ_APITESTS) bash srctxz/create_srctxz.sh)
 	mkdir -p $(PKGDIR)
 	mv $(BUILDDIR)/../../nccl-src*.txz $(PKGDIR)

@@ -28,8 +28,34 @@ NCCL_SUFFIX=${nccl:Suffix}
 NCCL_BUILD=${pkg:Revision}

 NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}"
+if [ "${SRCTXZ_APITESTS}" = "1" ]; then
+  NCCLNAME+="-apitest"
+fi

-tar --exclude build \
+
+INCLUDE_TEST_ENTRIES=("apitest" "googletest" "gtest.mk")
+
+if [ "${SRCTXZ_APITESTS}" = "1" ]; then
+  # Exclude all entries inside test folder except those in INCLUDE_TEST_ENTRIES
+  for entry in $(ls $NCCLDIR/test); do
+    if [[ ! " ${INCLUDE_TEST_ENTRIES[@]} " =~ " $entry " ]]; then
+      EXCLUDE_TEST+=" --exclude $NCCLDIR/test/$entry"
+    fi
+  done
+else
+  # Exclude the entire test directory
+  EXCLUDE_TEST+=" --exclude test"
+fi
+
+tar --exclude fortran \
+    --exclude doc \
+    --exclude plc \
+    --exclude build \
    --exclude ".git*" \
+    --exclude share \
+    --exclude ompi \
+    --exclude ext-net \
    --exclude pkg/srctxz \
+    --exclude docker \
+    $EXCLUDE_TEST \
    --transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $NCCLDIR
@@ -0,0 +1,180 @@
+# Source files
+set(LIBSRCFILES
+    bootstrap.cc
+    channel.cc
+    ce_coll.cc
+    collectives.cc
+    debug.cc
+    enqueue.cc
+    group.cc
+    init.cc
+    init_nvtx.cc
+    proxy.cc
+    transport.cc
+    mnnvl.cc
+    allocator.cc
+    sym_kernels.cc
+    dev_runtime.cc
+)
+
+# Add compatibility shim if using static cudart
+if(CUDARTLIB STREQUAL "cudart_static")
+    list(APPEND LIBSRCFILES enhcompat.cc)
+endif()
+
+# Configure pkg-config file
+configure_file(
+    ${CMAKE_CURRENT_SOURCE_DIR}/nccl.pc.in
+    ${CMAKE_BINARY_DIR}/lib/pkgconfig/nccl.pc
+    @ONLY
+)
+
+# Add files from subdirectories
+add_subdirectory(transport)
+add_subdirectory(misc)
+add_subdirectory(register)
+add_subdirectory(graph)
+add_subdirectory(plugin)
+add_subdirectory(device)
+add_subdirectory(nccl_device)
+add_subdirectory(ras)
+add_subdirectory(scheduler)
+
+add_compile_options(-fmacro-prefix-map=${CMAKE_CURRENT_SOURCE_DIR}/=)
+
+# Add all source files
+list(APPEND LIBSRCFILES
+    ${TRANSPORT_SOURCES}
+    ${MISC_SOURCES}
+    ${REGISTER_SOURCES}
+    ${GRAPH_SOURCES}
+    ${PLUGIN_SOURCES}
+    ${RAS_SOURCES}
+    ${SYM_SOURCES}
+    ${SCHEDULER_SOURCES}
+)
+
+###################### Create a shared NCCL library ############################
+add_library(nccl SHARED)
+
+target_sources(nccl PRIVATE ${LIBSRCFILES})
+
+# Include directories
+target_include_directories(nccl PUBLIC
+    ${CMAKE_CURRENT_SOURCE_DIR}/device
+    ${CMAKE_CURRENT_SOURCE_DIR}/include
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/plugin
+    ${CUDAToolkit_INCLUDE_DIRS}
+    ${CUDAToolkit_INCLUDE_DIRS}/cccl
+)
+
+add_custom_command(
+    OUTPUT ${CMAKE_BINARY_DIR}/include/nccl.h
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/include
+    COMMAND sed -e "s/\\\$$\\{nccl:Major\\}/${NCCL_MAJOR}/g"
+                -e "s/\\\$$\\{nccl:Minor\\}/${NCCL_MINOR}/g"
+                -e "s/\\\$$\\{nccl:Patch\\}/${NCCL_PATCH}/g"
+                -e "s/\\\$$\\{nccl:Suffix\\}/${NCCL_SUFFIX}/g"
+                -e "s/\\\$$\\{nccl:Version\\}/${NCCL_VERSION_CODE}/g"
+                ${CMAKE_CURRENT_SOURCE_DIR}/nccl.h.in > ${CMAKE_BINARY_DIR}/include/nccl.h
+    BYPRODUCTS ${CMAKE_BINARY_DIR}/include/nccl.h
+)
+
+add_custom_target(nccl_header DEPENDS ${CMAKE_BINARY_DIR}/include/nccl.h)
+
+add_dependencies(nccl nccl_header)
+
+# Set version and output name
+set_target_properties(nccl PROPERTIES
+    VERSION ${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}
+    SOVERSION ${NCCL_MAJOR}
+    OUTPUT_NAME "nccl"
+    PREFIX "lib"
+)
+
+# Set CUDA specific flags
+set_target_properties(nccl PROPERTIES
+    CUDA_SEPARABLE_COMPILATION ON
+    CUDA_RESOLVE_DEVICE_SYMBOLS ON
+    CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}"
+    POSITION_INDEPENDENT_CODE ON
+)
+
+# Link libraries
+target_link_libraries(nccl
+    PRIVATE
+    nccl_device
+    pthread
+    rt
+    dl
+    ${CUDAToolkit_LIBRARIES}
+    ${EXTRA_LIBS}
+)
+
+# Set output directories for nccl shared library
+set_target_properties(nccl PROPERTIES
+    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
+)
+
+###################### Create a ras binary executable ############################
+set(RAS_BINSRCFILES ras/client.cc)
+
+add_executable(ncclras ${RAS_BINSRCFILES})
+
+target_include_directories(ncclras PUBLIC
+    ${CMAKE_BINARY_DIR}/include
+    ${CUDAToolkit_INCLUDE_DIRS}
+)
+
+add_dependencies(ncclras nccl_header)
+
+target_link_libraries(ncclras
+    PRIVATE
+    pthread
+    rt
+    dl
+)
+
+# Set output directory for ncclras executable
+set_target_properties(ncclras PROPERTIES
+    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
+)
+
+###################### Create a static NCCL library ############################
+add_library(nccl_static STATIC ${LIBSRCFILES})
+
+# Include directories
+target_include_directories(nccl_static PUBLIC
+    ${CMAKE_CURRENT_SOURCE_DIR}/device
+    ${CMAKE_CURRENT_SOURCE_DIR}/include
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/plugin
+    ${CUDAToolkit_INCLUDE_DIRS}
+    ${CUDAToolkit_INCLUDE_DIRS}/cccl
+)
+
+# Add dependency on nccl_header
+add_dependencies(nccl_static nccl_header)
+
+# Link libraries
+target_link_libraries(nccl_static
+    PRIVATE
+    nccl_device
+    pthread
+    rt
+    dl
+    ${CUDAToolkit_LIBRARIES}
+    ${EXTRA_LIBS}
+)
+
+# Set CUDA specific flags
+set_target_properties(nccl_static PROPERTIES
+    CUDA_SEPARABLE_COMPILATION ON
+    CUDA_RESOLVE_DEVICE_SYMBOLS ON
+    CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}"
+    POSITION_INDEPENDENT_CODE ON
+)
+
+# Set output directory for nccl_static library
+set_target_properties(nccl_static PROPERTIES
+    ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
+)
@@ -7,10 +7,12 @@ include ../makefiles/common.mk
 include ../makefiles/version.mk

 ##### src files
-INCEXPORTS  := nccl.h
+INCEXPORTS  := nccl.h nccl_device.h \
+	$(patsubst include/%,%,$(wildcard include/nccl_device/*.h include/nccl_device/impl/*.h))
+
 LIBSRCFILES := \
 	bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
-	init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc allocator.cc symmetric.cc \
+	init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc allocator.cc dev_runtime.cc sym_kernels.cc ce_coll.cc \
 	$(wildcard graph/*.cc) \
 	$(wildcard misc/*.cc) \
 	$(wildcard transport/*.cc) \
@@ -19,6 +21,8 @@ LIBSRCFILES := \
 	$(wildcard plugin/net/*.cc) \
 	$(wildcard plugin/tuner/*.cc) \
 	$(wildcard plugin/profiler/*.cc) \
+	$(wildcard nccl_device/*.cc) \
+	$(wildcard scheduler/*.cc) \
 	$(filter-out ras/client.cc,$(wildcard ras/*.cc))
 BINSRCFILES := ras/client.cc

@@ -123,6 +127,16 @@ $(INCDIR)/nccl_%.h : include/nccl_%.h
 	mkdir -p $(INCDIR)
 	install -m 644 $< $@

+$(INCDIR)/nccl_device/%.h: include/nccl_device/%.h
+	@printf "Grabbing   %-35s > %s\n" $< $@
+	mkdir -p $(INCDIR)/nccl_device
+	install -m 644 $< $@
+
+$(INCDIR)/nccl_device/impl/%.h: include/nccl_device/impl/%.h
+	@printf "Grabbing   %-35s > %s\n" $< $@
+	mkdir -p $(INCDIR)/nccl_device/impl
+	install -m 644 $< $@
+
 $(PKGDIR)/%.pc : %.pc
 	@printf "Grabbing   %-35s > %s\n" $< $@
 	mkdir -p $(PKGDIR)
@@ -149,7 +163,7 @@ install : build
 	mkdir -p $(PREFIX)/bin
 	cp -P -v $(BUILDDIR)/lib/lib* $(PREFIX)/lib/
 	cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/
-	cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
+	cp -v -r $(BUILDDIR)/include/* $(PREFIX)/include/
 	cp -v $(BUILDDIR)/bin/ncclras $(PREFIX)/bin/

 FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|gdrwrap.h|nccl.h')
@@ -7,10 +7,11 @@
 #include "comm.h"
 #include "transport.h"
 #include "group.h"
+#include "nvtx.h"

 NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size);
 ncclResult_t  ncclMemAlloc_impl(void **ptr, size_t size) {
-  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  NCCL_NVTX3_FUNC_RANGE;
  ncclResult_t ret = ncclSuccess;

 #if ROCM_VERSION >= 70000
@@ -99,7 +100,7 @@ fail:

 NCCL_API(ncclResult_t, ncclMemFree, void *ptr);
 ncclResult_t  ncclMemFree_impl(void *ptr) {
-  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  NCCL_NVTX3_FUNC_RANGE;
  ncclResult_t ret = ncclSuccess;
  int saveDevice;

@@ -129,70 +130,339 @@ fail:
  goto exit;
 }

-// This is a collective function and should be called by all ranks in the communicator
-ncclResult_t ncclCommSymmetricAllocInternal(struct ncclComm* comm, size_t size, size_t alignment, void** symPtr) {
-  ncclResult_t ret = ncclSuccess;
-  void* regSymAddr = NULL;
-  size_t allocSize = size;
-  size_t granularity;
-  CUdevice cuDev;
-  CUmemAllocationProp memprop = {};
-  CUmemGenericAllocationHandle memHandle;
-  int bit = 0, cnt = 0;
+////////////////////////////////////////////////////////////////////////////////
+// ncclSpace:
+//
+// This datastructure "cuts" the line of non-negative integers into segments
+// which alternate between "full" (allocated) and "empty" (not allocated). The
+// cuts are sorted ascending. The segment after the last cut must be empty
+// (the unallocated frontier). Knowing this we can deduce whether the segment
+// ending at cut[i] is full or empty with this formula:
+//   isFull(i) = (i%2 != ncuts%2)

-  // aligment must be power of 2 as an input
-  while (bit < sizeof(size_t) * 8) {
-    if (alignment & (1L << bit)) cnt++;
-    if (cnt == 2) {
-      WARN("rank %d alignment %ld is not power of 2", comm->rank, alignment);
-      goto fail;
+// Initializes `a` by zeroing every byte of the struct, which leaves it with
+// no cuts and no backing array.
+void ncclSpaceConstruct(struct ncclSpace* a) {
+  memset(a, 0, sizeof(*a));
+}
+
+// Releases the heap array that stores the cuts. The remaining fields of `a`
+// are left untouched, so `a` must be re-constructed before it is used again.
+void ncclSpaceDestruct(struct ncclSpace* a) {
+  free(a->cuts);
+}
+
+// Inserts the two cuts `lo` and `hi` into `a->cuts[]` before position `index`
+// (moving to a larger array when the current one cannot hold two more
+// entries), then compacts the array from `index` onward so that repeated cut
+// values are removed and the segments they separated fuse together.
+// NOTE(review): the result of malloc() below is used without a null check.
+static void insertSegment(struct ncclSpace* a, int index, int64_t lo, int64_t hi) {
+  // Insert space for two cuts in `a->cuts[]` before `index`.
+  if (a->count + 2 > a->capacity) {
+    a->capacity *= 2;
+    if (a->capacity == 0) a->capacity = 16;
+    int64_t* cuts1 = (int64_t*)malloc(a->capacity*sizeof(int64_t));
+    for (int i=0; i < index; i++) cuts1[i] = a->cuts[i];
+    for (int i=index; i < a->count; i++) cuts1[i+2] = a->cuts[i];
+    free(a->cuts);
+    a->cuts = cuts1;
+  } else {
+    for (int i=a->count-1; index <= i; i--) a->cuts[i+2] = a->cuts[i];
+  }
+  a->cuts[index+0] = lo;
+  a->cuts[index+1] = hi;
+  a->count += 2;
+
+  // Filter pairs of adjacent repeated values from cuts[]. Since these mark
+  // boundaries where segments transition between full<->empty, dropping such a
+  // pair fuses two adjacent segments together. Examples:
+  //   [1,2,3,3,4] -> [1,2,4]
+  //   [1,2,3,3,3,4] -> [1,2,3,4] // have to leave one 3 because its a full<->empty transition
+  //   [1,2,3,3,3,3,4] -> [1,2,4]
+  // Leading zeros don't have to be in pairs, they are always dropped:
+  //   [0,1,2] -> [1,2]
+  //   [0,0,1,2] -> [1,2]
+  int r = index, w = index; // Read and write cursors.
+  int64_t prev = r==0 ? 0 : a->cuts[r-1];
+  while (r < a->count) {
+    int64_t cur = a->cuts[r++];
+    a->cuts[w++] = cur;
+    if (prev == cur) { // Repeated value is an empty segment which can be deleted.
+      // Erase last two cuts or just one if we're at the start.
+      w -= w==1 ? 1 : 2;
+      // Zeros can only occur at the beginning (due to being sorted). We want to
+      // drop any number of zeros, but only even numbers of other repeated values.
+      // So set to zero here, which will make prev=0, thus if next value is zero
+      // it will be dropped but if its not zero then it will need to begin a new
+      // pair to be dropped.
+      cur = 0;
     }
-    bit++;
+    prev = cur;
   }
-  // temporarily align the alignment to NCCL_REC_PAGE_SIZE
-  ALIGN_SIZE(alignment, NCCL_REC_PAGE_SIZE);
-
-  CUCHECKGOTO(cuDeviceGet(&cuDev, comm->cudaDev), ret, fail);
-  memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
-  memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-  memprop.requestedHandleType = ncclCuMemHandleType;
-  memprop.location.id = cuDev;
-  CUCHECKGOTO(cuMemGetAllocationGranularity(&granularity, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail);
-  ALIGN_SIZE(allocSize, granularity);
-
-  CUCHECKGOTO(cuMemCreate(&memHandle, allocSize, &memprop, 0), ret, fail);
-  ALIGN_SIZE(comm->symAllocHead, alignment);
-  NCCLCHECKGOTO(ncclIpcSymmetricMap(comm, comm->symAllocHead, allocSize, memHandle, &regSymAddr), ret, fail);
-  NCCLCHECKGOTO(ncclNvlsSymmetricMap(comm, comm->symAllocHead, allocSize, regSymAddr), ret, fail);
-  NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
-  comm->symAllocHead += allocSize;
-  *symPtr = regSymAddr;
-
-exit:
-  return ret;
-fail:
-  *symPtr = NULL;
-  goto exit;
+  a->count = w;
 }

-ncclResult_t ncclCommSymmetricFreeInternal(struct ncclComm* comm, void* symPtr) {
-  CUmemGenericAllocationHandle handle;
-  size_t size = 0;
-  ncclResult_t ret = ncclSuccess;
-  int saveDev = comm->cudaDev;
-  CUDACHECKGOTO(cudaGetDevice(&saveDev), ret, fail);
-  if (ncclCuMemEnable()) {
-    CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
-    CUCHECKGOTO(cuMemRetainAllocationHandle(&handle, symPtr), ret, fail);
-    CUCHECKGOTO(cuMemRelease(handle), ret, fail);
-    CUCHECKGOTO(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)symPtr), ret, fail);
-    NCCLCHECKGOTO(ncclNvlsSymmetricFree(comm, size, symPtr), ret, fail);
-    NCCLCHECKGOTO(ncclIpcSymmetricFree(comm, size, symPtr), ret, fail);
-    CUCHECKGOTO(cuMemRelease(handle), ret, fail);
+// Reserves `size` units in the first empty segment that can hold them at an
+// offset aligned to `align`. The final (frontier) segment is bounded by
+// `limit`. On success the chosen offset is stored in `*outOffset`; when no
+// segment fits, a WARN is emitted and ncclInternalError is returned.
+ncclResult_t ncclSpaceAlloc(
+    struct ncclSpace* a, int64_t limit, int64_t size, int align,
+    int64_t* outOffset
+  ) {
+  // When allocating we try to locate the first empty segment which can hold
+  // the allocation and move its lower cut upward.
+  int i = a->count%2; // First empty segment ends at cuts[i]
+  size_t off;
+  while (i <= a->count) {
+    size_t lo = i == 0 ? 0 : a->cuts[i-1];
+    size_t hi = i == a->count ? limit : a->cuts[i];
+    off = alignUp(lo, align);
+    if (off + size <= hi) {
+      *outOffset = off;
+      if (i == 0 || off + size == hi) { // Slow path required.
+        insertSegment(a, i, off, off+size);
+      } else { // We can just append to the end of a full segment.
+        // If alignment pushed `off` above `lo`, the padding [lo, off) is
+        // absorbed into the preceding full segment along with the allocation.
+        a->cuts[i-1] = off + size;
+      }
+      return ncclSuccess;
+    }
+    i += 2; // Next empty segment
   }
-exit:
-  CUDACHECK(cudaSetDevice(saveDev));
-  return ret;
-fail:
-  goto exit;
+  WARN("Allocation failed. No suitable space found to accommodate size=0x%lx within limit=0x%lx", (long)size, (long)limit);
+  return ncclInternalError;
+}
+
+// Marks the range [offset, offset+size) as empty again. The range must lie
+// entirely inside one full segment; otherwise a WARN is emitted and
+// ncclInternalError is returned without modifying `a`.
+ncclResult_t ncclSpaceFree(struct ncclSpace* a, int64_t offset, int64_t size) {
+  // Nothing is allocated at or above the last cut, so such an offset is invalid.
+  if (a->count == 0 || a->cuts[a->count-1] <= offset) {
+    WARN("No allocation found at offset=0x%lx", (long)offset);
+    return ncclInternalError;
+  }
+
+  // This could be binary search, but since allocate is linear there's no point.
+  int i = 1 - a->count%2; // First full segment ends at cuts[i]
+  // The guard above ensures the last cut exceeds `offset`, so this loop stops.
+  while (a->cuts[i] <= offset) i += 2;
+
+  // [lo, hi) is the first full segment whose upper cut lies above `offset`.
+  int64_t lo = i==0 ? 0 : a->cuts[i-1];
+  int64_t hi = a->cuts[i];
+
+  if (offset < lo || hi < offset + size) {
+    WARN("Given size=0x%lx extends beyond allocation.", (long)size);
+    return ncclInternalError;
+  }
+
+  // First try the two fast cases which just shrink a segment from one side.
+  if (i != 0 && lo == offset && offset + size != hi) {
+    a->cuts[i-1] = offset + size; // Bring bottom up.
+  } else if (lo != offset && offset + size == hi) {
+    a->cuts[i] = offset; // Bring top down.
+  } else { // Slow path.
+    // Covers freeing a whole segment, punching a hole in the middle of one,
+    // and freeing the bottom of a segment that starts at 0 (i == 0).
+    insertSegment(a, i, offset, offset+size);
+  }
+  return ncclSuccess;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// ncclShadowPool:
+
+struct ncclShadowPage { // A contiguous block of (at most) 64 objects
+  struct ncclShadowPage* next; // Next page in the pool's singly linked page list.
+  int objSize; // Size in bytes of each slot in this page.
+  uint64_t freeMask; // Bit i set means slot i is free; 0 means every slot is in use.
+  void* devObjs; // Device allocation that backs all slots of this page.
+};
+// Bookkeeping record for one allocation, stored in the pool's hash table and
+// keyed by its device pointer.
+struct ncclShadowObject {
+  struct ncclShadowObject* next; // Next object in the same hash bucket.
+  void* devObj; // Device pointer handed to the caller; the hash key.
+  void* hostObj; // Host-side shadow buffer paired with devObj.
+  struct ncclShadowPage* page; // null if not allocated in page but directly in CUDA mempool.
+};
+
+// Puts `pool` into its empty state. hbits == 0 doubles as the "not yet
+// initialized" marker: the memory pool and hash table are only created by the
+// first non-empty ncclShadowPoolAlloc() call.
+void ncclShadowPoolConstruct(struct ncclShadowPool* pool) {
+  pool->hbits = 0;
+  pool->count = 0;
+  pool->table = nullptr;
+  pool->pages = nullptr;
+}
+
+// Releases everything owned by `pool`: the host bookkeeping objects, the hash
+// table, all pages and directly allocated device objects, and finally the
+// memory pool itself. Does nothing when the pool was never initialized
+// (hbits == 0). Device frees are issued on a temporary stream that is
+// synchronized before the memory pool is destroyed.
+// NOTE(review): the results of cudaFreeAsync, cudaStreamSynchronize,
+// cudaStreamDestroy and cudaMemPoolDestroy are not checked here.
+ncclResult_t ncclShadowPoolDestruct(struct ncclShadowPool* pool) {
+  if (pool->hbits != 0) {
+    cudaStream_t stream;
+    CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+
+    if (pool->count != 0) {
+      for (int i=0; i < 1<<pool->hbits; i++) {
+        struct ncclShadowObject* obj = pool->table[i];
+        while (obj != nullptr) {
+          struct ncclShadowPage* page = obj->page;
+          if (page != nullptr) {
+            // Full pages were unlinked from pool->pages when their last slot
+            // was handed out. Relink them so the page sweep below frees them;
+            // the non-zero freeMask keeps a page from being relinked twice.
+            if (page->freeMask == 0) { // Put full pages back into page list.
+              page->freeMask = 1;
+              page->next = pool->pages;
+              pool->pages = page;
+            }
+          } else {
+            cudaFreeAsync(obj->devObj, stream);
+          }
+          struct ncclShadowObject* next = obj->next;
+          free(obj);
+          obj = next;
+        }
+      }
+    }
+    free(pool->table);
+
+    // Free every page's device memory together with its host descriptor.
+    while (pool->pages != nullptr) {
+      cudaFreeAsync(pool->pages->devObjs, stream);
+      struct ncclShadowPage* next = pool->pages->next;
+      free(pool->pages);
+      pool->pages = next;
+    }
+
+    cudaStreamSynchronize(stream);
+    cudaStreamDestroy(stream);
+    cudaMemPoolDestroy(pool->memPool);
+  }
+  return ncclSuccess;
+}
+
+// Maps a device pointer to a bucket index in [0, 2^hbits): the upper 32 bits
+// of the address are folded into the lower ones, the result is multiplied by a
+// fixed odd 64-bit constant, and the top `hbits` bits of the product are kept.
+static int hashBucket(int hbits, void* devObj) {
+  uintptr_t key = reinterpret_cast<uintptr_t>(devObj);
+  key = (key ^ (key>>32)) * 0x9e3779b97f4a7c13;
+  const int dropBits = 64-hbits;
+  return (uint64_t)key >> dropBits;
+}
+
+// Pushes `obj` onto the front of the hash bucket selected by its device pointer.
+static void hashInsert(struct ncclShadowPool* pool, struct ncclShadowObject* obj) {
+  struct ncclShadowObject** head = &pool->table[hashBucket(pool->hbits, obj->devObj)];
+  obj->next = *head;
+  *head = obj;
+}
+
+// Allocates a zero-filled device object of `size` bytes together with a
+// zero-filled host shadow of the same size, and records the pair in the pool's
+// hash table keyed by the device pointer.
+//
+//   pool       - pool to allocate from; its memory pool and hash table are
+//                created lazily on the first non-empty allocation.
+//   size       - object size in bytes; 0 yields null outputs and ncclSuccess.
+//   outDevObj  - optional; receives the device pointer.
+//   outHostObj - optional; receives the host shadow pointer.
+//   stream     - stream on which device allocation and zeroing are enqueued.
+//
+// Small objects (at least three fit in 64 KiB) are carved out of pages of up
+// to 64 equally sized slots; larger objects are allocated directly from the
+// memory pool. Returns the error of the first failing CUDA call.
+ncclResult_t ncclShadowPoolAlloc(
+    struct ncclShadowPool* pool, size_t size, void** outDevObj, void** outHostObj,
+    cudaStream_t stream
+  ) {
+  if (size == 0) {
+    if (outDevObj) *outDevObj = nullptr;
+    if (outHostObj) *outHostObj = nullptr;
+    return ncclSuccess;
+  }
+
+  int hbits = pool->hbits;
+  if (hbits == 0) {
+    // First use: create the device memory pool and a 16-bucket hash table.
+    cudaMemPoolProps props = {};
+    props.allocType = cudaMemAllocationTypePinned;
+    props.handleTypes = cudaMemHandleTypeNone;
+    props.location.type = cudaMemLocationTypeDevice;
+    cudaGetDevice(&props.location.id);
+    CUDACHECK(cudaMemPoolCreate(&pool->memPool, &props));
+
+    pool->hbits = hbits = 4;
+    pool->table = (struct ncclShadowObject**)malloc(sizeof(struct ncclShadowObject*)<<hbits);
+    for (int i=0; i < 1<<hbits; i++) pool->table[i] = nullptr;
+  }
+
+  // Check for hash table size increase before inserting. Maintain 2:1 object:bucket ratio.
+  if (pool->count+1 > 2<<hbits) {
+    struct ncclShadowObject** table0 = pool->table;
+    struct ncclShadowObject** table1 = (struct ncclShadowObject**)malloc(sizeof(struct ncclShadowObject*)<<(hbits+1));
+    pool->table = table1;
+    pool->hbits = hbits+1;
+    for (int i1=0; i1 < 2<<hbits; i1++) table1[i1] = nullptr;
+    for (int i0=0; i0 < 1<<hbits; i0++) {
+      struct ncclShadowObject* obj = table0[i0];
+      while (obj) {
+        struct ncclShadowObject* next = obj->next;
+        hashInsert(pool, obj);
+        obj = next;
+      }
+    }
+    hbits += 1; // match pool->hbits
+    free(table0);
+  }
+
+  struct ncclShadowPage* page;
+  void *devObj;
+  if ((64<<10)/size >= 3) {
+    // Round the slot size up to a multiple of 2^shift so that objects of
+    // similar sizes can share pages.
+    int shift = std::max<int>(0, (int)log2Down(size) + 1 - 4);
+    int pageObjSize = ((size + (1<<shift)-1)>>shift)<<shift;
+    struct ncclShadowPage** pagePtr = &pool->pages;
+    while (true) {
+      page = *pagePtr;
+      if (page == nullptr) {
+        size_t pageSize = std::min<size_t>(64<<10, 64*pageObjSize);
+        // Obtain the device memory before the page is linked into the pool, so
+        // a failed allocation cannot leave a page with an uninitialized
+        // devObjs pointer on pool->pages (it would later be handed out as
+        // object storage or passed to cudaFreeAsync).
+        void* pageDevObjs = nullptr;
+        CUDACHECK(cudaMallocFromPoolAsync(&pageDevObjs, pageSize, pool->memPool, stream));
+        page = (struct ncclShadowPage*)malloc(sizeof(struct ncclShadowPage));
+        page->objSize = pageObjSize;
+        page->freeMask = uint64_t(-1)>>(64 - pageSize/pageObjSize);
+        page->devObjs = pageDevObjs;
+        page->next = pool->pages;
+        pool->pages = page;
+        CUDACHECK(cudaMemsetAsync(page->devObjs, 0, pageSize, stream));
+        // fall through...
+      }
+      if (page->objSize == pageObjSize) {
+        int slot = popFirstOneBit(&page->freeMask);
+        devObj = (char*)page->devObjs + slot*pageObjSize;
+        if (page->freeMask == 0) *pagePtr = page->next; // Remove full page from list.
+        break;
+      }
+      pagePtr = &page->next;
+    }
+  } else {
+    page = nullptr;
+    CUDACHECK(cudaMallocFromPoolAsync(&devObj, size, pool->memPool, stream));
+    CUDACHECK(cudaMemsetAsync(devObj, 0, size, stream));
+  }
+
+  // The host shadow lives in the same malloc block as the bookkeeping struct,
+  // placed after it at the next max_align_t-aligned address.
+  struct ncclShadowObject* obj = (struct ncclShadowObject*)malloc(
+    sizeof(struct ncclShadowObject) + /*padding=*/alignof(max_align_t)-1 + size
+  );
+  obj->page = page;
+  obj->devObj = devObj;
+  obj->hostObj = alignUp((char*)(obj+1), alignof(max_align_t));
+  memset(obj->hostObj, 0, size);
+  hashInsert(pool, obj);
+  pool->count += 1;
+  if (outDevObj) *outDevObj = devObj;
+  if (outHostObj) *outHostObj = obj->hostObj;
+  return ncclSuccess;
+}
+
+// Releases the object identified by `devObj`: removes it from the hash table,
+// returns its slot to the owning page (or frees the device memory on `stream`
+// when it was allocated directly from the memory pool), and frees the host
+// bookkeeping block, which also holds the host shadow. A null `devObj` is a
+// no-op. Returns ncclInternalError if `devObj` is not in the pool.
+ncclResult_t ncclShadowPoolFree(struct ncclShadowPool* pool, void* devObj, cudaStream_t stream) {
+  if (devObj == nullptr) return ncclSuccess;
+
+  int b = hashBucket(pool->hbits, devObj);
+  // Walk the bucket chain through a pointer-to-link so the match can be unlinked in place.
+  struct ncclShadowObject** pobj = &pool->table[b];
+  while (true) {
+    if (*pobj == nullptr) {
+      WARN("Device object does not exist in shadow pool.");
+      return ncclInternalError;
+    }
+    if ((*pobj)->devObj == devObj) break;
+    pobj = &(*pobj)->next;
+  }
+  struct ncclShadowObject* obj = *pobj;
+  *pobj = obj->next;
+  if (obj->page != nullptr) {
+    // A full page was removed from pool->pages by the allocator; relink it
+    // now that it is about to have a free slot again.
+    if (obj->page->freeMask == 0) {
+      obj->page->next = pool->pages;
+      pool->pages = obj->page;
+    }
+    int slot = ((char*)obj->devObj - (char*)obj->page->devObjs)/obj->page->objSize;
+    obj->page->freeMask |= uint64_t(1)<<slot;
+  } else {
+    CUDACHECK(cudaFreeAsync(devObj, stream));
+  }
+  free(obj);
+  pool->count -= 1;
+  return ncclSuccess;
+}
+
+// Looks up the host shadow paired with `devObj` and stores it in `*hostObj`.
+// A null `devObj` maps to a null host pointer. Returns ncclInternalError when
+// `devObj` is not tracked by the pool.
+ncclResult_t ncclShadowPoolToHost(struct ncclShadowPool* pool, void* devObj, void** hostObj) {
+  if (devObj == nullptr) {
+    *hostObj = nullptr;
+    return ncclSuccess;
+  }
+
+  // Scan the bucket chain for the entry keyed by this device pointer.
+  struct ncclShadowObject* cur = pool->table[hashBucket(pool->hbits, devObj)];
+  for (; cur != nullptr; cur = cur->next) {
+    if (cur->devObj == devObj) {
+      *hostObj = cur->hostObj;
+      return ncclSuccess;
+    }
+  }
+  WARN("Device object does not exist in shadow pool.");
+  return ncclInternalError;
 }
@@ -15,6 +15,7 @@
 #include "signals.h" // [RCCL]
 #include "param.h"
 #include "ras.h"
+#include <mutex>

 #define BOOTSTRAP_N_CHECK_ABORT           10000
 #define BOOTSTRAP_TAG_CONNECT             (0x1 << 31)
@@ -86,13 +87,13 @@ struct bootstrapRootArgs {
 static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1];
 static union ncclSocketAddress bootstrapNetIfAddr;
 static int bootstrapNetInitDone = 0;
-pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;
+static std::mutex bootstrapNetMutex;

 NCCL_PARAM(BootstrapNetEnable,"OOB_NET_ENABLE", 0);

 ncclResult_t bootstrapNetInit() {
  if (bootstrapNetInitDone == 0) {
-    pthread_mutex_lock(&bootstrapNetLock);
+    std::lock_guard<std::mutex> lock(bootstrapNetMutex);
    if (bootstrapNetInitDone == 0) {
      const char* env = ncclGetEnv("NCCL_COMM_ID");
      int nIfs = 0;
@@ -100,21 +101,18 @@ ncclResult_t bootstrapNetInit() {
        union ncclSocketAddress remoteAddr;
        if (ncclSocketGetAddrFromString(&remoteAddr, env) != ncclSuccess) {
          WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
-          pthread_mutex_unlock(&bootstrapNetLock);
          return ncclInvalidArgument;
        }
        NCCLCHECK(ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE,
                                               &nIfs));
        if (nIfs <= 0) {
          WARN("NET/Socket : No usable listening interface found");
-          pthread_mutex_unlock(&bootstrapNetLock);
          return ncclSystemError;
        }
      } else {
        NCCLCHECK(ncclFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1, &nIfs));
        if (nIfs <= 0) {
          WARN("Bootstrap : no socket interface found");
-          pthread_mutex_unlock(&bootstrapNetLock);
          return ncclInvalidUsage;
        }
      }
@@ -124,7 +122,6 @@ ncclResult_t bootstrapNetInit() {
      INFO(NCCL_BOOTSTRAP, "Bootstrap: Using%s", line);
      bootstrapNetInitDone = 1;
    }
-    pthread_mutex_unlock(&bootstrapNetLock);
  }
  return ncclSuccess;
 }
@@ -486,7 +483,7 @@ static ncclResult_t getUDS(uint64_t* peerUDS) {
 static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
  static int devOOB = -1;
  if (devOOB < 0) {
-    pthread_mutex_lock(&bootstrapNetLock);
+    std::lock_guard<std::mutex> lock(bootstrapNetMutex);
    if (devOOB < 0) {
      const char* userIfEnv = ncclGetEnv("NCCL_OOB_NET_IFNAME");
      if (userIfEnv && strlen(userIfEnv) > 0) {
@@ -517,7 +514,6 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
            WARN("no device found matching %s%s, verify NCCL_OOB_NET_IFNAME", searchExact ? "exactly " : "", userIfEnv);
          else
            WARN("no device found after excluding %s%s, verify NCCL_OOB_NET_IFNAME", searchExact ? "exactly " : "", userIfEnv);
-          pthread_mutex_unlock(&bootstrapNetLock);
          return ncclInvalidArgument;
        }
      } else {
@@ -530,13 +526,12 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
      bool hasProp = res == ncclSuccess;
      INFO(NCCL_BOOTSTRAP, "Bootstrap: Using %s:%d", (hasProp) ? props.name : "N/A", (hasProp) ? props.port : -1);
    }
-    pthread_mutex_unlock(&bootstrapNetLock);
  }
  *dev = devOOB;
  return ncclSuccess;
 }

-static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* listen, char peerHandle[NCCL_NET_HANDLE_MAXSIZE],
+static ncclResult_t netRingConnect(void* ctx, ncclNet_t* net, struct bootstrapListen_t* listen, char peerHandle[NCCL_NET_HANDLE_MAXSIZE],
                                   void** sendComm, ncclNetDeviceHandle_t** sendDevHandle,
                                   void** recvComm, ncclNetDeviceHandle_t** recvDevHandle, volatile uint32_t* abortFlag) {

@@ -544,7 +539,7 @@ static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* lis
  do {
    NCCLCHECK(checkAbort(abortFlag, &abortCounter));
    if (!*sendComm)
-      NCCLCHECK(net->connect(listen->net.dev, NULL, peerHandle, sendComm, sendDevHandle));
+      NCCLCHECK(net->connect(ctx, listen->net.dev, peerHandle, sendComm, sendDevHandle));
    if (!*recvComm)
      NCCLCHECK(net->accept(listen->net.comm, recvComm, recvDevHandle));
  } while (!*sendComm || !*recvComm);
@@ -660,7 +655,7 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
  if (ncclParamBootstrapNetEnable()) {
    // Create net interface for other ranks to contact me (all gather)
    NCCLCHECK(netGetDevice(rank, comm, &STATE_LISTEN(state, net.dev)));
-    NCCLCHECK(state->net->listen(STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)));
+    NCCLCHECK(state->net->listen(comm->netContext, STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)));
    memcpy(info.connectInfo.handle, STATE_LISTEN(state, net.handle), NCCL_NET_HANDLE_MAXSIZE);
  } else {
    // create socket for ring neightbor to contact mee
@@ -714,7 +709,7 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {

  // accept and connect the ring network
  if (ncclParamBootstrapNetEnable()) {
-    NCCLCHECK(netRingConnect(state->net, &state->listen, nextPeer.handle,
+    NCCLCHECK(netRingConnect(comm->netContext, state->net, &state->listen, nextPeer.handle,
                             &STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle),
                             &STATE_RING(state, net.recvComm), &STATE_RING(state, net.recvDevHandle), state->abortFlag));
  } else {
@@ -807,7 +802,7 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo
  // create a handle for the others to reach out to me
  if (ncclParamBootstrapNetEnable()) {
    NCCLCHECKGOTO(netGetDevice(rank, comm, &STATE_LISTEN(state, net.dev)), ret, fail);
-    NCCLCHECKGOTO(state->net->listen(STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)), ret, fail);
+    NCCLCHECKGOTO(state->net->listen(comm->netContext, STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)), ret, fail);
    memcpy(info.handle, STATE_LISTEN(state, net.handle), NCCL_NET_HANDLE_MAXSIZE);
  } else {
    // create socket for ring neightbor to contact mee
@@ -826,7 +821,7 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo
  NCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, BOOTSTRAP_TAG_COMMSPLIT, &info, sizeof(union ringConnectInfo)), ret, fail);
  NCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, BOOTSTRAP_TAG_COMMSPLIT, &nextPeer, sizeof(union ringConnectInfo)), ret, fail);
  if (ncclParamBootstrapNetEnable()) {
-    NCCLCHECKGOTO(netRingConnect(state->net, &state->listen, nextPeer.handle,
+    NCCLCHECKGOTO(netRingConnect(comm->netContext, state->net, &state->listen, nextPeer.handle,
                                 &STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle),
                                 &STATE_RING(state, net.recvComm), &STATE_RING(state, net.recvDevHandle), state->abortFlag),
                  ret, fail);
@@ -0,0 +1,615 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "register_inline.h"
+#include <cuda.h>
+#include "rocmwrap.h"
+#include "ce_coll.h"
+#include "alloc.h"
+
+// Static constant for graph synchronization
+static const uint32_t GRAPH_SYNC_VALUE = 1;
+
+// Static constants for intra-batch synchronization to improve CE collective performance with large scale
+// Frequency of intra-batch synchronization
+static const uint32_t CE_COLL_INTRA_BATCH_SYNC_FREQ = 8;
+// Message threshold for intra-batch synchronization
+static const uint64_t CE_COLL_INTRA_BATCH_SYNC_MSG_THRESHOLD = 512*1024*1024;
+
+// Sets up the CE collective synchronization state of `comm`. One buffer is
+// allocated that holds two flag arrays ("ready" followed by "complete"), each
+// sized for nRanks uint32_t values rounded up to a multiple of 16 bytes. The
+// buffer is registered as a symmetric window and the resulting window and flag
+// pointers are stored in comm->ceColl. Returns the first error encountered.
+// NOTE(review): the fail path performs no cleanup, so a failure after
+// ncclMemAlloc succeeds appears to leave ceDevBase allocated — confirm whether
+// a caller releases it.
+ncclResult_t ncclCeInit(struct ncclComm* comm) {
+  ncclResult_t ret = ncclSuccess;
+
+  uint8_t* ceDevBase;
+  size_t ceDevBaseSize = alignUp(comm->nRanks*sizeof(uint32_t), 16) * 2;
+  ncclWindow_vidmem* ceWinDev;
+  ncclWindow_vidmem* ceWinDevHost;
+
+  // Ensure symmetric memory runtime is initialized
+  NCCLCHECKGOTO(ncclDevrInitOnce(comm), ret, fail);
+  // Allocate and register memory for the symmetric memory
+  NCCLCHECKGOTO(ncclMemAlloc((void**)&ceDevBase, ceDevBaseSize), ret, fail);
+  NCCLCHECKGOTO(ncclDevrWindowRegisterInGroup(comm, ceDevBase, ceDevBaseSize, NCCL_WIN_COLL_SYMMETRIC, &ceWinDev), ret, fail);
+  NCCLCHECKGOTO(ncclShadowPoolToHost(&comm->devrState.shadows, ceWinDev, &ceWinDevHost), ret, fail);
+  // Get the ncclDevrWindow from the winHost field
+  comm->ceColl.ceSyncWin = (struct ncclDevrWindow*)ceWinDevHost->winHost;
+
+  // The "ready" array starts at offset 0; the "complete" array follows it.
+  comm->ceColl.baseUCSymReadyOffset = 0;
+  comm->ceColl.baseUCSymComplOffset = alignUp(comm->nRanks*sizeof(uint32_t), 16);
+  comm->ceColl.baseUCSymReadyPtr = (uint8_t*)comm->ceColl.ceSyncWin->userPtr + comm->ceColl.baseUCSymReadyOffset;
+  comm->ceColl.baseUCSymComplPtr = (uint8_t*)comm->ceColl.ceSyncWin->userPtr + comm->ceColl.baseUCSymComplOffset;
+  comm->ceColl.ceSeqNum = 0;
+  comm->ceColl.useCompletePtr = false;
+  comm->ceColl.intraBatchSyncFreq = CE_COLL_INTRA_BATCH_SYNC_FREQ;
+  comm->ceColl.intraBatchSyncMsgThreshold = CE_COLL_INTRA_BATCH_SYNC_MSG_THRESHOLD;
+  INFO(NCCL_INIT, "Init CE, rank %d baseUCSymReadyPtr %p, baseUCSymComplPtr %p, seq num %d", comm->rank, comm->ceColl.baseUCSymReadyPtr, comm->ceColl.baseUCSymComplPtr, comm->ceColl.ceSeqNum);
+
+exit:
+  return ret;
+fail:
+  goto exit;
+}
+
+// Tears down the CE state of `comm`: frees every task still queued in
+// ceInitTaskQueue, then, if the sync buffer was set up, deregisters its window
+// and frees the buffer before clearing the cached pointers.
+// If deregistration or the free fails, the error is returned immediately and
+// the cached pointers are left as they were.
+ncclResult_t ncclCeFinalize(struct ncclComm* comm) {
+  ncclResult_t ret = ncclSuccess;
+  
+  // Clean up ceInitTaskQueue
+  while (!ncclIntruQueueEmpty(&comm->ceInitTaskQueue)) {
+    struct ncclCeInitTask* task = ncclIntruQueueDequeue(&comm->ceInitTaskQueue);
+    free(task);
+  }
+  
+  // Clean up CE resources
+  if (comm->ceColl.baseUCSymReadyPtr != NULL) {
+    if (comm->ceColl.ceSyncWin && comm->ceColl.ceSyncWin->vidmem) {
+      NCCLCHECKGOTO(ncclCommWindowDeregister(comm, comm->ceColl.ceSyncWin->vidmem), ret, fail);
+      // baseUCSymReadyPtr sits at offset 0 of the sync buffer (see ncclCeInit),
+      // so it is the address to hand back to ncclMemFree.
+      NCCLCHECKGOTO(ncclMemFree(comm->ceColl.baseUCSymReadyPtr), ret, fail);
+    }
+    comm->ceColl.baseUCSymReadyPtr = NULL;
+    comm->ceColl.baseUCSymComplPtr = NULL;
+    comm->ceColl.ceSyncWin = NULL;
+  }
+
+exit:
+  return ret;
+fail:
+  goto exit;
+}
+
+// Reports whether a CE implementation exists for collective `coll`. Only
+// AllGather, AlltoAll, Scatter and Gather qualify, and only when the driver
+// version can be queried and is at least 12050. `red` and `ty` are not
+// consulted.
+bool ncclCeImplemented(ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty) {
+  int driverVersion;
+  if (ncclCudaDriverVersion(&driverVersion) != ncclSuccess) return false;
+
+  // CE is supported in CUDA 12.5 and later
+  if (driverVersion < 12050) return false;
+
+  return coll == ncclFuncAllGather
+      || coll == ncclFuncAlltoAll
+      || coll == ncclFuncScatter
+      || coll == ncclFuncGather;
+}
+
+// Prepares one synchronization round that uses the multicast mapping.
+// Advances comm->ceColl.ceSeqNum, enqueues on `stream` a copy of this rank's
+// flag value to the multicast address of its own slot, and appends to
+// `batchParams` (starting at `*opIdx`, which is advanced) one wait entry for
+// every other rank's slot. `isComplete` selects the "complete" flag array
+// instead of the "ready" one. During graph capture the constant
+// GRAPH_SYNC_VALUE is used in place of the sequence number.
+// NOTE(review): outside graph capture the copy source is the local variable
+// `currentSeq`; confirm the runtime has consumed it before this function
+// returns.
+// NOTE(review): the `.operation` assignments are commented out, so every entry
+// keeps the zero value left by `= {}` — confirm hipStreamBatchMemOp accepts that.
+ncclResult_t ncclPrepMCSync(struct ncclComm* comm, bool isComplete, hipStreamBatchMemOpParams* batchParams, size_t* opIdx, cudaStream_t stream) {
+  ncclResult_t ret = ncclSuccess;
+
+  uint32_t* readyPtrs    = (uint32_t*)comm->ceColl.baseUCSymReadyPtr;
+  uint32_t* completePtrs = (uint32_t*)comm->ceColl.baseUCSymComplPtr;
+
+  bool capturing = ncclCudaGraphValid(comm->planner.capturingGraph);
+  uint32_t currentSeq = ++comm->ceColl.ceSeqNum;
+
+  // Source pointer is either the constant graph sync value or the sequence number
+  void* srcPtr = capturing ? (void*)&GRAPH_SYNC_VALUE : (void*)&currentSeq;
+  // Wait value is either the constant graph sync value or the sequence number
+  uint32_t waitValue = capturing ? GRAPH_SYNC_VALUE : currentSeq;
+
+  // Use multi-cast address as destination pointer
+  void* mcDstPtr;
+  void* dstPtr = isComplete ? (void*)&completePtrs[comm->rank] : (void*)&readyPtrs[comm->rank];
+  size_t offset = (uint8_t*)dstPtr - (uint8_t*)comm->ceColl.ceSyncWin->userPtr;
+  NCCLCHECKGOTO(ncclDevrGetLsaTeamPtrMC(comm, comm->ceColl.ceSyncWin, offset, ncclTeamLsa(comm), &mcDstPtr), ret, fail);
+  
+  // Write our own ready/complete flag to the multi-cast address
+  CUDACHECKGOTO(cudaMemcpyAsync(
+    mcDstPtr,
+    srcPtr,
+    sizeof(uint32_t),
+    cudaMemcpyHostToDevice,
+    stream), ret, fail);
+
+  // Add local wait operations for every other rank
+  for (int r = 0; r < comm->nRanks; ++r) {
+    if (r == comm->rank) continue;
+    batchParams[*opIdx] = {};
+    // batchParams[*opIdx].waitValue.operation = CU_STREAM_MEM_OP_WAIT_VALUE_32;
+    batchParams[*opIdx].waitValue.address = (CUdeviceptr)(isComplete ? (void*)&completePtrs[r] : (void*)&readyPtrs[r]);
+    batchParams[*opIdx].waitValue.value = waitValue;
+    batchParams[*opIdx].waitValue.flags = CU_STREAM_WAIT_VALUE_EQ;
+    (*opIdx)++;
+  }
+
+exit:
+  return ret;
+fail:
+  goto exit;
+}
+
+// Prepares one synchronization round that uses per-peer mappings.
+// Advances comm->ceColl.ceSeqNum and appends to `batchParams` (starting at
+// `*opIdx`, which is advanced) first one write entry per other rank, targeting
+// this rank's slot as mapped for that peer, and then one wait entry per other
+// rank on that rank's slot in the local arrays. `isComplete` selects the
+// "complete" flag array instead of the "ready" one. The value written and
+// waited for is GRAPH_SYNC_VALUE during graph capture, otherwise the new
+// sequence number.
+// NOTE(review): the `.operation` assignments are commented out, so every entry
+// keeps the zero value left by `= {}` — confirm hipStreamBatchMemOp accepts that.
+ncclResult_t ncclPrepUCSync(struct ncclComm* comm, bool isComplete,
+                               hipStreamBatchMemOpParams* batchParams,
+                               size_t* opIdx) {
+  ncclResult_t ret = ncclSuccess;
+
+  uint32_t* readyPtrs    = (uint32_t*)comm->ceColl.baseUCSymReadyPtr;
+  uint32_t* completePtrs = (uint32_t*)comm->ceColl.baseUCSymComplPtr;
+
+  bool capturing = ncclCudaGraphValid(comm->planner.capturingGraph);
+  uint32_t currentSeq = ++comm->ceColl.ceSeqNum;
+
+  // Write our own ready/complete flag to remote ranks
+  uint32_t waitValue = capturing ? GRAPH_SYNC_VALUE : currentSeq;
+  for (int r = 0; r < comm->nRanks; ++r) {
+    if (r == comm->rank) continue;
+    void * peerDstPtr;
+    void* dstPtr = isComplete ? (void*)&completePtrs[comm->rank] : (void*)&readyPtrs[comm->rank];
+    // Byte offset of our slot inside the sync window, resolved below for peer r.
+    size_t offset = (uint8_t*)dstPtr - (uint8_t*)comm->ceColl.ceSyncWin->userPtr;
+    NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, comm->ceColl.ceSyncWin, offset, r, &peerDstPtr), ret, fail);
+    batchParams[*opIdx] = {};
+    // batchParams[*opIdx].writeValue.operation = CU_STREAM_MEM_OP_WRITE_VALUE_32;
+    batchParams[*opIdx].writeValue.address  = (CUdeviceptr)peerDstPtr;
+    batchParams[*opIdx].writeValue.value = waitValue;
+    // batchParams[*opIdx].writeValue.flags = CU_STREAM_WRITE_VALUE_DEFAULT;
+    (*opIdx)++;
+  }
+
+  // Add local wait operations for every other rank
+  for (int r = 0; r < comm->nRanks; ++r) {
+    if (r == comm->rank) continue;
+    batchParams[*opIdx] = {};
+    // batchParams[*opIdx].waitValue.operation = CU_STREAM_MEM_OP_WAIT_VALUE_32;
+    batchParams[*opIdx].waitValue.address  = (CUdeviceptr)(isComplete ? (void*)&completePtrs[r] : (void*)&readyPtrs[r]);
+    batchParams[*opIdx].waitValue.value = waitValue;
+    batchParams[*opIdx].waitValue.flags = CU_STREAM_WAIT_VALUE_EQ;
+    (*opIdx)++;
+  }
+
+exit:
+  return ret;
+fail:
+  goto exit;
+}
+
+
+// Enqueues one cross-rank synchronization round on `stream`.
+// Builds a batch of memory operations with ncclPrepMCSync (when
+// comm->nvlsSupport is set) or ncclPrepUCSync, appends writes that reset all
+// flags of the active array to 0 when a graph is being captured, and submits
+// the batch with hipStreamBatchMemOp. On success the active flag array
+// (ready/complete) is toggled for the next call. The temporary batch array is
+// freed on every path.
+// NOTE(review): batchSize must also cover the nRanks reset entries added
+// during graph capture; this depends on the NCCL_CE_SYNC_OPS_PER_RANK_*
+// constants, which are defined outside this file — confirm.
+ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream) {
+  ncclResult_t ret = ncclSuccess;
+
+  // Get pointers to the ready and complete synchronization arrays
+  uint32_t* readyPtrs = (uint32_t*)comm->ceColl.baseUCSymReadyPtr;
+  uint32_t* completePtrs = (uint32_t*)comm->ceColl.baseUCSymComplPtr;
+  
+  // Allocate enough slots for all possible ops
+  size_t batchSize = (comm->nvlsSupport ? NCCL_CE_SYNC_OPS_PER_RANK_MC : NCCL_CE_SYNC_OPS_PER_RANK_UC) * comm->nRanks;
+  size_t opIdx = 0;
+
+  // Prepare batch memory operations for synchronization
+  hipStreamBatchMemOpParams* batchParams = nullptr;
+  NCCLCHECKGOTO(ncclCalloc(&batchParams, batchSize), ret, fail);
+
+  if (comm->nvlsSupport) {
+    NCCLCHECKGOTO(ncclPrepMCSync(comm, comm->ceColl.useCompletePtr, batchParams, &opIdx, stream), ret, fail);
+  } else {
+    NCCLCHECKGOTO(ncclPrepUCSync(comm, comm->ceColl.useCompletePtr, batchParams, &opIdx), ret, fail);
+  }
+
+  // For CUDA graph capture, add reset operation
+  if (ncclCudaGraphValid(comm->planner.capturingGraph)) {
+    for (int i = 0; i < comm->nRanks; i++) {
+      batchParams[opIdx] = {};
+      // batchParams[opIdx].writeValue.operation = CU_STREAM_MEM_OP_WRITE_VALUE_32;
+      batchParams[opIdx].writeValue.address = (CUdeviceptr)(comm->ceColl.useCompletePtr ? (void*)&completePtrs[i] : (void*)&readyPtrs[i]);
+      batchParams[opIdx].writeValue.value = 0;
+      // batchParams[opIdx].writeValue.flags = CU_STREAM_WRITE_VALUE_DEFAULT;
+      opIdx++;
+    }
+  }
+  
+  // Execute all memory operations in a single batch
+  CUCHECKGOTO(hipStreamBatchMemOp(stream, opIdx, batchParams, 0), ret, fail);
+
+  // Toggle the flag for next call
+  comm->ceColl.useCompletePtr = !comm->ceColl.useCompletePtr;
+
+exit:
+  if (batchParams) free(batchParams);
+  return ret;
+fail:
+  goto exit;
+}
+
+// Resets `params` and allocates its per-operation arrays (srcs, dsts, sizes,
+// plus attrs and attrIdxs when CUDART_VERSION >= 12080) with `nRanks` entries
+// each. All pointers are nulled before the first allocation, so after a
+// failure every field is either null or a valid allocation. The fail path does
+// not free arrays that were already allocated.
+ncclResult_t ncclCeInitBatchOpsParams(struct ncclCeBatchOpsParams* params, int nRanks) {
+  ncclResult_t ret = ncclSuccess;
+  
+  params->srcs = nullptr;
+  params->dsts = nullptr;
+  params->sizes = nullptr;
+  params->numOps = 0;
+  params->intraBatchSync = false;
+#if CUDART_VERSION >= 12080
+  params->attrs = nullptr;
+  params->attrIdxs = nullptr;
+  params->numAttrs = 0;
+#endif
+  
+  NCCLCHECKGOTO(ncclCalloc(&params->srcs, nRanks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&params->dsts, nRanks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&params->sizes, nRanks), ret, fail);
+#if CUDART_VERSION >= 12080
+  NCCLCHECKGOTO(ncclCalloc(&params->attrs, nRanks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&params->attrIdxs, nRanks), ret, fail);
+#endif
+exit:
+  return ret;
+fail:
+  goto exit;
+}
+
+// Release the arrays owned by `params`.
+//
+// free() accepts a null pointer, so arrays that were never allocated need no
+// special handling. The pointers themselves are left unchanged.
+void ncclCeFreeBatchOpsParams(struct ncclCeBatchOpsParams* params) {
+  free(params->srcs);
+  free(params->dsts);
+  free(params->sizes);
+#if CUDART_VERSION >= 12080
+  free(params->attrs);
+  free(params->attrIdxs);
+#endif
+}
+
+// Enqueue every copy described by `params` onto `stream`.
+//
+// Two strategies are used:
+//   * one cudaMemcpyAsync per operation - during graph capture (where
+//     cudaMemcpyBatchAsync is not supported) and whenever the batched API is
+//     unavailable, either because this file was built without
+//     CUDART_VERSION >= 12080 or because the driver reports a version < 12080;
+//   * cudaMemcpyBatchAsync - otherwise.
+// When params->intraBatchSync is set, ncclMemOpSync() is issued after every
+// comm->ceColl.intraBatchSyncFreq copies, but never after the final group.
+//
+// Returns ncclSuccess (also when params->numOps is 0) or the first error hit.
+ncclResult_t ncclCeLaunchBatchOps(struct ncclComm* comm, struct ncclCeBatchOpsParams* params, cudaStream_t stream) {
+  ncclResult_t ret = ncclSuccess;
+
+  // Check if there are any operations to perform
+  if (params->numOps == 0) {
+    return ncclSuccess;
+  }
+
+  // Locals with initializers are declared before the first NCCLCHECKGOTO so that
+  // no `goto fail` jumps across an initialization.
+
+  // Check if we are in a CUDA graph capture
+  const bool capturing = ncclCudaGraphValid(comm->planner.capturingGraph);
+  const int syncFreq = comm->ceColl.intraBatchSyncFreq;
+  // A non-positive frequency would mean a modulo by zero in the per-copy loop
+  // and a loop that never advances in the batched path; skip the intermediate
+  // syncs in that case instead.
+  const bool syncWithinBatch = params->intraBatchSync && syncFreq > 0;
+  // Stays false unless the batched API was compiled in AND the driver is new enough.
+  bool batchCopyAvailable = false;
+
+  int driverVersion;
+  NCCLCHECKGOTO(ncclCudaDriverVersion(&driverVersion), ret, fail);
+
+#if defined(CUDART_VERSION) && CUDART_VERSION >= 12080
+  batchCopyAvailable = (driverVersion >= 12080);
+#endif
+
+  if (capturing || !batchCopyAvailable) {
+    // Graph capture, or no batched copy support: fall back to individual
+    // transfers. Without the compile-time check above, a build lacking the
+    // batched API would reach an empty branch and silently copy nothing.
+    for (int i = 0; i < params->numOps; i++) {
+      CUDACHECKGOTO(cudaMemcpyAsync(
+        (void*)params->dsts[i],
+        (void*)params->srcs[i],
+        params->sizes[i],
+        cudaMemcpyDeviceToDevice,
+        stream), ret, fail);
+
+      // Sync after every syncFreq copies, except after the last copy
+      if (syncWithinBatch && ((i+1) % syncFreq == 0) && ((i+1) < params->numOps)) {
+        NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
+      }
+    }
+  } else {
+#if defined(CUDART_VERSION) && CUDART_VERSION >= 12080
+    // For CUDA 12.8+, use batch memory copy for better performance.
+    // A single attribute entry (index 0) is shared by the whole batch.
+    params->attrs[0] = {};
+    params->attrs[0].srcAccessOrder = cudaMemcpySrcAccessOrderStream;
+    params->attrs[0].flags = cudaMemcpyFlagPreferOverlapWithCompute;
+    params->attrIdxs[0] = 0;
+    params->numAttrs = 1;
+
+    if (syncWithinBatch) {
+      // Break into multiple batches with sync between them
+      for (int i = 0; i < params->numOps; i += syncFreq) {
+        // The last batch may be shorter than syncFreq
+        int currentBatchSize = (i + syncFreq <= params->numOps) ? syncFreq : params->numOps - i;
+
+        // The 13.0 runtime dropped the failIdx argument of cudaMemcpyBatchAsync
+        #if CUDART_VERSION >= 13000
+        CUDACHECKGOTO(cudaMemcpyBatchAsync(
+          &params->dsts[i], &params->srcs[i], &params->sizes[i], currentBatchSize,
+          params->attrs, params->attrIdxs, params->numAttrs, stream), ret, fail);
+        #else
+        CUDACHECKGOTO(cudaMemcpyBatchAsync(
+          &params->dsts[i], &params->srcs[i], &params->sizes[i], currentBatchSize,
+          params->attrs, params->attrIdxs, params->numAttrs, nullptr, stream), ret, fail);
+        #endif
+
+        // Sync after each batch except the last one
+        if (i + syncFreq < params->numOps) {
+          NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
+        }
+      }
+    } else {
+      // Use single batch for all operations
+      #if CUDART_VERSION >= 13000
+      CUDACHECKGOTO(cudaMemcpyBatchAsync(
+        params->dsts, params->srcs, params->sizes, params->numOps,
+        params->attrs, params->attrIdxs, params->numAttrs, stream), ret, fail);
+      #else
+      CUDACHECKGOTO(cudaMemcpyBatchAsync(
+        params->dsts, params->srcs, params->sizes, params->numOps,
+        params->attrs, params->attrIdxs, params->numAttrs, nullptr, stream), ret, fail);
+      #endif
+    }
+#endif
+  }
+
+exit:
+  return ret;
+fail:
+  goto exit;
+}
+
+
+// AllGather built from batched device-to-device copies.
+//
+// This rank's send buffer (args->nElts * args->eltSize bytes) is copied into
+// slot comm->rank of its own receive buffer (skipped when the call is in-place)
+// and into the same slot of every other rank's receive buffer, whose address is
+// resolved through ncclDevrGetLsaRankPtr() on args->recvWin. ncclMemOpSync() is
+// issued before queuing the copies and again after launching them.
+//
+// Returns ncclSuccess or the first error encountered; the scratch arrays are
+// released on every path.
+ncclResult_t ncclCeAllGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) {
+  ncclResult_t ret = ncclSuccess;
+
+  // Bytes contributed by each rank
+  const size_t bytesPerRank = args->nElts * args->eltSize;
+  uint8_t* const localSrc = (uint8_t*)args->sendBuff;
+  // Slot belonging to this rank inside the gathered output
+  uint8_t* const localSlot = (uint8_t*)args->recvBuff + comm->rank * bytesPerRank;
+  void* remoteSlot;
+  size_t slotOffset;
+
+  struct ncclCeBatchOpsParams ops = {};
+  NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&ops, comm->nRanks), ret, fail);
+
+  // Synchronize before any transfer is queued
+  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
+
+  // Out-of-place call: this rank's own contribution must be copied as well
+  if (localSlot != localSrc) {
+    ops.srcs[ops.numOps] = (void*)localSrc;
+    ops.dsts[ops.numOps] = (void*)localSlot;
+    ops.sizes[ops.numOps] = bytesPerRank;
+    ops.numOps++;
+  }
+
+  // Peers are visited starting from rank+1 and wrapping around
+  for (int step = 1; step < comm->nRanks; step++) {
+    int peer = (comm->rank + step) % comm->nRanks;
+    // Same offset within the window on every peer
+    slotOffset = localSlot - (uint8_t*)args->recvWin->userPtr;
+    NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, slotOffset, peer, &remoteSlot), ret, fail);
+    ops.srcs[ops.numOps] = (void*)localSrc;
+    ops.dsts[ops.numOps] = (void*)remoteSlot;
+    ops.sizes[ops.numOps] = bytesPerRank;
+    ops.numOps++;
+  }
+
+  // Intermediate syncs are requested only when there are more copies than the
+  // sync frequency and the total payload reaches the configured threshold
+  ops.intraBatchSync = (ops.numOps > comm->ceColl.intraBatchSyncFreq && bytesPerRank*ops.numOps >= comm->ceColl.intraBatchSyncMsgThreshold);
+
+  NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &ops, stream), ret, fail);
+
+  // Synchronize again once the copies have been launched
+  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
+
+exit:
+  ncclCeFreeBatchOpsParams(&ops);
+  return ret;
+fail:
+  goto exit;
+}
+
+// AlltoAll built from batched device-to-device copies.
+//
+// For every destination rank d (visited starting from this rank and wrapping
+// around) chunk d of the send buffer is copied into slot comm->rank of d's
+// receive buffer. The local slot is addressed directly; remote slots are
+// resolved through ncclDevrGetLsaRankPtr() on args->recvWin. Each chunk is
+// args->nElts * args->eltSize bytes. ncclMemOpSync() is issued before queuing
+// the copies and again after launching them.
+//
+// Returns ncclSuccess or the first error encountered; the scratch arrays are
+// released on every path.
+ncclResult_t ncclCeAlltoAll(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) {
+  ncclResult_t ret = ncclSuccess;
+
+  // Calculate the size of data each rank sends to every other rank
+  const size_t chunkBytes = args->nElts * args->eltSize;
+  uint8_t* mySendBuff = (uint8_t*)args->sendBuff;
+  uint8_t* myRecvBuff = (uint8_t*)args->recvBuff;
+  // This rank always lands in slot comm->rank of the destination's receive
+  // buffer, so the slot address (and its window offset) is loop-invariant.
+  uint8_t* dstPtr = myRecvBuff + comm->rank * chunkBytes;
+  void* peerRecvBuff;
+  size_t offset;
+
+  struct ncclCeBatchOpsParams batchOpsParams = {};
+  // Exactly one copy is queued per rank, so nRanks slots are sufficient; sizing
+  // the arrays with nRanks * nRanks wasted host memory and multiplied two ints.
+  NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&batchOpsParams, comm->nRanks), ret, fail);
+
+  // Ensure all ranks are ready before starting transfers
+  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
+
+  // Copy data to other ranks: send data chunk for each destination rank
+  for (int r = 0; r < comm->nRanks; r++) {
+    int dstRank = (comm->rank + r) % comm->nRanks;
+    uint8_t* srcPtr = mySendBuff + dstRank * chunkBytes;
+    void* dst;
+
+    if (dstRank == comm->rank) {
+      // Local copy for own data
+      dst = (void*)dstPtr;
+    } else {
+      // Remote copy: rank dstRank's receive buffer at position comm->rank
+      offset = dstPtr - (uint8_t*)args->recvWin->userPtr;
+      NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, offset, dstRank, &peerRecvBuff), ret, fail);
+      dst = peerRecvBuff;
+    }
+
+    batchOpsParams.srcs[batchOpsParams.numOps] = (void*)srcPtr;
+    batchOpsParams.dsts[batchOpsParams.numOps] = dst;
+    batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes;
+    batchOpsParams.numOps++;
+  }
+
+  // Check if we need to perform intra-batch synchronization
+  batchOpsParams.intraBatchSync = (batchOpsParams.numOps > comm->ceColl.intraBatchSyncFreq && chunkBytes*batchOpsParams.numOps >= comm->ceColl.intraBatchSyncMsgThreshold);
+
+  // Launch the batch operations
+  NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &batchOpsParams, stream), ret, fail);
+
+  // Ensure all transfers are complete across all ranks
+  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
+
+exit:
+  ncclCeFreeBatchOpsParams(&batchOpsParams);
+  return ret;
+fail:
+  goto exit;
+}
+
+// Scatter built from batched device-to-device copies.
+//
+// Only the root (args->rootRank) queues copies: chunk d of its send buffer goes
+// to rank d, with remote destinations resolved through ncclDevrGetLsaRankPtr()
+// on args->recvWin. Non-root ranks queue nothing and only take part in the two
+// ncclMemOpSync() calls. Each chunk is args->nElts * args->eltSize bytes.
+//
+// Returns ncclSuccess or the first error encountered; the scratch arrays are
+// released on every path.
+ncclResult_t ncclCeScatter(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) {
+  ncclResult_t ret = ncclSuccess;
+
+  // Bytes delivered to each rank
+  const size_t chunkBytes = args->nElts * args->eltSize;
+  uint8_t* mySendBuff = (uint8_t*)args->sendBuff;
+  uint8_t* myRecvBuff = (uint8_t*)args->recvBuff;
+  int rootRank = args->rootRank;
+  void* peerDstPtr;
+  size_t offset;
+
+  struct ncclCeBatchOpsParams ops = {};
+  NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&ops, comm->nRanks), ret, fail);
+
+  // Synchronize before any transfer is queued
+  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
+
+  if (comm->rank == rootRank) {
+    // Root's own chunk inside the send buffer
+    uint8_t* ownChunk = mySendBuff + comm->rank * chunkBytes;
+    // In-place means the receive buffer already aliases that chunk
+    bool isInPlace = (myRecvBuff == ownChunk);
+
+    // Out-of-place: the root also needs a local copy of its own chunk
+    if (!isInPlace) {
+      ops.srcs[ops.numOps] = (void*)ownChunk;
+      ops.dsts[ops.numOps] = (void*)myRecvBuff;
+      ops.sizes[ops.numOps] = chunkBytes;
+      ops.numOps++;
+    }
+
+    // Remaining ranks, visited starting from root+1 and wrapping around
+    for (int step = 1; step < comm->nRanks; step++) {
+      int dstRank = (comm->rank + step) % comm->nRanks;
+      uint8_t* srcPtr = mySendBuff + dstRank * chunkBytes;
+      uint8_t* dstPtr;
+      if (isInPlace) {
+        dstPtr = myRecvBuff + dstRank * chunkBytes;
+      } else {
+        dstPtr = myRecvBuff;
+      }
+
+      // Translate the local address into the peer's view of the window
+      offset = dstPtr - (uint8_t*)args->recvWin->userPtr;
+      NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, offset, dstRank, &peerDstPtr), ret, fail);
+      ops.srcs[ops.numOps] = (void*)srcPtr;
+      ops.dsts[ops.numOps] = (void*)peerDstPtr;
+      ops.sizes[ops.numOps] = chunkBytes;
+      ops.numOps++;
+    }
+  }
+
+  // With nothing queued (non-root ranks) the launch returns immediately
+  NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &ops, stream), ret, fail);
+
+  // Synchronize again once the copies have been launched
+  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
+
+exit:
+  ncclCeFreeBatchOpsParams(&ops);
+  return ret;
+fail:
+  goto exit;
+}
+
+// Gather built from batched device-to-device copies.
+//
+// Each rank queues at most one copy: its send buffer into slot comm->rank of
+// the root's receive buffer. The root writes its slot directly (and skips the
+// copy when the send buffer already is that slot); other ranks resolve the
+// root's slot through ncclDevrGetLsaRankPtr() on args->recvWin. The chunk size
+// is args->nElts * args->eltSize bytes.
+//
+// Returns ncclSuccess or the first error encountered; the scratch arrays are
+// released on every path.
+ncclResult_t ncclCeGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) {
+  ncclResult_t ret = ncclSuccess;
+
+  // Bytes contributed by each rank
+  const size_t chunkBytes = args->nElts * args->eltSize;
+  uint8_t* mySendBuff = (uint8_t*)args->sendBuff;
+  uint8_t* myRecvBuff = (uint8_t*)args->recvBuff;
+  // Slot of this rank, expressed relative to the local receive buffer address
+  uint8_t* mySlot = myRecvBuff + comm->rank * chunkBytes;
+  int rootRank = args->rootRank;
+  void* peerRecvBuff;
+  size_t offset;
+
+  struct ncclCeBatchOpsParams ops = {};
+  // A single slot is enough: no rank queues more than one copy
+  NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&ops, 1), ret, fail);
+
+  // Synchronize before any transfer is queued
+  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
+
+  if (comm->rank != rootRank) {
+    // Non-root: write into the root's receive buffer at this rank's slot
+    offset = mySlot - (uint8_t*)args->recvWin->userPtr;
+    NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, offset, rootRank, &peerRecvBuff), ret, fail);
+    ops.srcs[ops.numOps] = (void*)mySendBuff;
+    ops.dsts[ops.numOps] = (void*)peerRecvBuff;
+    ops.sizes[ops.numOps] = chunkBytes;
+    ops.numOps++;
+  } else if (mySendBuff != mySlot) {
+    // Root, out-of-place: move its own contribution into its slot
+    ops.srcs[ops.numOps] = (void*)mySendBuff;
+    ops.dsts[ops.numOps] = (void*)mySlot;
+    ops.sizes[ops.numOps] = chunkBytes;
+    ops.numOps++;
+  }
+
+  NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &ops, stream), ret, fail);
+
+  // Synchronize again once the copies have been launched
+  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
+
+exit:
+  ncclCeFreeBatchOpsParams(&ops);
+  return ret;
+fail:
+  goto exit;
+}
+
+// Dispatch the copy-engine collective described by plan->ceCollArgs.
+//
+// The collective runs on comm->planner.streams->stream. AllGather, AlltoAll,
+// Scatter and Gather are handled; any other function yields ncclInvalidUsage.
+ncclResult_t ncclLaunchCeColl(struct ncclComm* comm, struct ncclKernelPlan* plan) {
+  ncclResult_t ret = ncclSuccess;
+  struct ncclCeCollArgs* ceArgs = plan->ceCollArgs;
+  cudaStream_t launchStream = comm->planner.streams->stream;
+
+  switch (ceArgs->func) {
+    case ncclFuncAllGather:
+      NCCLCHECKGOTO(ncclCeAllGather(comm, ceArgs, launchStream), ret, done);
+      break;
+    case ncclFuncAlltoAll:
+      NCCLCHECKGOTO(ncclCeAlltoAll(comm, ceArgs, launchStream), ret, done);
+      break;
+    case ncclFuncScatter:
+      NCCLCHECKGOTO(ncclCeScatter(comm, ceArgs, launchStream), ret, done);
+      break;
+    case ncclFuncGather:
+      NCCLCHECKGOTO(ncclCeGather(comm, ceArgs, launchStream), ret, done);
+      break;
+    default:
+      // No copy-engine implementation for this function
+      ret = ncclInvalidUsage;
+      break;
+  }
+
+done:
+  return ret;
+}
@@ -13,16 +13,23 @@
 #include "nvtx_payload_schemas.h"
 #include "msccl/msccl_lifecycle.h"

+#ifdef ENABLE_ROCSHMEM
+#include <rocshmem/rocshmem.hpp>
+#endif
+
 using namespace rccl;

 const char* ncclFuncToString(ncclFunc_t fn) {
  switch (fn) {
  case ncclFuncAllGather: return "AllGather";
  case ncclFuncAllReduce: return "AllReduce";
+  case ncclFuncAlltoAll: return "AlltoAll";
  case ncclFuncBroadcast: return "Broadcast";
+  case ncclFuncGather: return "Gather";
  case ncclFuncRecv: return "Recv";
  case ncclFuncReduce: return "Reduce";
  case ncclFuncReduceScatter: return "ReduceScatter";
+  case ncclFuncScatter: return "Scatter";
  case ncclFuncSendRecv: return "SendRecv";
  case ncclFuncSend: return "Send";
  default: return "Invalid";
@@ -81,7 +88,6 @@ const char* ncclProtoToString(int proto) {

 NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
-
 ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
  NVTX3_FUNC_WITH_PARAMS(AllGather, NcclNvtxParamsAllGather,
@@ -91,9 +97,12 @@ ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sen
    sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
    ALLGATHER_CHUNKSTEPS, comm -> rcclUseOneSlice ? ALLGATHER_SLICESTEPS_SINGLE_NODE : ALLGATHER_SLICESTEPS, nullptr };

-  int nRanks;
+  int nRanks, rank;
  int in_place = 0;
+  const void* srcBuf;
+  void* dstBuf;
  NCCLCHECK(ncclCommCount(comm, &nRanks));
+  NCCLCHECK(ncclCommUserRank(comm, &rank));
  size_t msgSize = sendcount * ncclTypeSize(datatype) * nRanks;

  if (!mscclIsCaller())
@@ -108,21 +117,28 @@ ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sen
  }

  if (rcclUseAllGatherDirect(comm, msgSize)) {
+     INFO(NCCL_INIT, "RCCL DIRECT ALLGATHER count = %zu, msgSize = %zu, comm = %p, stream = %p, rank = %d, sendbuff = %p, recvbuff = %p", 
+		     sendcount, msgSize, comm, stream, rank, sendbuff, recvbuff);	  
     // use direct allgather
     if (sendcount == 0) return ncclSuccess;
     size_t rankOffset = sendcount * ncclTypeSize(datatype);
-     if (((char*)sendbuff) == (((char*)recvbuff) + comm->rank * rankOffset)) {
+     if (sendbuff == (((char*)recvbuff) + rank * rankOffset)) {
+        srcBuf = ((char*)recvbuff) + rank * rankOffset;
+        dstBuf = recvbuff;
        in_place = 1;
-     } 
+     } else {
+        srcBuf = sendbuff;
+        dstBuf = recvbuff;
+     }

     NCCLCHECK(ncclGroupStart());
+
     for (int r = 0; r < nRanks; r++) {
-         int peer = (comm->rank + r) % nRanks;
-         if (in_place && (peer == comm->rank)) {
-            continue;
-         }
-         NCCLCHECK(ncclSend(sendbuff, sendcount, datatype, peer, comm, stream));
-         NCCLCHECK(ncclRecv(((char*)recvbuff) + peer * rankOffset, sendcount, datatype, peer, comm, stream));
+         if (r == rank && in_place)
+             continue;
+         
+         NCCLCHECK(ncclSend(((char*)srcBuf), sendcount, datatype, r, comm, stream));
+         NCCLCHECK(ncclRecv(((char*)dstBuf) + r * rankOffset, sendcount, datatype, r, comm, stream));
     }
     NCCLCHECK(ncclGroupEnd());
     return ncclSuccess;
@@ -132,10 +148,101 @@ ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sen
  }
 }

+RCCL_PARAM(AlltoAllPivotEnable, "ALL_TO_ALL_PIVOT_ENABLE", 0);
+
+NCCL_API(ncclResult_t, ncclAlltoAll, const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclComm* comm, cudaStream_t stream);
+// AlltoAll entry point: every rank sends `count` elements of `datatype` to each
+// rank and receives the same amount from each rank.
+//
+// The call is recorded (unless MSCCL is the caller), handed to MSCCL when it is
+// available, and otherwise enqueued as one of:
+//   * AlltoAllPivot - when the topology supports it, enough channels exist, the
+//     per-rank payload is at least 744 KiB and not merely 4-byte aligned, and
+//     the ALL_TO_ALL_PIVOT_ENABLE parameter is set;
+//   * AllToAllGda   - ENABLE_ROCSHMEM builds only, when rcclUseAllToAllGda()
+//     agrees and the total message fits comm->rocshmemThreshold;
+//   * AlltoAll      - in every other case.
+//
+// Returns the result of the selected enqueue call.
+ncclResult_t ncclAlltoAll_impl(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclComm* comm, cudaStream_t stream) {
+  NVTX3_FUNC_WITH_PARAMS(AlltoAll, NcclNvtxParamsAlltoAll,
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), datatype));
+
+  if (!mscclIsCaller()) // when msccl falls back to
+  {
+    NCCLCHECK(Recorder::instance().record(rrAllToAll, sendbuff, recvbuff, count, datatype, comm, stream));
+  }
+
+  if (mscclAvailable(comm) && !mscclIsCaller()) {
+    return mscclEnqueueCheck(
+      sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
+      count, datatype, 0, 0, ncclSum, mscclFuncAllToAll, comm, stream);
+  }
+
+  // Bytes exchanged with each peer
+  size_t rankOffset = count * ncclTypeSize(datatype);
+  // Lowest set bit of rankOffset, i.e. the largest power of two dividing it
+  size_t rankAlign = rankOffset & ((~rankOffset) + 1);
+
+  struct ncclInfo info;
+  if (comm->topo->pivotA2AEnabled && comm->nChannels >= comm->topo->pivotA2ANumBiRings * 2 &&
+      rankOffset >= 744 * 1024 && rankAlign != 4 && rcclParamAlltoAllPivotEnable()) {
+    info = { ncclFuncAlltoAllPivot, "AlltoAllPivot",
+      sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
+      ALLTOALL_PIVOT_CHUNKSTEPS, ALLTOALL_PIVOT_SLICESTEPS, nullptr };
+  } else {
+#ifdef ENABLE_ROCSHMEM
+    // Total bytes this rank sends; only needed for the GDA threshold test
+    size_t msgSize = rankOffset * comm->nRanks;
+    if (rcclUseAllToAllGda(comm) && msgSize <= comm->rocshmemThreshold) {
+      // Reuse the outer `info` instead of declaring a second one that shadows it
+      info = { ncclFuncAllToAllGda, "AllToAllGda",
+        sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream,
+        ALLTOALL_PIVOT_CHUNKSTEPS, ALLTOALL_PIVOT_SLICESTEPS, nullptr };
+
+      return ncclEnqueueCheck(&info);
+    }
+#endif // ENABLE_ROCSHMEM
+    info = { ncclFuncAlltoAll, "AlltoAll",
+      sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
+      ALLTOALL_CHUNKSTEPS, ALLTOALL_SLICESTEPS };
+  }
+  return ncclEnqueueCheck(&info);
+}
+
+NCCL_API(ncclResult_t, ncclAlltoAllv, const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
+    void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
+    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
+// AlltoAllv entry point: a variable-count all-to-all.
+//
+// For each rank r, sendcounts[r] elements starting at element displacement
+// sdispls[r] of sendbuff are sent to r, and recvcounts[r] elements from r are
+// received at element displacement rdispls[r] of recvbuff. Displacements are
+// multiplied by ncclTypeSize(datatype) to obtain byte offsets.
+//
+// The call is recorded (unless MSCCL is the caller), handed to MSCCL when it is
+// available, and otherwise expressed as one ncclSend/ncclRecv pair per rank
+// inside a single group.
+//
+// Returns ncclSuccess, the MSCCL enqueue result, or the first error reported by
+// an NCCLCHECK'ed call.
+// NOTE(review): an NCCLCHECK failure between ncclGroupStart() and the final
+// Recorder skip(false) returns immediately, leaving the group open and the
+// recorder in skip mode - confirm whether callers rely on this.
+ncclResult_t ncclAlltoAllv_impl(const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
+    void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
+    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
+  // NOTE(review): comm is null-checked for commHash but comm->rank is read
+  // unconditionally in the same payload expression.
+  NVTX3_FUNC_WITH_PARAMS(AlltoAllv, NcclNvtxParamsAlltoAllv,
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcounts[comm->rank] * ncclTypeSize(datatype),
+      recvcounts[comm->rank] * ncclTypeSize(datatype), datatype));
+
+  if (!mscclIsCaller()) // when msccl falls back to
+  {
+    NCCLCHECK(Recorder::instance().record(rrAllToAllv, sendbuff, recvbuff, 0, datatype, comm, stream, -1, sendcounts, sdispls, recvcounts, rdispls));
+  }
+
+  // Prefer the MSCCL implementation unless MSCCL itself is calling us
+  if (mscclAvailable(comm) && !mscclIsCaller()) {
+    return mscclEnqueueCheck(
+      sendbuff, sendcounts, sdispls, recvbuff, recvcounts, rdispls,
+      0, datatype, 0, 0, ncclSum, mscclFuncAllToAllv, comm, stream);
+  }
+
+  int nRanks;
+  NCCLCHECK(ncclCommCount(comm, &nRanks));
+  // The whole operation was already recorded above; put the recorder in skip
+  // mode while the individual sends/receives are issued
+  if (!mscclIsCaller()) Recorder::instance().skip(true);
+  NCCLCHECK(ncclGroupStart());
+  for (int r=0; r<nRanks; r++) {
+    // Outgoing block for rank r
+    NCCLCHECK(ncclSend(
+        ((char*)sendbuff) + sdispls[r]*ncclTypeSize(datatype),
+        sendcounts[r],
+        datatype,
+        r,
+        comm,
+        stream));
+    // Incoming block from rank r
+    NCCLCHECK(ncclRecv(
+        ((char*)recvbuff) + rdispls[r]*ncclTypeSize(datatype),
+        recvcounts[r],
+        datatype,
+        r,
+        comm,
+        stream));
+  }
+  NCCLCHECK(ncclGroupEnd());
+  if (!mscclIsCaller()) Recorder::instance().skip(false);
+  return ncclSuccess;
+}
+
 NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
-
-
 ncclResult_t ncclAllReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
  NVTX3_FUNC_WITH_PARAMS(AllReduce, NcclNvtxParamsAllReduce,
@@ -186,104 +293,8 @@ ncclResult_t ncclAllReduceWithBias_impl(const void* sendbuff, void* recvbuff, si
  return ncclEnqueueCheck(&info);
 }

-RCCL_PARAM(AllToAllPivotEnable, "ALL_TO_ALL_PIVOT_ENABLE", 0);
-
-NCCL_API(ncclResult_t, ncclAllToAll, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
-  ncclComm_t comm, hipStream_t stream);
-
-
-ncclResult_t ncclAllToAll_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
-  ncclComm_t comm, hipStream_t stream) {
-  NVTX3_FUNC_WITH_PARAMS(AllToAll, NcclNvtxParamsAllToAll,
-    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), datatype));
-
-  if (!mscclIsCaller()) // when msccl falls back to
-  {
-    NCCLCHECK(Recorder::instance().record(rrAllToAll, sendbuff, recvbuff, count, datatype, comm, stream));
-  }
-
-  if (mscclAvailable(comm) && !mscclIsCaller()) {
-    return mscclEnqueueCheck(
-      sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
-      count, datatype, 0, 0, ncclSum, mscclFuncAllToAll, comm, stream);
-  }
-
-  size_t rankOffset = count * ncclTypeSize(datatype);
-  size_t rankAlign = rankOffset & ((~rankOffset) + 1);
-  // Determine Pivot A2A support now that we know number of channels
-  if (comm->topo->pivotA2AEnabled && comm->nChannels >= comm->topo->pivotA2ANumBiRings * 2 &&
-      rankOffset >= 744 * 1024 && rankAlign != 4 && rcclParamAllToAllPivotEnable()) {
-    struct ncclInfo info = { ncclFuncAllToAllPivot, "AllToAllPivot",
-      sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
-      ALLTOALL_PIVOT_CHUNKSTEPS, ALLTOALL_PIVOT_SLICESTEPS, nullptr };
-    return ncclEnqueueCheck(&info);
-  } else {
-    int nRanks;
-    NCCLCHECK(ncclCommCount(comm, &nRanks));
-    if (count == 0) return ncclSuccess;
-    if (!mscclIsCaller()) Recorder::instance().skip(true);
-    NCCLCHECK(ncclGroupStart());
-    for (int r=0; r<nRanks; r++) {
-      NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, count, datatype, r, comm, stream));
-      NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, count, datatype, r, comm, stream));
-    }
-    NCCLCHECK(ncclGroupEnd());
-    if (!mscclIsCaller()) Recorder::instance().skip(false);
-    return ncclSuccess;
-  }
-}
-
-NCCL_API(ncclResult_t, ncclAllToAllv, const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
-    void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
-    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
-
-
-ncclResult_t ncclAllToAllv_impl(const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
-    void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
-    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
-  NVTX3_FUNC_WITH_PARAMS(AllToAllv, NcclNvtxParamsAllToAllv,
-    NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcounts[comm->rank] * ncclTypeSize(datatype),
-      recvcounts[comm->rank] * ncclTypeSize(datatype), datatype));
-
-  if (!mscclIsCaller()) // when msccl falls back to
-  {
-    NCCLCHECK(Recorder::instance().record(rrAllToAllv, sendbuff, recvbuff, 0, datatype, comm, stream, -1, sendcounts, sdispls, recvcounts, rdispls));
-  }
-
-  if (mscclAvailable(comm) && !mscclIsCaller()) {
-    return mscclEnqueueCheck(
-      sendbuff, sendcounts, sdispls, recvbuff, recvcounts, rdispls,
-      0, datatype, 0, 0, ncclSum, mscclFuncAllToAllv, comm, stream);
-  }
-
-  int nRanks;
-  NCCLCHECK(ncclCommCount(comm, &nRanks));
-  if (!mscclIsCaller()) Recorder::instance().skip(true);
-  NCCLCHECK(ncclGroupStart());
-  for (int r=0; r<nRanks; r++) {
-    NCCLCHECK(ncclSend(
-        ((char*)sendbuff) + sdispls[r]*ncclTypeSize(datatype),
-        sendcounts[r],
-        datatype,
-        r,
-        comm,
-        stream));
-    NCCLCHECK(ncclRecv(
-        ((char*)recvbuff) + rdispls[r]*ncclTypeSize(datatype),
-        recvcounts[r],
-        datatype,
-        r,
-        comm,
-        stream));
-  }
-  NCCLCHECK(ncclGroupEnd());
-  if (!mscclIsCaller()) Recorder::instance().skip(false);
-  return ncclSuccess;
-}
-
 NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);
-
 ncclResult_t ncclBroadcast_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream) {
  NVTX3_FUNC_WITH_PARAMS(Broadcast, NcclNvtxParamsBroadcast,
@@ -315,46 +326,32 @@ ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int ro
  return ncclBroadcast(buff, buff, count, datatype, root, comm, stream);
 }

-NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t sendcount,
-    ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
-
-ncclResult_t ncclGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount,
-    ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream) {
+NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm* comm, cudaStream_t stream);
+ncclResult_t ncclGather_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm* comm, cudaStream_t stream) {
  NVTX3_FUNC_WITH_PARAMS(Gather, NcclNvtxParamsGather,
-    NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcount * ncclTypeSize(datatype), root, datatype));
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), root));

  if (!mscclIsCaller()) // when msccl falls back to
  {
-    NCCLCHECK(Recorder::instance().record(rrGather, sendbuff, recvbuff, sendcount, datatype, comm, stream, root));
+    NCCLCHECK(Recorder::instance().record(rrGather, sendbuff, recvbuff, count, datatype, comm, stream, root));
  }

  if (mscclAvailable(comm) && !mscclIsCaller()) {
    return mscclEnqueueCheck(
      sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
-      sendcount, datatype, root, 0, ncclSum, mscclFuncGather, comm, stream);
+      count, datatype, root, 0, ncclSum, mscclFuncGather, comm, stream);
  }

-  int nRanks;
-  NCCLCHECK(ncclCommCount(comm, &nRanks));
-  size_t rankOffset = sendcount * ncclTypeSize(datatype);
-  if (sendcount == 0) return ncclSuccess;
-  int rank;
-  NCCLCHECK(ncclCommUserRank(comm, &rank));
-  if (!mscclIsCaller()) Recorder::instance().skip(true);
-  NCCLCHECK(ncclGroupStart());
-  if (rank == root) {
-    for (int r=0; r<nRanks; r++)
-      NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, sendcount, datatype, r, comm, stream));
-  }
-  NCCLCHECK(ncclSend(sendbuff, sendcount, datatype, root, comm, stream));
-  NCCLCHECK(ncclGroupEnd());
-  if (!mscclIsCaller()) Recorder::instance().skip(false);
-  return ncclSuccess;
+  struct ncclInfo info = { ncclFuncGather, "Gather",
+    sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
+    GATHER_CHUNKSTEPS, GATHER_SLICESTEPS };
+  return ncclEnqueueCheck(&info);
 }

 NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
-
 ncclResult_t ncclReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  NVTX3_FUNC_WITH_PARAMS(Reduce, NcclNvtxParamsReduce,
@@ -380,8 +377,6 @@ ncclResult_t ncclReduce_impl(const void* sendbuff, void* recvbuff, size_t count,

 NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
-
-
 ncclResult_t ncclReduceScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
  NVTX3_FUNC_WITH_PARAMS(ReduceScatter, NcclNvtxParamsReduceScatter,
@@ -405,48 +400,32 @@ ncclResult_t ncclReduceScatter_impl(const void* sendbuff, void* recvbuff, size_t
  return ncclEnqueueCheck(&info);
 }

-NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root,
-    ncclComm_t comm, hipStream_t stream);
-
-
-ncclResult_t ncclScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root,
-    ncclComm_t comm, hipStream_t stream) {
+NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, int root, ncclComm* comm, cudaStream_t stream);
+ncclResult_t ncclScatter_impl(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, int root, ncclComm* comm, cudaStream_t stream) {
  NVTX3_FUNC_WITH_PARAMS(Scatter, NcclNvtxParamsScatter,
-    NVTX3_PAYLOAD(comm ? comm->commHash : 0, recvcount * ncclTypeSize(datatype), root, datatype));
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), root, datatype));

  if (!mscclIsCaller()) // when msccl falls back to
  {
-    NCCLCHECK(Recorder::instance().record(rrScatter, sendbuff, recvbuff, recvcount, datatype, comm, stream, root));
+    NCCLCHECK(Recorder::instance().record(rrScatter, sendbuff, recvbuff, count, datatype, comm, stream, root));
  }

  if (mscclAvailable(comm) && !mscclIsCaller()) {
    return mscclEnqueueCheck(
      sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
-      recvcount, datatype, root, 0, ncclSum, mscclFuncScatter, comm, stream);
+      count, datatype, root, 0, ncclSum, mscclFuncScatter, comm, stream);
  }

-  int nRanks;
-  NCCLCHECK(ncclCommCount(comm, &nRanks));
-  size_t rankOffset = recvcount * ncclTypeSize(datatype);
-  if (recvcount == 0) return ncclSuccess;
-  int rank;
-  NCCLCHECK(ncclCommUserRank(comm, &rank));
-  if (!mscclIsCaller()) Recorder::instance().skip(true);
-  NCCLCHECK(ncclGroupStart());
-  if (rank == root) {
-    for (int r=0; r<nRanks; r++)
-      NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, recvcount, datatype, r, comm, stream));
-  }
-  NCCLCHECK(ncclRecv(recvbuff, recvcount, datatype, root, comm, stream));
-  NCCLCHECK(ncclGroupEnd());
-  if (!mscclIsCaller()) Recorder::instance().skip(false);
-  return ncclSuccess;
+  struct ncclInfo info = { ncclFuncScatter, "Scatter",
+    sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
+    SCATTER_CHUNKSTEPS, SCATTER_SLICESTEPS };
+  return ncclEnqueueCheck(&info);
 }

 NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);
-
-
 ncclResult_t ncclSend_impl(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream) {
  NVTX3_FUNC_WITH_PARAMS(Send, NcclNvtxParamsSendRecv,
@@ -472,7 +451,6 @@ ncclResult_t ncclSend_impl(const void* sendbuff, size_t count, ncclDataType_t da

 NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);
-
 ncclResult_t ncclRecv_impl(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream) {
  NVTX3_FUNC_WITH_PARAMS(Recv, NcclNvtxParamsSendRecv,
@@ -0,0 +1,39 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#include "nccl.h"
+#include <cstring>
+#include <string>
+#include <unordered_map>
+#include "comm.h"
+#include "device.h"
+#include "archinfo.h"
+
+// Logs the proxy trace of `comm` through WARN.
+//
+// Best-effort diagnostic: every early-out below logs why nothing was dumped
+// and still returns ncclSuccess.
+// NOTE(review): `map` is never written here; confirm whether callers expect
+// it to be populated.
+__attribute__ ((visibility("default")))
+ncclResult_t ncclCommDump(
+    const ncclComm_t comm,
+    std::unordered_map<std::string, std::string>& map) {
+  if (comm == nullptr) {
+    WARN("ncclCommDump comm is null");
+    return ncclSuccess;
+  }
+  // Guard proxyState itself before reaching through it for proxyTrace.
+  if (comm->proxyState == nullptr) {
+    WARN("ncclCommDump comm->proxyState is null");
+    return ncclSuccess;
+  }
+  if (comm->proxyState->proxyTrace == nullptr) {
+    WARN("ncclCommDump comm->proxyState->proxyTrace is null");
+    return ncclSuccess;
+  }
+
+  WARN("ncclCommDump() ProxyTrace:");
+  WARN("%s", comm->proxyState->proxyTrace->dump().c_str());
+
+  return ncclSuccess;
+}
@@ -28,7 +28,7 @@ static int pid = -1;
 static char hostname[1024];
 thread_local int ncclDebugNoWarn = 0;
 char ncclLastError[1024] = ""; // Global string for the last error in human readable form
-static uint64_t ncclDebugMask = 0;
+uint64_t ncclDebugMask = 0;
 FILE *ncclDebugFile = stdout;
 static pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
 static std::chrono::steady_clock::time_point ncclEpoch;
@@ -419,4 +419,4 @@ void ncclSetThreadName(pthread_t thread, const char *fmt, ...) {
  va_end(vargs);
  pthread_setname_np(thread, threadName);
 #endif
-}
+}
@@ -0,0 +1,71 @@
+# Run the scripts once during configuration to get the file lists.
+# The stdout of each generator is taken as the list of files it produces;
+# a failing generator aborts configuration instead of yielding an empty list.
+execute_process(
+    COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc "${ONLY_FUNCS}"
+    OUTPUT_VARIABLE files
+    RESULT_VARIABLE generate_result
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+)
+if(NOT generate_result EQUAL 0)
+    message(FATAL_ERROR "generate.py failed during configuration (result: ${generate_result})")
+endif()
+string(STRIP "${files}" files)
+list(TRANSFORM files PREPEND ${CMAKE_CURRENT_BINARY_DIR}/gensrc/)
+
+execute_process(
+    COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/symmetric/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc/symmetric "${ONLY_FUNCS}"
+    OUTPUT_VARIABLE symmetric_files
+    RESULT_VARIABLE symmetric_generate_result
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+)
+if(NOT symmetric_generate_result EQUAL 0)
+    message(FATAL_ERROR "symmetric/generate.py failed during configuration (result: ${symmetric_generate_result})")
+endif()
+string(STRIP "${symmetric_files}" symmetric_files)
+list(TRANSFORM symmetric_files PREPEND ${CMAKE_CURRENT_BINARY_DIR}/gensrc/symmetric/)
+
+# Create custom commands to generate source files with proper dependencies
+# so the generators rerun at build time whenever their script changes.
+add_custom_command(
+    OUTPUT  ${files}
+    BYPRODUCTS ${files}
+    COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc "${ONLY_FUNCS}"
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/generate.py
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+    COMMENT "Generating device source files"
+)
+
+add_custom_command(
+    OUTPUT  ${symmetric_files}
+    BYPRODUCTS ${symmetric_files}
+    COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/symmetric/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc/symmetric "${ONLY_FUNCS}"
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/symmetric/generate.py
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+    COMMENT "Generating symmetric device source files"
+)
+
+# Add library target
+add_library(nccl_device OBJECT
+            ${files}
+            ${symmetric_files}
+            ${CMAKE_CURRENT_SOURCE_DIR}/common.cu
+            ${CMAKE_CURRENT_SOURCE_DIR}/onerank.cu
+)
+
+set_target_properties(nccl_device PROPERTIES
+    CUDA_SEPARABLE_COMPILATION ON
+    CUDA_RESOLVE_DEVICE_SYMBOLS ON
+)
+
+# Set include directories for the target
+target_include_directories(nccl_device PUBLIC
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    ${CMAKE_SOURCE_DIR}/src/include
+    ${CMAKE_SOURCE_DIR}/src/include/plugin
+    ${CMAKE_BINARY_DIR}/include
+    ${CUDAToolkit_INCLUDE_DIRS}
+    ${CUDAToolkit_INCLUDE_DIRS}/cccl
+)
+
+add_dependencies(nccl_device nccl_header)
@@ -19,7 +19,7 @@ OBJDIR := $(BUILDDIR)/obj/device
 MANIFEST := $(OBJDIR)/manifest
 DEVGLUE_OBJ  := $(OBJDIR)/device_glue.o

-INCFLAGS  = -I. -I.. -I$(BUILDDIR)/include -I../include
+INCFLAGS  = -I. -I.. -I$(BUILDDIR)/include -I../include -I../include/plugin
 NVCUFLAGS += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden"
 CXXFLAGS  += $(INCFLAGS)

@@ -47,7 +47,14 @@ endif
+# COMPILE_SYM: $1 = output object, $2 = source file, $3 = extra nvcc flags.
+# When $3 is empty an empty placeholder .cu is compiled to $1 instead of $2.
+# The emptiness test uses POSIX `[` because make runs recipes with /bin/sh by default.
 define COMPILE_SYM
@$(SAY) "Compiling" $2;\
 mkdir -p $(dir $1);\
- $(NVCC) $(NVCUFLAGS_SYM) $3 -dw $2 -o $1
+ if [ -n "$3" ]; then\
+ $(NVCC) $(NVCUFLAGS_SYM) $3 -dw $2 -o $1;\
+ else\
+ touch $2.empty.cu; $(NVCC) $(NVCUFLAGS_SYM) -dw $2.empty.cu -o $1; rm $2.empty.cu;\
+ fi
 endef

 DEPENDS.cu = $(NVCC) $(NVCUFLAGS) -M -dc $1
@@ -20,11 +20,20 @@ namespace {
    const int bid = ncclShmem.channelId - work->channelLo;
    int npKitCtxIdx = bid; // unused variable - compiler warning
 #endif
+#ifdef ENABLE_WARP_SPEED
+    int warp = threadIdx.x / WARP_SIZE;
+    ncclRing *ring = &ncclShmem.warpChannel[warp].ring;
+#else
    ncclRing *ring = &ncclShmem.channel.ring;
+#endif
    const int *ringRanks = ring->userRanks;
    const int nranks = ncclShmem.comm.nRanks;
    ssize_t count, partOffset, partCount, chunkCount;
+#ifdef ENABLE_WARP_SPEED
+    ncclCollCbdPart(work, ncclShmem.warpChannelId[warp], Proto::Id, sizeof(T), &count, &partOffset, &partCount, &chunkCount);
+#else
    ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &partOffset, &partCount, &chunkCount);
+#endif
    ssize_t offset;
    ssize_t dataOffset;
    int nelem;
@@ -142,7 +151,7 @@ namespace {
 #endif
        // Final wait/copy.
        prims.directRecv(offset, nelem);
-  
+
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT)
        if (tid == 0) {
          NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
@@ -671,4 +680,4 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
      return;
    }
  }
-};
+};
@@ -20,8 +20,14 @@ namespace {
 #else
  __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #endif
+#ifdef ENABLE_WARP_SPEED
+    int warp = threadIdx.x / WARP_SIZE;
+    ncclRing *ring = &ncclShmem.warpChannel[warp].ring;
+#else
    ncclRing *ring = &ncclShmem.channel.ring;
+#endif
    int ringIx = ring->index;
+
    const int nranks = ncclShmem.comm.nRanks;
 #if defined(ENABLE_NPKIT)
    const int bid = ncclShmem.channelId - work->channelLo;
@@ -31,7 +37,11 @@ namespace {
    ssize_t gridOffset;
    ssize_t channelCount;
    ssize_t chunkCount;
+#ifdef ENABLE_WARP_SPEED
+    ncclCollCbdPart(work, ncclShmem.warpChannelId[warp], Proto::Id, sizeof(T), &size, &gridOffset, &channelCount, &chunkCount);
+#else
    ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &size, &gridOffset, &channelCount, &chunkCount);
+#endif
    const ssize_t loopCount = nranks * chunkCount;
    ssize_t offset;
    int nelem;
@@ -0,0 +1,39 @@
+/*************************************************************************
+ * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "device.h"
+#include "collectives.h"
+#include "primitives.h"
+
+#ifdef ENABLE_ROCSHMEM
+#include <rocshmem/rocshmem.hpp>
+
+// RunWorkColl specialization for AllToAllGda (ring algorithm, Simple protocol).
+// Only block 0 does any work: it copies work->size * num_pes items from
+// work->sendbuff into work->sndbuff, runs a workgroup-level rocSHMEM char
+// all-to-all between work->sndbuff and work->tempbuff, then copies the same
+// amount from work->tempbuff to work->recvbuff.
+// NOTE(review): copy directions are inferred from reduceCopy's argument order -- confirm.
+template<typename T, typename RedOp>
+struct RunWorkColl<ncclFuncAllToAllGda, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
+  __device__ __forceinline__ void run(int tid, int nThreads, struct ncclDevWorkColl* work) {
+    if (blockIdx.x == 0) {
+      int num_pes = rocshmem::rocshmem_n_pes();
+
+      reduceCopy<COLL_UNROLL, USE_ACC, RedOp, T, 0,1, 1, 0, 1, 1, 0>(
+          tid, nThreads, 0, nullptr, false, 1, (void **)&work->sendbuff, 1, (void **)&work->sndbuff,
+          (work->size*num_pes));
+
+      rocshmem::rocshmem_char_alltoall_wg(work->team, ((char*)work->tempbuff), ((char*)work->sndbuff), work->size);
+
+      reduceCopy<COLL_UNROLL, USE_ACC, RedOp, T, 0,1, 1, 0, 1, 1, 0>(
+          tid, nThreads, 0, nullptr, false, 1, (void **)&work->tempbuff, 1, (void **)&work->recvbuff,
+          (work->size*num_pes));
+    }
+  }
+};
+#endif
+
@@ -75,7 +75,7 @@ namespace {
 }

 template<typename T, typename RedOp>
-struct RunWorkColl<ncclFuncAllToAllPivot, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
+struct RunWorkColl<ncclFuncAlltoAllPivot, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
  __device__ __forceinline__ void run(int tid, int nThreads, struct ncclDevWorkColl* work) {
    using Proto = ProtoSimple<ALLTOALL_PIVOT_CHUNKSTEPS/ALLTOALL_PIVOT_SLICESTEPS, ALLTOALL_PIVOT_SLICESTEPS>;
    runRing<T, RedOp, Proto>(tid, nThreads, work);
@@ -19,7 +19,12 @@ namespace {
    const int bid = ncclShmem.channelId - work->channelLo;
    int npKitCtxIdx = bid; // unused variable - compiler warning
 #endif
+#ifdef ENABLE_WARP_SPEED
+    int warp = threadIdx.x / WARP_SIZE;
+    ncclRing *ring = &ncclShmem.warpChannel[warp].ring;
+#else
    ncclRing *ring = &ncclShmem.channel.ring;
+#endif
    const int rank = ring->userRanks[0];
    const int nextRank = ring->userRanks[1];
    const int root = work->root;
@@ -27,7 +32,11 @@ namespace {
    ssize_t chunkCount;
    ssize_t channelCount;
    ssize_t gridOffset;
+#ifdef ENABLE_WARP_SPEED
+    ncclCollCbdPart(work, ncclShmem.warpChannelId[warp], Proto::Id, sizeof(T), &size, &gridOffset, &channelCount, &chunkCount);
+#else
    ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &size, &gridOffset, &channelCount, &chunkCount);
+#endif
    size_t offset;
    int nelem;
    int workNthreads;
@@ -17,24 +17,24 @@ struct RunWorkNop {
  __device__ void run() {}
 };

-__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
-  ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/1>(&args4K.args);
+__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_1(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {
+  ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/1>(&argsStorage.args);
 }
-__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
-  ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/2>(&args4K.args);
+__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_2(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {
+  ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/2>(&argsStorage.args);
 }
-__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
-  ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/4>(&args4K.args);
+__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_4(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {
+  ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/4>(&argsStorage.args);
 }
 #ifdef ENABLE_COLLTRACE
-__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
-  ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/1>(&args4K.args);
+__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {
+  ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/1>(&argsStorage.args);
 }
-__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
-  ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/2>(&args4K.args);
+__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {
+  ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/2>(&argsStorage.args);
 }
-__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
-  ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/4>(&args4K.args);
+__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {
+  ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/4>(&argsStorage.args);
 }
 #endif

@@ -27,17 +27,30 @@
 #endif

 #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1200__) || defined(__gfx1201__)
-#define __trace_hwreg()
+#define __trace_hwreg() \
+  collTrace->data_0 = 0;
 #else
 #define __trace_hwreg() \
-  asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (collTrace->data_0));
+  { int32_t hwid; \
+    asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (hwid)); \
+    collTrace->data_0 = hwid >> 4; }
 #endif
+
+#if defined(__gfx942__) || defined(__gfx950__)
+#define __trace_xccid() \
+  { int32_t xccId; \
+    asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (xccId)); \
+    collTrace->xccId = xccId; }
+#else
+#define __trace_xccid() \
+  collTrace->xccId = 0;
+#endif
+
 #ifdef ENABLE_COLLTRACE
  #define INC_COLL_TRACE \
    uint32_t pos = __hip_atomic_fetch_add(&ncclShmem.collTraceTail->tail, 1, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_WORKGROUP)%COLLTRACE_NUM_ITEMS; \
    struct ncclCollTrace* collTrace = ncclShmem.collTrace+pos; \
    collTrace->timeStamp = wall_clock64(); \
-    collTrace->bid = blockIdx.x; \
    collTrace->tid = threadIdx.x; \
    collTrace->channelId = ncclShmem.channelId;
    // TODO: switch to atomicInc after llvm crash is fixed
@@ -46,7 +59,8 @@
  #define traceKernelLaunch(launch_type, ix) { \
    INC_COLL_TRACE \
    collTrace->funcIndex = ncclShmem.funcId; \
-    __trace_hwreg()\
+    __trace_hwreg() \
+    __trace_xccid() \
    collTrace->batchIx = ix; \
    if (ncclShmem.workType == ncclDevWorkTypeP2p) { \
      struct ncclDevWorkP2p *p2pWork = (struct ncclDevWorkP2p*)ncclShmem.workStorage; \
@@ -63,7 +77,7 @@
      collTrace->p2p.recvRegistered = p2pWork->recvNetReg; \
      collTrace->p2pOpCount[0] = p2pWork->sendOpCount; \
      collTrace->p2pOpCount[1] = p2pWork->recvOpCount; \
-      collTrace->type = (launch_type) | ncclCollTraceP2pElemType; \
+      __hip_atomic_store(&collTrace->type, (launch_type) | ncclCollTraceP2pElemType, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); \
    } else if (ncclShmem.workType == ncclDevWorkTypeColl) { \
      struct ncclDevWorkColl *collWork = (struct ncclDevWorkColl*)ncclShmem.workStorage; \
      collTrace->coll.nWarps = collWork->nWarps; \
@@ -71,7 +85,7 @@
      collTrace->coll.bid = ncclShmem.channelId - collWork->channelLo; \
      collTrace->coll.root = collWork->root; \
      collTrace->opCount = collWork->opCount; \
-      collTrace->type = (launch_type) | ncclCollTraceCollElemType; \
+      __hip_atomic_store(&collTrace->type, (launch_type) | ncclCollTraceCollElemType, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); \
    } \
  }
  #define traceKernelEnd(end_type)  { \
@@ -81,11 +95,11 @@
      struct ncclDevWorkP2p *p2pWork = (struct ncclDevWorkP2p*)ncclShmem.workStorage; \
      collTrace->p2pOpCount[0] = p2pWork->sendOpCount; \
      collTrace->p2pOpCount[1] = p2pWork->recvOpCount; \
-      collTrace->type = (end_type) | ncclCollTraceP2pElemType; \
+      __hip_atomic_store(&collTrace->type, (end_type) | ncclCollTraceP2pElemType, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); \
    } else if (ncclShmem.workType == ncclDevWorkTypeColl) { \
      struct ncclDevWorkColl *collWork = (struct ncclDevWorkColl*)ncclShmem.workStorage; \
      collTrace->opCount = collWork->opCount; \
-      collTrace->type = (end_type) | ncclCollTraceCollElemType; \
+      __hip_atomic_store(&collTrace->type, (end_type) | ncclCollTraceCollElemType, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); \
    } \
  }
  #define traceData(data2, data4, data8_0, data8_1) { \
@@ -94,12 +108,12 @@
    collTrace->data_0 = data4; \
    collTrace->opCount = data8_0; \
    collTrace->data_1 = data8_1; \
-    collTrace->type = ncclCollTraceDataType; \
+    __hip_atomic_store(&collTrace->type, ncclCollTraceDataType, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); \
  }
  #define traceAbort(){\
    INC_COLL_TRACE\
    collTrace->funcIndex = ncclShmem.funcId;\
-    collTrace->type = ncclCollTraceAbortType;\
+    __hip_atomic_store(&collTrace->type, ncclCollTraceAbortType, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); \
  }
 #else
 #define traceKernelLaunch(launch_type, batchIx)
@@ -136,9 +150,13 @@ struct ncclShmemData {
  struct ncclDevKernelArgs args;
  int channelId;
  int aborted;
-  alignas(16) struct ncclDevComm comm;
+  alignas(16) struct ncclKernelComm comm;
  alignas(16) struct ncclDevChannel channel;
-
+#ifdef ENABLE_WARP_SPEED
+  int warpComm;
+  alignas(16) struct ncclDevChannel warpChannel[NCCL_MAX_GROUPS];
+  int warpChannelId[NCCL_MAX_GROUPS];
+#endif
  int batchIx, nextBatchIx;
  enum ncclDevWorkType workType;
  uint8_t directMode;
@@ -284,10 +302,10 @@ __device__ __forceinline__ void loadWorkBatchToShmem(

    if (WARP_SIZE == 64) {
      if (uint64_t(batch.offsetBitset) & (1ull<<lane)) {
-        int nWorksBelow = __popc(uint64_t(batch.offsetBitset) & ((1ull<<lane)-1));
+        int nWorksBelow = __popcll(uint64_t(batch.offsetBitset) & ((1ull<<lane)-1));
        fnsOfBitset[nWorksBelow] = lane;
      }
-      nWorks = __popc(uint64_t(batch.offsetBitset));
+      nWorks = __popcll(uint64_t(batch.offsetBitset));
    } else {
      // WARP_SIZE == 32
      if (uint32_t(batch.offsetBitset) & (1u<<lane)) {
@@ -442,10 +460,17 @@ struct RunWorkBatch {
        if (work->nWarps != workPrev->nWarps) __syncthreads();
      }
      int subtn = work->nWarps*WARP_SIZE;
+#ifdef ENABLE_WARP_SPEED
+      if (tid < subtn) {
+        if(ncclShmem.warpComm == 0 || Algo != NCCL_ALGO_RING) RunWorkColl<Fn, T, RedOp, Algo, Proto>().run(tid, subtn, work);
+        else if (ncclShmem.warpChannelId[tid / WARP_SIZE] >= 0) RunWorkColl<Fn, T, RedOp, Algo, Proto>().run(tid % WARP_SIZE, WARP_SIZE, work);
+      }
+#else
      // Coverity reports a possible thread divergence due to not all threads participating in the collective.
      // However, the code ensures that the participation is on a per-warp basis.
      // coverity[device_thread_diverged:FALSE]
      if (tid < subtn) RunWorkColl<Fn, T, RedOp, Algo, Proto>().run(tid, subtn, work);
+#endif
    }
  }
 };
@@ -477,7 +502,7 @@ __device__ __forceinline__ void profiler(int action) {
        ncclShmem.comm.workCompleted[ncclShmem.channelId].data[wc%MAX_PROFILER_EVENTS_PER_CHANNEL].counter = wc;
      }
      ncclShmem.channel.workCounter += ncclShmem.nWorks;
-      if (action == FINI) ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter;
+      if (action == FINI) ((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter;
    }
  }
 }
@@ -489,7 +514,12 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
  int x = tid;
  int total = 0, y;
  int num = MAXCHANNELS/64 > 0 ? MAXCHANNELS/64 : 1;
-
+#ifdef ENABLE_WARP_SPEED
+  int warpCount    = tn / WARP_SIZE;
+  int localWarpId  = tid / WARP_SIZE;
+  int globalWarpId = (warpCount * blockIdx.x) + localWarpId;
+  int laneId = tid % WARP_SIZE;
+#endif
  // Copy kernel args to shmem and then only read those. Otherwise the compiler
  // will end up putting the args into thread local stack which is very wasteful.
  if (tid < sizeof(ncclDevKernelArgs)/sizeof(uint32_t)) {
@@ -549,7 +579,7 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
  /* set abort flag to 0 */
  if (tid == 0) {
    ncclShmem.aborted = 0;
-    ncclShmem.channel.workCounter = ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter;
+    ncclShmem.channel.workCounter = ((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter;
  }

  // Use first 2 warps to load comm and channel, and remaining load work batch.
@@ -557,14 +587,14 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
  case 0:
    { void* dst = &ncclShmem.comm;
      void* src = ncclShmem.args.comm;
-      int bytes = sizeof(ncclDevComm);
-      static_assert(sizeof(ncclDevComm) <= 16*WARP_SIZE, "ncclDevComm cannot be loaded by a single warp in one insn.");
+      int bytes = sizeof(ncclKernelComm);
+      static_assert(sizeof(ncclKernelComm) <= 16*WARP_SIZE, "ncclKernelComm cannot be loaded by a single warp in one insn.");
      copyToShmem16(tid, dst, src, bytes);
    } break;
  case 1:
-    { // Get address of channel without incurring indirect load from ncclDevComm::channels
+    { // Get address of channel without incurring indirect load from ncclKernelComm::channels
      void* dst = &ncclShmem.channel;
-      void* src = &((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId];
+      void* src = &((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId];
      int bytes = sizeof(ncclDevChannel);
      static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn.");
      copyToShmem16(tid-WARP_SIZE, dst, src, bytes);
@@ -583,9 +613,52 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
    ncclShmem.collTrace = args->comm->collTrace + COLLTRACE_NUM_ITEMS*ncclShmem.channelId;
    ncclShmem.collTraceTail = args->comm->collTraceTail + ncclShmem.channelId;
  }
+#endif
+#ifdef ENABLE_WARP_SPEED
+  if(tid == 0) {
+    ncclShmem.warpComm = args->comm->warpLevelComm;
+  }
 #endif
  __syncthreads(); // publish shmem

+#ifdef ENABLE_WARP_SPEED
+  // Determine per-warp channel assignment for WarpSpeed enablement
+  total = 0;
+  if(ncclShmem.warpComm == 1) {  // If warpComm is enabled, assign warps to channels that have the corresponding channel mask enabled
+    ncclShmem.warpChannelId[localWarpId] = -1;
+     __syncthreads();
+    for (int i = 0; i < num; i++) {
+      if (args->channelMask.masks[i] & (1ull<<laneId)) {
+        y = __popcll(args->channelMask.masks[i] & ((1ull<<laneId)-1));
+        y = total + y;
+        if (globalWarpId == y) {
+          ncclShmem.warpChannelId[localWarpId] = laneId + total;
+          break;
+        }
+      }
+      total = total + __popcll(args->channelMask.masks[i]);
+    }
+    __syncthreads();
+    if(ncclShmem.warpChannelId[localWarpId] >= 0) {
+      void* dst = &ncclShmem.warpChannel[localWarpId];
+      void* src = &((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.warpChannelId[localWarpId]];
+      int bytes = sizeof(ncclDevChannel);
+      static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn.");
+      // assert((tid-localWarpId*WARP_SIZE) >= 0 && (tid-localWarpId*WARP_SIZE) < WARP_SIZE);
+      copyToShmem16(tid-localWarpId*WARP_SIZE, dst, src, bytes);
+    }
+  } else {  // If warpComm is disabled, all warps use the same channel as the block
+    if(laneId == 0) {
+      ncclShmem.warpChannelId[localWarpId] = ncclShmem.channelId;
+    }
+    // Use all threads in the warp to copy the channel data in parallel
+    void* dst = &ncclShmem.warpChannel[localWarpId];
+    void* src = &ncclShmem.channel;
+    int bytes = sizeof(ncclDevChannel);
+    copyToShmem16(laneId, dst, src, bytes);
+  }
+  __syncthreads();
+#endif
 #ifdef ENABLE_PROFILING
  if (tid == 0) {
    ncclShmem.prof.count = 0;
@@ -648,17 +721,17 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
 #endif
 }

-__global__ void ncclDevKernel_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
-__global__ void ncclDevKernel_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
-__global__ void ncclDevKernel_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
+__global__ void ncclDevKernel_Generic_1(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage);
+__global__ void ncclDevKernel_Generic_2(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage);
+__global__ void ncclDevKernel_Generic_4(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage);
 #ifdef ENABLE_COLLTRACE
-__global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
-__global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
-__global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
+__global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage);
+__global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage);
+__global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage);
 #endif

 #define DEFINE_ncclDevKernel_nop(suffix, coll, redop, ty, algo, proto, specializedFnId) \
-  __global__ void ncclDevKernel_##suffix(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {}
+  __global__ void ncclDevKernel_##suffix(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {}

 #ifdef USE_INDIRECT_FUNCTION_CALL
 #define DEFINE_ncclDevFunc(suffix, coll, redop, ty, algo, proto, acc, pipeline, unroll) \
@@ -3,9 +3,10 @@ import os
 import sys
 import subprocess
 from dataclasses import dataclass
+import shutil

 # Order of colls, redops, tys, protos, algos must match src/include/device.h
-all_colls     = ["Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv", "", "", "AllToAllPivot"]
+all_colls     = ["Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv", "", "", "AlltoAllPivot", "AllToAllGda"]
 all_redops    = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"]
 all_tys       = ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16","f8e4m3","f8e5m2"]
 all_protos    = ["LL","LL128","SIMPLE"]
@@ -24,8 +25,11 @@ gensrc = sys.argv[1]

 if os.path.exists(gensrc):
  for name in os.listdir(gensrc):
-    os.remove(os.path.join(gensrc, name))
-    #os.truncate(os.path.join(gensrc, name), 0)
+    path = os.path.join(gensrc, name)
+    if os.path.isfile(path):
+      os.remove(path)
+    elif os.path.isdir(path):
+      shutil.rmtree(path)
 else:
  os.makedirs(gensrc)

@@ -64,7 +68,7 @@ else:
 # make ONLY_FUNCS="AllReduce RING SIMPLE * *|ReduceScatter RING LL * f32"
 #                         --- or ---
 # make ONLY_FUNCS="AllReduce RING SIMPLE|ReduceScatter RING LL * f32"
-# make ONLY_FUNCS="AllReduce RING/TREE LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|AllGather RING LL/SIMPLE Sum i8|AllToAllPivot RING SIMPLE Sum i8|Broadcast RING LL/SIMPLE Sum i8|Reduce RING LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|ReduceScatter RING LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|SendRecv RING SIMPLE Sum i8"
+# make ONLY_FUNCS="AllReduce RING/TREE LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|AllGather RING LL/SIMPLE Sum i8|AlltoAllPivot RING SIMPLE Sum i8|Broadcast RING LL/SIMPLE Sum i8|Reduce RING LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|ReduceScatter RING LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|SendRecv RING SIMPLE Sum i8"

 # Paste all non-None arguments together with `sep`.
 def paste(sep, *args):
@@ -79,14 +83,15 @@ func_pattern = sys.argv[6:7]
 if func_pattern and func_pattern[0]:
  func_pattern = func_pattern[0]
 else:
-  func_pattern = "AllGather|AllReduce|AllToAllPivot|Broadcast|Reduce|ReduceScatter|SendRecv"
+  func_pattern = "AllGather|AllReduce|AlltoAllPivot|AllToAllGda|Broadcast|Reduce|ReduceScatter|SendRecv"

 ################################################################################

 algos_of_coll = {
  "AllGather":             ["RING", "PAT"],
  "AllReduce":             ["RING", "TREE"],
-  "AllToAllPivot":         ["RING"],
+  "AlltoAllPivot":         ["RING"],
+  "AllToAllGda":           ["RING"],
  "Broadcast":             ["RING"],
  "Reduce":                ["RING"],
  "ReduceScatter":         ["RING", "PAT"],
@@ -96,7 +101,8 @@ algos_of_coll = {
 protos_of_coll = {
  "AllGather":              all_protos,
  "AllReduce":              all_protos,
-  "AllToAllPivot":          ["SIMPLE"],
+  "AlltoAllPivot":          ["SIMPLE"],
+  "AllToAllGda":            ["SIMPLE"],
  "Broadcast":              all_protos,
  "Reduce":                 all_protos,
  "ReduceScatter":          all_protos,
@@ -106,7 +112,8 @@ protos_of_coll = {
 redops_of_coll = {
  "AllGather":            ["Sum"],
  "AllReduce":            all_redops,
-  "AllToAllPivot":        ["Sum"],
+  "AlltoAllPivot":        ["Sum"],
+  "AllToAllGda":          ["Sum"],
  "Broadcast":            ["Sum"],
  "Reduce":               all_redops,
  "ReduceScatter":        all_redops,
@@ -116,7 +123,8 @@ redops_of_coll = {
 tys_of_coll = {
  "AllGather":             ["i8"],
  "AllReduce":             all_tys,
-  "AllToAllPivot":         ["i8"],
+  "AlltoAllPivot":         ["i8"],
+  "AllToAllGda":           ["i8"],
  "Broadcast":             ["i8"],
  "Reduce":                all_tys,
  "ReduceScatter":         all_tys,
@@ -126,7 +134,8 @@ tys_of_coll = {
 acc_of_coll = {
  "AllGather":             ["0"],
  "AllReduce":             all_accs,
-  "AllToAllPivot":         ["0"],
+  "AlltoAllPivot":         ["0"],
+  "AllToAllGda":           ["0"],
  "Broadcast":             ["0"],
  "Reduce":                ["0"],
  "ReduceScatter":         ["0"],
@@ -136,7 +145,8 @@ acc_of_coll = {
 pipelines_of_coll = {
  "AllGather":             ["0"],
  "AllReduce":             all_pipelines,
-  "AllToAllPivot":         ["0"],
+  "AlltoAllPivot":         ["0"],
+  "AllToAllGda":           ["0"],
  "Broadcast":             ["0"],
  "Reduce":                all_pipelines,
  "ReduceScatter":         all_pipelines,
@@ -147,7 +157,8 @@ pipelined_types = ["bf16"]
 coll_camel_to_lower = {
  "AllGather":             "all_gather",
  "AllReduce":             "all_reduce",
-  "AllToAllPivot":         "alltoall_pivot",
+  "AlltoAllPivot":         "alltoall_pivot",
+  "AllToAllGda":           "alltoall_gda",
  "Broadcast":             "broadcast",
  "Reduce":                "reduce",
  "ReduceScatter":         "reduce_scatter",
@@ -503,7 +514,7 @@ with open(os.path.join(gensrc, "host_table.cpp"), "w") as f:
      )
      if fn.coll == "Broadcast":
        key = ((coll_idx & 0x3F) | ((proto_idx & 0x3F) << 8))
-      if fn.coll in ["SendRecv", "AllToAllPivot"]:
+      if fn.coll in ["SendRecv", "AlltoAllPivot", "AllToAllGda"]:
        key = ((coll_idx & 0x3F))
      
      out(f'  {{{key}, {fn_id}}}, {comment}\n')
@@ -93,7 +93,7 @@ __device__ __forceinline__ static void mscclReduce(int c, int numReductions, int

 template<typename T, typename RedOp, typename Proto, bool fullOps>
 __device__ __forceinline__ void mscclRunInterpreter(
-  struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work) {
+  struct ncclKernelComm* comm, struct mscclAlgo* algo, struct mscclWork* work) {
  const int tid = threadIdx.x;
  const int bid = blockIdx.x;
  const int nthreads = MSCCL_MAX_NTHREADS;
@@ -120,12 +120,12 @@ __device__ __forceinline__ void mscclRunInterpreter(
    case 0:
      dst = &ncclShmem.comm;
      src = comm;
-      bytes = sizeof(ncclDevComm);
+      bytes = sizeof(ncclKernelComm);
      break;
    case 1:
-      // Get address of channel without incurring indirect load from ncclDevComm::channels
+      // Get address of channel without incurring indirect load from ncclKernelComm::channels
      dst = &ncclShmem.channel;
-      src = &((ncclDevCommAndChannels*)comm)->channels[channelId];
+      src = &((ncclKernelCommAndChannels*)comm)->channels[channelId];
      bytes = sizeof(ncclDevChannel);
      break;
    case 2:
@@ -146,6 +146,9 @@ __device__ __forceinline__ void mscclRunInterpreter(
    }
    if (bytes) copyToShmem8(tid%WARP_SIZE, dst, src, bytes);
  }
+#ifdef ENABLE_WARP_SPEED
+   ncclShmem.warpComm = 0;
+#endif
  __syncthreads(); // publish shmem

 #if defined(ENABLE_NPKIT)
@@ -369,13 +372,13 @@ __device__ __forceinline__ void mscclRunInterpreter(
 }

 #define MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, type, fullOps) \
-__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, LL, fullOps)(struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
+__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, LL, fullOps)(struct ncclKernelComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
  mscclRunInterpreter<type, Func##devredop<type>, ProtoLL, fullOps>(comm, algo, work); \
 } \
-__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, LL128, fullOps)(struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
+__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, LL128, fullOps)(struct ncclKernelComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
  mscclRunInterpreter<type, Func##devredop<type>, ProtoLL128, fullOps>(comm, algo, work); \
 } \
-__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, Simple, fullOps)(struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
+__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, Simple, fullOps)(struct ncclKernelComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
  mscclRunInterpreter<type, Func##devredop<type>, ProtoSimple<MSCCL_CHUNKSTEPS/MSCCL_SLICESTEPS, MSCCL_SLICESTEPS, 0, 2>, fullOps>(comm, algo, work); \
 }

@@ -654,7 +654,11 @@ public:
    redOp(redOpArg),
    tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group), threadsPerBlock(blockDim.x),
    stepLines(ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/sizeof(ncclLLFifoLine)) {
+#ifdef ENABLE_WARP_SPEED
+    auto *channel = isMsccl(Metadata) ? &ncclShmem.channel : &ncclShmem.warpChannel[threadIdx.x / WARP_SIZE];
+#else
    auto *channel = &ncclShmem.channel;
+#endif
    barriers = &ncclShmem.groups[group].barrier;
    // If we are going to support oneshot collNet + LL, then we would need to add connector index here
    int nrecv=0, nsend=0;
@@ -579,7 +579,11 @@ public:
    tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE),                                /*compiler warnings*/
    stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)),
    warp(tid/WARP_SIZE), warpInBlock(threadIdx.x/WARP_SIZE), flagThread((tid%4)==3), group(group), threadsPerBlock(blockDim.x){
+#ifdef ENABLE_WARP_SPEED
+    auto *channel = isMsccl(Metadata) ? &ncclShmem.channel : &ncclShmem.warpChannel[warpInBlock];
+#else
    auto *channel = &ncclShmem.channel;
+#endif
    barriers = &ncclShmem.groups[group].barrier;
    int nrecv=0, nsend=0;
    while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) {
@@ -502,14 +502,22 @@ private:

 public:
  static inline __device__ void sendPeerNotify(int peer, int connIndex, int steps) {
+#ifdef ENABLE_WARP_SPEED
+    ncclDevChannelPeer* peerPtr = ncclShmem.warpChannel[threadIdx.x/WARP_SIZE].peers[peer];
+#else
    ncclDevChannelPeer* peerPtr = ncclShmem.channel.peers[peer];
+#endif
    peerPtr->send[connIndex].step += steps;
    st_relaxed_sys_global(peerPtr->send[connIndex].tail, peerPtr->send[connIndex].step);
  }

  static inline __device__ void recvPeerNotify(int peer, int connIndex, int steps) {
    int spins = 0;
+#ifdef ENABLE_WARP_SPEED
+    ncclDevChannelPeer* peerPtr = ncclShmem.warpChannel[threadIdx.x/WARP_SIZE].peers[peer];
+#else
    ncclDevChannelPeer* peerPtr = ncclShmem.channel.peers[peer];
+#endif
    peerPtr->recv[connIndex].step += steps;
    st_relaxed_sys_global(peerPtr->recv[connIndex].head, peerPtr->recv[connIndex].step);
    while (ld_volatile_global(peerPtr->recv[connIndex].tail) < peerPtr->recv[connIndex].step) {
@@ -770,13 +778,20 @@ public:
      struct ncclDevWorkP2p* p2pWork = nullptr, int stepSize_ = 0, int mode = primsModeDefault
    ):
    tid(tid), tidInBlock(threadIdx.x), nthreads(nthreads), /*compiler warnings*/
+#ifdef ENABLE_WARP_SPEED
+    stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_), group(ncclShmem.warpComm? tidInBlock / WARP_SIZE : group), threadsPerBlock(blockDim.x){
+#else
    stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_), group(group), threadsPerBlock(blockDim.x){
-
+#endif
    barriers = &ncclShmem.groups[group].barrier;
    // PAT uses the same barrier for each group
    barriers_pat = &ncclShmem.barrier_pat;
    this->nworkers = nthreads;
-
+#ifdef ENABLE_WARP_SPEED
+    auto *channel = isMsccl(Metadata) ? &ncclShmem.channel : &ncclShmem.warpChannel[tidInBlock/WARP_SIZE];
+#else
+    auto *channel = &ncclShmem.channel;
+#endif
    int peer = -1;
    flags = 0;
    index = -1;
@@ -831,9 +846,9 @@ public:
      }

      // coverity[overrun-call] => Coverity think prims.index can be greater than 1
-      if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg);
+      if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(channel->peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg);
      // coverity[overrun-call] => Coverity think prims.index can be greater than 1
-      if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg);
+      if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(channel->peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg);

      // if (barrierAny(flags & NetDeviceUnpack)) {
      //   flags |= AnyNetDeviceUnpack;
@@ -861,7 +876,7 @@ public:
        // Load recv peer
        int recvPeer = mode == primsModePatRs ? (rank - delta + nranks) % nranks : (rank + delta) % nranks;
        struct ncclPatPeer* peer = ((struct ncclPatPeer*)recvPeers)+tid;
-        struct ncclConnInfo* conn = peer->conn = ncclShmem.channel.peers[recvPeer]->recv+connIndexRecv;
+        struct ncclConnInfo* conn = peer->conn = channel->peers[recvPeer]->recv+connIndexRecv;
        peer->step = conn->step;
        peer->buff = conn->buffs[NCCL_PROTO_SIMPLE];
        peer->stepCache = loadStepValue(peer->tailPtr = conn->tail);
@@ -871,7 +886,7 @@ public:
        // Load send peer
        int sendPeer = mode == primsModePatAg ? (rank - delta + nranks) % nranks : (rank + delta) % nranks;
        peer = ((struct ncclPatPeer*)sendPeers)+tid;
-        conn = peer->conn = ncclShmem.channel.peers[sendPeer]->send+connIndexSend;
+        conn = peer->conn = channel->peers[sendPeer]->send+connIndexSend;
        peer->step = conn->step;
        peer->connFifo = conn->connFifo;
        peer->buff = conn->buffs[NCCL_PROTO_SIMPLE];
@@ -16,7 +16,12 @@ namespace {
 #else
  __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #endif
+#ifdef ENABLE_WARP_SPEED
+    int warp = threadIdx.x / WARP_SIZE;
+    ncclRing *ring = &ncclShmem.warpChannel[warp].ring;
+#else
    ncclRing *ring = &ncclShmem.channel.ring;
+#endif
    const int nranks = ncclShmem.comm.nRanks;
    const int rank = ncclShmem.comm.rank;
    const int prevRank = ring->userRanks[nranks-1];
@@ -24,7 +29,11 @@ namespace {
    size_t chunkCount;
    size_t channelCount;
    size_t gridOffset;
+#ifdef ENABLE_WARP_SPEED
+    ncclCollCbdPart(work, ncclShmem.warpChannelId[warp], Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
+#else
    ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
+#endif
    size_t offset;
    int nelem;

@@ -414,7 +414,7 @@ SPECIALIZE_REDUCE(FuncMinMax, half, 1, half, fn.isMinNotMax ? __hmin(x, y) : __h
  SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 1, __nv_bfloat16, fn.isMinNotMax ? __hmin(x, y) : __hmax(x, y))
  // coverity[copy_constructor_call]
  SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 2, __nv_bfloat162, fn.isMinNotMax ? __hmin2(x, y) : __hmax2(x, y))
-#elif ROCM_VERSION < 60000
+#else
  SPECIALIZE_REDUCE(FuncSum, hip_bfloat16, 1, hip_bfloat16, (hip_bfloat16)((float)(x) + (float)(y)))
  SPECIALIZE_REDUCE(FuncProd, hip_bfloat16, 1, hip_bfloat16, (hip_bfloat16)((float)(x) * (float)(y)))
  SPECIALIZE_REDUCE(FuncMinMax, hip_bfloat16, 1, hip_bfloat16, (hip_bfloat16)(fn.isMinNotMax ? fminf((float)(x), (float)(y)) : fmaxf((float)(x), (float)(y))))
@@ -16,14 +16,23 @@ namespace {
 #else
  __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #endif
+#ifdef ENABLE_WARP_SPEED
+    int warp = threadIdx.x / WARP_SIZE;
+    ncclRing *ring = &ncclShmem.warpChannel[warp].ring;
+#else
    ncclRing *ring = &ncclShmem.channel.ring;
+#endif
    int const *ringRanks = ring->userRanks;
    const int nranks = ncclShmem.comm.nRanks;
    size_t count;
    size_t gridOffset;
    size_t channelCount;
    size_t chunkCount;
+#ifdef ENABLE_WARP_SPEED
+    ncclCollCbdPart(work, ncclShmem.warpChannelId[warp], Proto::Id, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount);
+#else
    ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount);
+#endif
    size_t offset;
    size_t dataOffset;
    uint32_t nelem;
@@ -1,35 +1,36 @@
 // Modification Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT 

-#include "symmetric.h"
+#include "sym_kernels.h"
 #include "symmetric/kernel.h"
 #include "symmetric/primitives.h"

 template<int BytePerPack, int UnrollPacks, int UnrollPeers>
 static __device__ void bcastDeep(
-    ncclSymPrims& prim, int tn, int t, bool waitNeeded,
-    char* inputHere, char* outputRank0, bool inPlace, int nIters
+    ncclSymkArgsHandler const& handler, int tn, int t,
+    bool waitNeeded, ncclLsaBarrierSession<ncclCoopCta>& bar,
+    ncclSymPtr<char> input, ncclSymPtr<char> output, bool inPlace, int nIters
  ) {
  using Pack = BytePack<BytePerPack>;
  int wn = tn/WARP_SIZE;
  int w = t/WARP_SIZE;
  int lane = t%WARP_SIZE;
-  int const& rank = prim.rank;
-  int const& nRanks = prim.nRanks;
-  uint32_t const& stride4G = prim.stride4G;
-  Pack* inpHere = (Pack*)inputHere + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
-  Pack* outRank0 = (Pack*)outputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
+  int const& rank = handler.comm.rank;
+  int const& nRanks = handler.comm.nRanks;
+
+  Pack* inpPacks = (Pack*)input.localPtr() + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
+  ncclSymPtr<Pack> outPacks = (ncclSymPtr<Pack>)output + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
  Pack tmp[UnrollPacks];

  nIters -= w;
  if (0 < nIters) {
    #pragma unroll
    for (int u=0; u < UnrollPacks; u++) {
-      tmp[u] = inpHere[u*WARP_SIZE];
+      tmp[u] = inpPacks[u*WARP_SIZE];
    }
  }

-  if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed);

  if (0 < nIters) {
    while (true) {
@@ -47,21 +48,21 @@ static __device__ void bcastDeep(
            if (partial && dr == nRanks) break;
            #pragma unroll UnrollPacks
            for (int u=0; u < UnrollPacks; u++) {
-              add4G(outRank0, r*stride4G)[u*WARP_SIZE] = tmp[u];
+              outPacks.lsaPtr(r)[u*WARP_SIZE] = tmp[u];
            }
            if (++r == nRanks) r = 0;
          }
        }
      }
-      inpHere += intptr_t(wn)*UnrollPacks*WARP_SIZE;
-      outRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE;
+      inpPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE;
+      outPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE;
      nIters -= wn;
      if (nIters <= 0) break;

      // Load data for next iteration.
      #pragma unroll
      for (int u=0; u < UnrollPacks; u++) {
-        tmp[u] = inpHere[u*WARP_SIZE];
+        tmp[u] = inpPacks[u*WARP_SIZE];
      }
    }
  }
@@ -69,18 +70,17 @@ static __device__ void bcastDeep(

 template<int UnrollPeers, typename T>
 static __device__ void bcastEnds(
-    ncclSymPrims& prim, int tn, int t,
-    T* inputHere, T* outputRank0, bool inPlace, size_t nElts, uint32_t nPreElts, size_t nSufElts
+    ncclSymkArgsHandler const& handler, int tn, int t,
+    ncclSymPtr<T> input, ncclSymPtr<T> output, bool inPlace, size_t nElts, uint32_t nPreElts, size_t nSufElts
  ) {
-  int const& rank = prim.rank;
-  int const& nRanks = prim.nRanks;
-  uint32_t const& stride4G = prim.stride4G;
-  BytePack<sizeof(T)>* inpHere = (BytePack<sizeof(T)>*)inputHere;
-  BytePack<sizeof(T)>* outRank0 = (BytePack<sizeof(T)>*)outputRank0;
+  int const& rank = handler.comm.rank;
+  int const& nRanks = handler.comm.nRanks;
+  BytePack<sizeof(T)>* inpPacks = (BytePack<sizeof(T)>*)input.localPtr();
+  ncclSymPtr<BytePack<sizeof(T)>> outPacks = (ncclSymPtr<BytePack<sizeof(T)>>)output;
  #pragma unroll 1
  for (size_t i = t; i < nPreElts+nSufElts; i += tn) {
    size_t elt = i < nPreElts ? i : nElts-nPreElts-nSufElts+i;
-    BytePack<sizeof(T)> tmp = inpHere[elt];
+    BytePack<sizeof(T)> tmp = inpPacks[elt];
    int dr = inPlace ? 1 : 0;
    int r = rank + dr;
    if (r == nRanks) r = 0;
@@ -88,14 +88,14 @@ static __device__ void bcastEnds(
    for (; dr + UnrollPeers <= nRanks; dr += UnrollPeers) {
      #pragma unroll UnrollPeers
      for (int u=0; u < UnrollPeers; u++) {
-        *add4G(outRank0+elt, r*stride4G) = tmp;
+        outPacks.lsaPtr(r)[elt] = tmp;
        if (++r == nRanks) r = 0;
      }
    }
    #pragma unroll UnrollPeers
    for (int u=0; u < UnrollPeers; u++) {
      if (dr+u == nRanks) break;
-      *add4G(outRank0+elt, r*stride4G) = tmp;
+      outPacks.lsaPtr(r)[elt] = tmp;
      if (++r == nRanks) r = 0;
    }
  }
@@ -103,95 +103,95 @@ static __device__ void bcastEnds(

 template<typename T>
 static __device__ void bcast(
-    ncclSymPrims& prim, int tn, int t, bool waitNeeded, T* input, T* output, size_t nElts
+    ncclSymkArgsHandler const& handler, int tn, int t, int nBlocks,
+    bool waitNeeded, ncclLsaBarrierSession<ncclCoopCta>& bar,
+    ncclSymPtr<T> input, ncclSymPtr<T> output, size_t nElts
  ) {
  bool inPlace = (input == output);
-  // Mpve to rank=0
-  output = prim.peerPtr(0, output);
-
-  uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
-  uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
  size_t nBytes = nElts*sizeof(T);
+  uint32_t nBlocks_rcp32 = nccl::utility::idivRcp32_upto64(nBlocks);

-  uint32_t nPreBytes = (128u - inputUptr)%128u;
+  uint32_t nPreBytes = (16 - input.offset)%16;
  nPreBytes = min((size_t)nPreBytes, nBytes);
  uintptr_t cursor = nPreBytes;

  constexpr int MinWarpPerBlock = 4;

-  if ((inputUptr-outputUptr)%16 == 0) {
-    constexpr int BytePerPack = 16, UnrollPacks = 1, UnrollPeers = 1;
+  if ((input.offset - output.offset)%16 == 0) {
+    constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2;
    constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
    uint32_t chunks = (nBytes-cursor)/BytePerChunk;
-    chunks -= imodFast32(chunks, prim.nBlocks, prim.nBlocks_rcp32);
+    chunks -= imodFast32(chunks, nBlocks, nBlocks_rcp32);
    if (chunks != 0) {
      uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
      bcastDeep<BytePerPack, UnrollPacks, UnrollPeers>(
-        prim, tn, t, waitNeeded,
-        (char*)input + cursor, (char*)output + cursor, inPlace,
-        chunks*MinWarpPerBlock
+        handler, tn, t, waitNeeded, bar,
+        (ncclSymPtr<char>)input + cursor,
+        (ncclSymPtr<char>)output + cursor,
+        inPlace, chunks*MinWarpPerBlock
      );
      cursor = cursorAfter;
      waitNeeded = false;
    }
  }

-  if (sizeof(T) == 4 || (sizeof(T) < 4 && (inputUptr-outputUptr)%4 == 0)) {
-    constexpr int BytePerPack = 4, UnrollPacks = 1, UnrollPeers = 1;
-    constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
-    uint32_t chunks = (nBytes-cursor)/BytePerChunk;
-    chunks -= imodFast32(chunks, prim.nBlocks, prim.nBlocks_rcp32);
+  if (sizeof(T) == 4 || (sizeof(T) < 4 && (input.offset - output.offset)%4 == 0)) {
+    chunks -= imodFast32(chunks, nBlocks, nBlocks_rcp32);
    if (chunks != 0) {
-      uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
      bcastDeep<(sizeof(T) <= BytePerPack ? BytePerPack : 0), UnrollPacks, UnrollPeers>(
-        prim, tn, t, waitNeeded,
-        (char*)input + cursor, (char*)output + cursor, inPlace,
-        chunks*MinWarpPerBlock
+        handler, tn, t, waitNeeded, bar,
+        (ncclSymPtr<char>)input + cursor,
+        (ncclSymPtr<char>)output + cursor,
+        inPlace, chunks*MinWarpPerBlock
      );
      cursor = cursorAfter;
      waitNeeded = false;
    }
  }

-  if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed);

  constexpr int UnrollPeers = 8;
  size_t nSufElts = (nBytes-cursor)/sizeof(T);
-  bcastEnds<UnrollPeers>(prim, tn, t, input, output, inPlace, nElts, nPreBytes/sizeof(T), nSufElts);
+  bcastEnds<UnrollPeers>(handler, tn, t, input, output, inPlace, nElts, nPreBytes/sizeof(T), nSufElts);
 }

-__device__ __forceinline__ void ncclSymRun_AllGather_ST(ncclSymDevArgs const* args) {
-  ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier);
-  int const& rank = prim.rank;
+__device__ __forceinline__ void ncclSymkRun_AllGather_ST(ncclSymkDevWorkArgs const* args) {
+  ncclSymkArgsHandler handler{args};
+  ncclLsaBarrierSession<ncclCoopCta> bar{
+    ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x
+  };
+  int const& rank = handler.comm.rank;

-  // Threads numbered over rank.
-  int bt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
-                     prim.block, prim.nBlocks,
-                     threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
-  int btn = prim.nBlocks*blockDim.x;
+  bar.arrive(ncclCoopCta(), cuda::memory_order_relaxed);

-  prim.barrierArrive(ncclCoopCta(), /*release=*/false);
-  //prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  bool waitNeeded = true;
+  handler.forEachWork<char>(
+      [&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts,
+                    ncclSymPtr<char> input, ncclSymPtr<char> output) {
+        // Threads numbered over rank.
+        int bt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
+                           block, nBlocks,
+                           threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
+        int btn = nBlocks*blockDim.x;

-  bcast(prim, btn, bt, /*waitNeeded=*/true, (char*)args->input, (char*)args->output + rank*args->nElts, args->nElts);
+        bcast(handler, btn, bt, nBlocks, waitNeeded, bar, input, output + rank*nAllElts, nElts);

-  prim.barrierArrive(ncclCoopCta(), /*release=*/true);
-  prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+        waitNeeded = false;
+      }
+    );
+
+  bar.sync(ncclCoopCta(), cuda::memory_order_release);
 }

-
 template<typename T>
 static __device__ void bcastMultimem(
-    ncclSymPrims& prim, int tn, int t, T* input, T* output, size_t nElts
+    ncclSymkArgsHandler& handler, int tn, int t, ncclSymPtr<T> input, ncclSymPtr<T> output, size_t nElts
  ) {
-  // Move output to multimem
-  output = prim.multimemPtr(output);
-
-  uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
-  uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
  size_t nBytes = nElts*sizeof(T);
-
-  uint32_t nPreBytes = (16-inputUptr)%16;
+  uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input.localPtr());
+  uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output.multimemPtr(handler.comm.lsaMultimem));
+  uint32_t nPreBytes = (16 - input.offset)%16;
  nPreBytes = min((size_t)nPreBytes, nBytes);
  uintptr_t nSufBytes;

@@ -230,51 +230,52 @@ static __device__ void bcastMultimem(
    uintptr_t cursor = i < nPreBytes ? i : nBytes-nSufBytes+(i-nPreBytes);
    BytePack<sizeof(T)> val = *reinterpret_cast<BytePack<sizeof(T)>*>(inputUptr + cursor);
    multimem_st_global(outputUptr + cursor, val);
-    cursor += tn*sizeof(T);
  }
 }

-__device__ __forceinline__ void ncclSymRun_AllGather_STMC(ncclSymDevArgs const* args) {
-  ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem);
-  int const& rank = prim.rank;
+__device__ __forceinline__ void ncclSymkRun_AllGather_STMC(ncclSymkDevWorkArgs const* args) {
+  ncclSymkArgsHandler handler{args};
+  ncclLsaBarrierSession<ncclCoopCta> bar(
+    ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x, /*multimem=*/true
+  );
+  int const& rank = handler.comm.rank;

-  char* input = args->input;
-  char* output = args->output;
-  size_t bytes = args->nElts;
-  // Round robin memory to blocks.
-  int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
-                    prim.block, prim.nBlocks,
-                    threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
-  int tn = prim.nBlocks*blockDim.x;
+  bar.sync(ncclCoopCta(), cuda::memory_order_relaxed);

-  prim.barrierArrive(ncclCoopCta(), /*release=*/false);
-  prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  handler.forEachWork<char>(
+      [&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts,
+                    ncclSymPtr<char> input, ncclSymPtr<char> output) {
+        // Round robin memory to blocks.
+        int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
+                          block, nBlocks,
+                          threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
+        int tn = nBlocks*blockDim.x;

-  bcastMultimem(prim, tn, t, input, output + rank*bytes, bytes);
+        bcastMultimem(handler, tn, t, input, output + rank*nAllElts, nElts);
+      }
+    );

-  prim.barrierArrive(ncclCoopCta(), /*release=*/true);
-  prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  bar.sync(ncclCoopCta(), cuda::memory_order_release);
 }

 template<typename EltType>
 static __device__ void allgather_LL_body(
-    ncclSymPrims &prim, EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts
+    ncclSymkArgsHandler& handler, ncclLLA2ASession<ncclCoopCta>& lla2a,
+    EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts
  ) {
  using Pack = BytePack<8>;
  constexpr int EltPerPack = 8/sizeof(EltType);
-
-  ncclCoopCta cta;
-  int rank = prim.rank;
-  int nRanks = prim.nRanks;
-  constexpr int tn = ncclSymMaxThreads;
+  int const& rank = handler.comm.rank;
+  int const& nRanks = handler.comm.nRanks;
  int t = threadIdx.x;
+  constexpr int tn = ncclSymkMaxThreads;

  #pragma unroll 1
  while (0 < nElts) {
    int nIterPacks = min(nPacks, tn);
    if (t < nIterPacks) {
      Pack x = loadPack<Pack>(input, t*EltPerPack, nElts);
-      prim.bcastLL(/*slot=*/nIterPacks*rank + t, x);
+      lla2a.bcast(/*slot=*/nIterPacks*rank + t, x);
    }

    int tn_div_nPacks = tn/nIterPacks;
@@ -287,7 +288,7 @@ static __device__ void allgather_LL_body(
      #pragma unroll 1
      for (int i = t; i < (nRanks*nIterPacks & -(Unroll*tn)); i += Unroll*tn) {
        Pack got[Unroll];
-        prim.template recvLL<Unroll, Unroll>(i, Unroll, tn, /*&*/got);
+        lla2a.template recvUnrolled<Unroll, Unroll>(i, Unroll, tn, /*&*/got);
        #pragma unroll
        for (int u=0; u < Unroll; u++) {
          storePack<Pack>(output + peer*nStrideElts, pack*EltPerPack, nElts, got[u]);
@@ -302,7 +303,7 @@ static __device__ void allgather_LL_body(
      if (i + n*tn < nRanks*nIterPacks) n += 1;
      if (n != 0) {
        Pack got[Unroll];
-        prim.template recvLL<1, Unroll>(i, n, tn, /*&*/got);
+        lla2a.template recvUnrolled<1, Unroll>(i, n, tn, /*&*/got);
        #pragma unroll
        for (int u=0; u < Unroll; u++) {
          if (u != 0 && u == n) break;
@@ -316,7 +317,7 @@ static __device__ void allgather_LL_body(
      // The non-unrolled but "obviously correct" implementation for reference.
      #pragma unroll 1
      for (int i = t; i < nRanks*nIterPacks; i += tn) {
-        Pack got = prim.template recvLL<Pack>(i);
+        Pack got = lla2a.template recv<Pack>(i);
        storePack(output + peer*nStrideElts, pack*EltPerPack, nElts, got);
        peer += tn_div_nPacks;
        pack += tn_mod_nPacks;
@@ -324,7 +325,7 @@ static __device__ void allgather_LL_body(
      }
    #endif

-    prim.endLL(cta);
+    lla2a.endEpoch(ncclCoopCta());

    input += tn*EltPerPack;
    output += tn*EltPerPack;
@@ -333,38 +334,41 @@ static __device__ void allgather_LL_body(
  }
 }

-static __device__ void ncclSymRun_AllGather_LL_impl(ncclSymDevArgs const* args, bool multimem) {
-  ncclSymPrims prim(args->comm, ncclSymPrims_UseLL | multimem*ncclSymPrims_UseMultimem);
+static __device__ void ncclSymkRun_AllGather_LL_impl(ncclSymkDevWorkArgs const* args, bool multimem) {
+  ncclSymkArgsHandler handler{args};
+  ncclLLA2ASession<ncclCoopCta> lla2a(
+    ncclCoopCta(), handler.comm, ncclTeamLsa(handler.comm), handler.lsaLLA2A, blockIdx.x, /*maxElts=*/ncclSymkMaxThreads, multimem, handler.comm.lsaMultimem
+  );
+
  using Pack = BytePack<8>;
  constexpr int BytePerPack = 8;
-  int nElts = args->nElts;
-  int nPacks = divUp(nElts, BytePerPack);

-  uint32_t nPackPerBlock, nPackModBlock;
-  idivmodFast32(&nPackPerBlock, &nPackModBlock, nPacks, prim.nBlocks, prim.nBlocks_rcp32);
-  int blockPackBegin = prim.block*nPackPerBlock + minval<int>(prim.block, nPackModBlock);
-  int blockPackEnd = blockPackBegin + nPackPerBlock + (prim.block < nPackModBlock ? 1 : 0);
-  int nBlockPacks = blockPackEnd - blockPackBegin;
-  int nBlockElts = nElts - blockPackBegin*BytePerPack;
-  nBlockElts = min(nBlockElts, nBlockPacks*BytePerPack);
-  char* blockInput = args->input + blockPackBegin*BytePerPack;
-  char* blockOutput = args->output + blockPackBegin*BytePerPack;
+  handler.singleWork<char>(
+      [&]__device__(int nElts, int nAllElts,
+                    ncclSymPtr<char> input, ncclSymPtr<char> output) {
+        int nPacks = divUp(nElts, BytePerPack);

-  uint32_t lowBits = args->nElts;
-  lowBits |= (uint32_t)reinterpret_cast<uintptr_t>(args->input);
-  lowBits |= (uint32_t)reinterpret_cast<uintptr_t>(args->output);
-  if (__builtin_expect(lowBits%8 == 0, true)) {
-    // NOTE: Specializing for 8-byte alignment in one case help at size=65K: 8.9us vs 5.6us
-    allgather_LL_body(prim, (BytePack<8>*)blockInput, (BytePack<8>*)blockOutput, nBlockElts/8, nBlockPacks, nElts/8);
-  } else {
-    allgather_LL_body(prim, blockInput, blockOutput, nBlockElts, nBlockPacks, nElts);
-  }
+        char* blockInput = input.localPtr();
+        char* blockOutput = output.localPtr();
+
+        uint32_t lowBits = nElts;
+        lowBits |= (uintptr_t)blockInput;
+        lowBits |= (uintptr_t)blockOutput;
+        if (__builtin_expect(lowBits%8 == 0, true)) {
+          // NOTE: Specializing for 8-byte alignment in one case help at size=65K: 8.9us vs 5.6us
+          allgather_LL_body(handler, lla2a, (BytePack<8>*)blockInput, (BytePack<8>*)blockOutput,
+                            nElts/8, nPacks, nAllElts/8);
+        } else {
+          allgather_LL_body(handler, lla2a, blockInput, blockOutput, nElts, nPacks, nAllElts);
+        }
+      }
+    );
 }

-__device__ __forceinline__ void ncclSymRun_AllGather_LL(ncclSymDevArgs const* args) {
-  ncclSymRun_AllGather_LL_impl(args, /*multimem=*/false);
+__device__ __forceinline__ void ncclSymkRun_AllGather_LL(ncclSymkDevWorkArgs const* args) {
+  ncclSymkRun_AllGather_LL_impl(args, /*multimem=*/false);
 }

-__device__ __forceinline__ void ncclSymRun_AllGather_LLMC(ncclSymDevArgs const* args) {
-  ncclSymRun_AllGather_LL_impl(args, /*multimem=*/true);
+__device__ __forceinline__ void ncclSymkRun_AllGather_LLMC(ncclSymkDevWorkArgs const* args) {
+  ncclSymkRun_AllGather_LL_impl(args, /*multimem=*/true);
 }
@@ -1,38 +1,41 @@
 // Modification Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
-// SPDX-License-Identifier: MIT 
+// SPDX-License-Identifier: MIT

-#include "symmetric.h"
+#include "sym_kernels.h"
+#include "nccl_device.h"
 #include "symmetric/kernel.h"
 #include "symmetric/primitives.h"

 template<int BytePerPack, int UnrollPacks, int UnrollPeers, typename T, typename Red>
 static __device__ __forceinline__ void allreduceDeep(
-    ncclSymPrims& prim, int tn, int t, bool waitNeeded,
-    Red red, char* inputRank0, char* outputRank0, int32_t nIters
+    ncclSymkArgsHandler const& handler, int tn, int t,
+    bool waitNeeded, ncclLsaBarrierSession<ncclCoopCta>& bar,
+    Red red, ncclSymPtr<char> input, ncclSymPtr<char> output, int32_t nIters
  ) {
  using Pack = BytePack<BytePerPack>;
  using Acc = typename Red::EltType;
  using AccPack = BytePack<BytePerPack*sizeof(Acc)/sizeof(T)>;

+  ncclTeam world = ncclTeamWorld(handler.comm);
  int wn = tn/WARP_SIZE;
  int w = t/WARP_SIZE;
  int lane = t%WARP_SIZE;
-  int const& rank = prim.rank;
-  int const& nRanks = prim.nRanks;
-  uint32_t const& stride4G = prim.stride4G;
-  Pack* inpRank0 = (Pack*)inputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
-  Pack* outRank0 = (Pack*)outputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
+  int const& rank = handler.comm.rank;
+  int const& nRanks = handler.comm.nRanks;
+
+  ncclSymPtr<Pack> inpPacks = (ncclSymPtr<Pack>)input + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
+  ncclSymPtr<Pack> outPacks = (ncclSymPtr<Pack>)output + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
  Pack acc0[UnrollPacks];

  nIters -= w;
  if (0 < nIters) {
    #pragma unroll
    for (int u=0; u < UnrollPacks; u++) {
-      acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE];
+      acc0[u] = inpPacks.peerPtr(world, rank)[u*WARP_SIZE];
    }
  }

-  if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed);

  if (0 < nIters) {
    while (true) {
@@ -42,7 +45,7 @@ static __device__ __forceinline__ void allreduceDeep(
      { Pack tmp1[UnrollPacks];
        #pragma unroll
        for (int u=0; u < UnrollPacks; u++) {
-          tmp1[u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE];
+          tmp1[u] = inpPacks.peerPtr(world, r)[u*WARP_SIZE];
        }
        #pragma unroll
        for (int u=0; u < UnrollPacks; u++) {
@@ -67,7 +70,7 @@ static __device__ __forceinline__ void allreduceDeep(
            if (partial && ur!=0 && dr+ur == nRanks) break;
            #pragma unroll UnrollPacks
            for (int u=0; u < UnrollPacks; u++) {
-              tmp1[ur][u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE];
+              tmp1[ur][u] = inpPacks.peerPtr(world, r)[u*WARP_SIZE];
            }
            if (++r == nRanks) r = 0;
          }
@@ -98,22 +101,22 @@ static __device__ __forceinline__ void allreduceDeep(
            if (partial && dr == nRanks) break;
            #pragma unroll UnrollPacks
            for (int u=0; u < UnrollPacks; u++) {
-              add4G(outRank0, r*stride4G)[u*WARP_SIZE] = acc0[u];
+              outPacks.peerPtr(world, r)[u*WARP_SIZE] = acc0[u];
            }
            if (++r == nRanks) r = 0;
          }
        }
      }

-      inpRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE;
-      outRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE;
+      inpPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE;
+      outPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE;
      nIters -= wn;
      if (nIters <= 0) break;

      // Load data for next iteration.
      #pragma unroll
      for (int u=0; u < UnrollPacks; u++) {
-        acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE];
+        acc0[u] = inpPacks.peerPtr(world, rank)[u*WARP_SIZE];
      }
    }
  }
@@ -121,21 +124,23 @@ static __device__ __forceinline__ void allreduceDeep(

 template<int UnrollPeers, typename Red, typename T>
 static __device__ __forceinline__ void allreduceEnds(
-    ncclSymPrims& prim, int tn, int t, Red red,
-    T* inputRank0, T* outputRank0, size_t nElts, uint32_t nPreElts, size_t nSufElts
+    ncclSymkArgsHandler const& handler, int tn, int t, Red red,
+    ncclSymPtr<T> input, ncclSymPtr<T> output,
+    size_t nElts, uint32_t nPreElts, size_t nSufElts
  ) {
  using Acc = typename Red::EltType;

-  int const& rank = prim.rank;
-  int const& nRanks = prim.nRanks;
-  uint32_t const& stride4G = prim.stride4G;
-  BytePack<sizeof(T)>* inpRank0 = (BytePack<sizeof(T)>*)inputRank0;
-  BytePack<sizeof(T)>* outRank0 = (BytePack<sizeof(T)>*)outputRank0;
+  ncclTeam world = ncclTeamWorld(handler.comm);
+  int const& rank = handler.comm.rank;
+  int const& nRanks = handler.comm.nRanks;
+
+  ncclSymPtr<BytePack<sizeof(T)>> inpPacks = (ncclSymPtr<BytePack<sizeof(T)>>)input;
+  ncclSymPtr<BytePack<sizeof(T)>> outPacks = (ncclSymPtr<BytePack<sizeof(T)>>)output;

  #pragma unroll 1
  for (size_t i = t; i < nPreElts+nSufElts; i += tn) {
    size_t elt = i < nPreElts ? i : nElts-nSufElts-nPreElts+i;
-    BytePack<sizeof(T)> acc0 = *add4G(inpRank0+elt, rank*stride4G);
+    BytePack<sizeof(T)> acc0 = inpPacks.peerPtr(world, rank)[elt];
    BytePack<sizeof(Acc)> acc1;
    BytePack<sizeof(T)> tmp[UnrollPeers];
    int dr = 1;
@@ -154,7 +159,7 @@ static __device__ __forceinline__ void allreduceEnds(
        #pragma unroll
        for (int u=0; u < UnrollPeers-partial; u++) {
          if (partial && u!=0 && dr+u == nRanks) break;
-          tmp[u] = *add4G(inpRank0+elt, r*stride4G);
+          tmp[u] = inpPacks.peerPtr(world, r)[elt];
          r += 1;
          if (r == nRanks) r = 0;
        }
@@ -182,7 +187,7 @@ static __device__ __forceinline__ void allreduceEnds(
        #pragma unroll
        for (int u=0; u < UnrollPeers-partial; u++) {
          if (partial && dr+u == nRanks) break;
-          *add4G(outRank0+elt, r*stride4G) = acc0;
+          outPacks.peerPtr(world, r)[elt] = acc0;
          r += 1;
          if (r == nRanks) r = 0;
        }
@@ -193,35 +198,33 @@ static __device__ __forceinline__ void allreduceEnds(

 template<typename Red, typename T>
 static __device__ void allreduce(
-    ncclSymPrims& prim, int tn, int t, bool waitNeeded,
-    Red red, T* input, T* output, size_t nElts
+    ncclSymkArgsHandler const& handler, int tn, int t, int nBlocks,
+    bool waitNeeded, ncclLsaBarrierSession<ncclCoopCta>& bar,
+    Red red, ncclSymPtr<T> input, ncclSymPtr<T> output, size_t nElts
  ) {
-  int nRanks = prim.nRanks;
-  int nBlocks = prim.nBlocks;
-  // Mpve to rank=0
-  input = prim.peerPtr(0, input);
-  output = prim.peerPtr(0, output);
-
-  uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
-  uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
+  int const& nRanks = handler.comm.nRanks;
+  int const& nRanks_rcp32 = handler.nRanks_rcp32;
  size_t nBytes = nElts*sizeof(T);
+  uint32_t nBlocks_rcp32 = nccl::utility::idivRcp32_upto64(nBlocks);
+  uint32_t nRanks_nBlocks_rcp32 = nccl::utility::imulRcp32(nRanks, nRanks_rcp32, nBlocks, nBlocks_rcp32);

-  uint32_t nPreBytes = (16u - inputUptr)%16u;
+  uint32_t nPreBytes = (16u - input.offset)%16u;
  nPreBytes = min((size_t)nPreBytes, nBytes);
  uintptr_t cursor = nPreBytes;

  constexpr int MinWarpPerBlock = 4;

-  if ((inputUptr-outputUptr)%16 == 0) {
+  if ((input.offset - output.offset)%16 == 0) {
    constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2;
    constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
    uint32_t chunks = (nBytes-cursor)/BytePerChunk;
-    chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32);
+    chunks -= imodFast32(chunks, nRanks*nBlocks, nRanks_nBlocks_rcp32);
    if (chunks != 0) {
      uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
      allreduceDeep<BytePerPack, UnrollPacks, UnrollPeers, T>(
-        prim, tn, t, waitNeeded, red,
-        (char*)input + cursor, (char*)output + cursor,
+        handler, tn, t, waitNeeded, bar, red,
+        (ncclSymPtr<char>)input + cursor,
+        (ncclSymPtr<char>)output + cursor,
        chunks*MinWarpPerBlock
      );
      cursor = cursorAfter;
@@ -229,16 +232,17 @@ static __device__ void allreduce(
    }
  }

-  if (sizeof(T) == 4 || (sizeof(T) < 4 && (inputUptr-outputUptr)%4 == 0)) {
+  if (sizeof(T) == 4 || (sizeof(T) < 4 && (input.offset - output.offset)%4 == 0)) {
    constexpr int BytePerPack = 4, UnrollPacks = 4, UnrollPeers = 4;
    constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
    uint32_t chunks = (nBytes-cursor)/BytePerChunk;
-    chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32);
+    chunks -= imodFast32(chunks, nRanks*nBlocks, nRanks_nBlocks_rcp32);
    if (chunks != 0) {
      uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
      allreduceDeep<(sizeof(T) <= BytePerPack ? BytePerPack : 0), UnrollPacks, UnrollPeers, T>(
-        prim, tn, t, waitNeeded, red,
-        (char*)input + cursor, (char*)output + cursor,
+        handler, tn, t, waitNeeded, bar, red,
+        (ncclSymPtr<char>)input + cursor,
+        (ncclSymPtr<char>)output + cursor,
        chunks*MinWarpPerBlock
      );
      cursor = cursorAfter;
@@ -246,46 +250,51 @@ static __device__ void allreduce(
    }
  }

-  if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed);

  constexpr int UnrollPeers = 8;
  size_t nSufElts = (nBytes-cursor)/sizeof(T);
-  allreduceEnds<UnrollPeers>(prim, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts);
+  allreduceEnds<UnrollPeers>(handler, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts);
 }

-
 template<template<typename> typename Red, typename T>
-__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLD_AGxST(ncclSymDevArgs const* args) {
-  ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier);
-  int /*const&*/ rank = prim.rank;
-  int /*const&*/ nRanks = prim.nRanks;
-  Red<typename ncclSymAccumType<Red, T, /*nvls=*/false>::Type> red(args->redOpArg);
+__device__ __forceinline__ void ncclSymkRun_AllReduce_RSxLD_AGxST(ncclSymkDevWorkArgs const* args) {
+  ncclSymkArgsHandler handler{args};
+  ncclLsaBarrierSession<ncclCoopCta> bar{
+    ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x
+  };

-  // Threads numbered globally such that we round robin warps by rank then block.
-  int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
-                     rank, nRanks,
-                     prim.block, prim.nBlocks,
-                     threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
-  int gtn = nRanks*prim.nBlocks*blockDim.x;
+  Red<typename ncclSymkAccumType<Red, T, /*nvls=*/false>::Type> red(handler.devWork->redOpArg);

-  prim.barrierArrive(ncclCoopCta(), /*release=*/false);
-  //prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  int const& rank = handler.comm.rank;
+  int const& nRanks = handler.comm.nRanks;

-  allreduce(prim, gtn, gt, /*waitNeeded=*/true, red, (T*)args->input, (T*)args->output, args->nElts);
+  bar.arrive(ncclCoopCta(), cuda::memory_order_relaxed);

-  prim.barrierArrive(ncclCoopCta(), /*release=*/true);
-  prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  bool waitNeeded = true;
+  handler.forEachWork<T>(
+      [&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts,
+                    ncclSymPtr<T> input, ncclSymPtr<T> output) {
+        // Threads numbered globally such that we round robin warps by rank then block.
+        int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
+                           rank, nRanks,
+                           block, nBlocks,
+                           threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
+        int gtn = nRanks*nBlocks*blockDim.x;
+
+        allreduce(handler, gtn, gt, nBlocks, waitNeeded, bar, red, input, output, nElts);
+
+        waitNeeded = false;
+      }
+    );
+
+  bar.sync(ncclCoopCta(), cuda::memory_order_release);
 }

-
 template<typename Red, typename T>
 static __device__ void allreduceMultimem(
-    ncclSymPrims& prim, int tn, int t, Red red, T* input, T* output, size_t nElts
+    int tn, int t, Red red, T* input, T* output, size_t nElts
  ) {
-  // Mpve to multimem
-  input = prim.multimemPtr(input);
-  output = prim.multimemPtr(output);
-
  uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
  uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
  size_t nBytes = nElts*sizeof(T);
@@ -330,106 +339,132 @@ static __device__ void allreduceMultimem(
    uintptr_t cursor = i < nPreBytes ? i : nBytes-nSufBytes+(i-nPreBytes);
    BytePack<sizeof(T)> val = applyLoadMultimem<Red, sizeof(T)>(red, inputUptr + cursor);
    multimem_st_global(outputUptr + cursor, val);
-    cursor += tn*sizeof(T);
  }
 }

 template<template<typename> typename Red, typename T>
-__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLDMC_AGxSTMC(ncclSymDevArgs const* args) {
-  ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem);
-  Red<typename ncclSymAccumType<Red, T, /*nvls=*/true>::Type> red(args->redOpArg);
+__device__ __forceinline__ void ncclSymkRun_AllReduce_RSxLDMC_AGxSTMC(ncclSymkDevWorkArgs const* args) {
+  ncclSymkArgsHandler handler{args};
+  ncclLsaBarrierSession<ncclCoopCta> bar{
+    ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x, /*multimem=*/true
+  };

-  // Threads numbered globally such that we round robin warps by rank then block.
-  int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
-                     prim.rank, prim.nRanks,
-                     prim.block, prim.nBlocks,
-                     threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
-  int gtn = prim.nRanks*prim.nBlocks*blockDim.x;
+  Red<typename ncclSymkAccumType<Red, T, /*nvls=*/true>::Type> red(handler.devWork->redOpArg);

-  prim.barrierArrive(ncclCoopCta(), /*release=*/false);
-  prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  int const& rank = handler.comm.rank;
+  int const& nRanks = handler.comm.nRanks;
+  auto const& multimem = handler.comm.lsaMultimem;

-  allreduceMultimem(prim, gtn, gt, red, (T*)args->input, (T*)args->output, args->nElts);
+  bar.sync(ncclCoopCta(), cuda::memory_order_relaxed);

-  prim.barrierArrive(ncclCoopCta(), /*release=*/true);
-  prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
+  handler.forEachWork<T>(
+      [&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts,
+                    ncclSymPtr<T> input, ncclSymPtr<T> output) {
+        // Threads numbered globally such that we round robin warps by rank then block.
+        int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
+                           rank, nRanks,
+                           block, nBlocks,
+                           threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
+        int gtn = nRanks*nBlocks*blockDim.x;
+
+        allreduceMultimem(gtn, gt, red, input.multimemPtr(multimem), output.multimemPtr(multimem), nElts);
+      }
+    );
+
+  bar.sync(ncclCoopCta(), cuda::memory_order_release);
 }

 template<template<typename> typename Red, typename T>
-__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R_impl(ncclSymDevArgs const* args, bool multimem) {
-  ncclSymPrims prim(args->comm, ncclSymPrims_UseLL | multimem*ncclSymPrims_UseMultimem);
-  int /*const&*/ rank = prim.rank;
-  using Acc = typename ncclSymAccumType<Red, T, /*nvls=*/false>::Type;
-  Red<Acc> red(args->redOpArg);
+__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLL_R_impl(ncclSymkDevWorkArgs const* args, bool multimem) {
+  ncclSymkArgsHandler handler{args};
+  ncclLLA2ASession<ncclCoopCta> lla2a(
+    ncclCoopCta(), handler.comm, ncclTeamLsa(handler.comm), handler.lsaLLA2A,
+    blockIdx.x, ncclSymkMaxThreads, multimem, handler.comm.lsaMultimem
+  );
+
+  int const& rank = handler.comm.rank;
+  int const& nRanks = handler.comm.nRanks;
+  using Acc = typename ncclSymkAccumType<Red, T, /*nvls=*/false>::Type;
+  Red<Acc> red(handler.devWork->redOpArg);

  using Pack = BytePack<8>;
  using AccPack = BytePack<8*sizeof(Acc)/sizeof(T)>;
  constexpr int EltPerPack = 8/sizeof(T);
-  int nElts = args->nElts;
-  int nPacks = divUp(nElts, EltPerPack);

-  bool packAligned = 8 <= alignof(T) || (
-      args->nElts*sizeof(T) |
-      (uint32_t)reinterpret_cast<uintptr_t>(args->input) |
-      (uint32_t)reinterpret_cast<uintptr_t>(args->output)
-    )%8 == 0;
+  handler.singleWork<T>(
+      [&]__device__(int nElts, int nAllElts,
+                    ncclSymPtr<T> inputPtr, ncclSymPtr<T> outputPtr) {
+        int nPacks = divUp(nElts, EltPerPack);

-  uint32_t nPackPerBlock, nPackModBlock;
-  idivmodFast32(&nPackPerBlock, &nPackModBlock, nPacks, prim.nBlocks, prim.nBlocks_rcp32);
-  int begin = prim.block*nPackPerBlock + minval<int>(prim.block, nPackModBlock);
-  int end = begin + nPackPerBlock + (prim.block < nPackModBlock ? 1 : 0);
+        T* input = (T*)inputPtr.localPtr();
+        T* output = (T*)outputPtr.localPtr();

-  nPacks = end - begin;
-  nElts -= begin*EltPerPack;
-  nElts = min(nElts, nPacks*EltPerPack);
-  T* input = (T*)args->input + begin*EltPerPack;
-  T* output = (T*)args->output + begin*EltPerPack;
+        bool packAligned = 8 <= alignof(T) || (nElts*sizeof(T) | (uintptr_t)input | (uintptr_t)output)%8 == 0;

-  ncclCoopCta cta;
-  int t = threadIdx.x;
-  int tn = ncclSymMaxThreads;
+        ncclCoopCta cta;
+        int t = threadIdx.x;
+        int tn = ncclSymkMaxThreads;

-  if (__builtin_expect(packAligned, true)) {
-    #pragma unroll 1
-    while (0 < nPacks) {
-      if (t < nPacks) {
-        int nIterPacks = min(nPacks, tn);
-        Pack inp = loadPack<Pack>((Pack*)input, t, nPacks);
-        prim.bcastLL(/*slot=*/nIterPacks*rank + t, inp);
-        Pack out = prim.template recvReduceLL<Pack, T>(t, nIterPacks, red);
-        storePack((Pack*)output, t, nPacks, out);
+        if (__builtin_expect(packAligned, true)) {
+          #pragma unroll 1
+          while (0 < nPacks) {
+            if (t < nPacks) {
+              int nIterPacks = min(nPacks, tn);
+              Pack inp = loadPack<Pack>((Pack*)input, t, nPacks);
+              lla2a.bcast(/*slot=*/nIterPacks*rank + t, inp);
+              AccPack out = lla2a.template recvReduce</*Unroll=*/8, Pack>(
+                /*slotStart=*/t, /*slotCount=*/nRanks, /*slotStride=*/nIterPacks,
+                /*eltToAcc=*/[&] __device__ (Pack x)->AccPack {
+                  return applyCast<T, Acc>(x);
+                },
+                /*reduce=*/[&] __device__ (AccPack a, AccPack b)->AccPack {
+                  return applyReduce(red, a, b);
+                }
+              );
+              storePack((Pack*)output, t, nPacks, applyCast<Acc, T>(out));
+            }
+            lla2a.endEpoch(cta);
+
+            input += tn*EltPerPack;
+            output += tn*EltPerPack;
+            nPacks -= tn;
+          }
+        } else {
+          #pragma unroll 1
+          while (0 < nElts) {
+            if (t*EltPerPack < nElts) {
+              int nIterPacks = min(nPacks, tn);
+              Pack inp = loadPack<Pack>(input, t*EltPerPack, nElts);
+              lla2a.bcast(/*slot=*/nIterPacks*rank + t, inp);
+              AccPack out = lla2a.template recvReduce</*Unroll=*/8, Pack>(
+                /*slotStart=*/t, /*slotCount=*/nRanks, /*slotStride=*/nIterPacks,
+                /*eltToAcc=*/[&] __device__ (Pack x)->AccPack {
+                  return applyCast<T, Acc>(x);
+                },
+                /*reduce=*/[&] __device__ (AccPack a, AccPack b)->AccPack {
+                  return applyReduce(red, a, b);
+                }
+              );
+              storePack(output, t*EltPerPack, nElts, applyCast<Acc, T>(out));
+            }
+            lla2a.endEpoch(cta);
+
+            input += tn*EltPerPack;
+            output += tn*EltPerPack;
+            nElts -= tn*EltPerPack;
+            nPacks -= tn;
+          }
+        }
      }
-      prim.endLL(cta);
-
-      input += tn*EltPerPack;
-      output += tn*EltPerPack;
-      nPacks -= tn;
-    }
-  } else {
-    #pragma unroll 1
-    while (0 < nElts) {
-      if (t*EltPerPack < nElts) {
-        int nIterPacks = min(nPacks, tn);
-        Pack inp = loadPack<Pack>(input, t*EltPerPack, nElts);
-        prim.bcastLL(/*slot=*/nIterPacks*rank + t, inp);
-        Pack out = prim.template recvReduceLL<Pack, T>(t, nIterPacks, red);
-        storePack(output, t*EltPerPack, nElts, out);
-      }
-      prim.endLL(cta);
-
-      input += tn*EltPerPack;
-      output += tn*EltPerPack;
-      nElts -= tn*EltPerPack;
-      nPacks -= tn;
-    }
-  }
+    );
 }

 template<template<typename> typename Red, typename T>
-__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R(ncclSymDevArgs const* args) {
-  ncclSymRun_AllReduce_AGxLL_R_impl<Red, T>(args, /*multimem=*/false);
+__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLL_R(ncclSymkDevWorkArgs const* args) {
+  ncclSymkRun_AllReduce_AGxLL_R_impl<Red, T>(args, /*multimem=*/false);
 }
+
 template<template<typename> typename Red, typename T>
-__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLLMC_R(ncclSymDevArgs const* args) {
-  ncclSymRun_AllReduce_AGxLL_R_impl<Red, T>(args, /*multimem=*/true);
+__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLLMC_R(ncclSymkDevWorkArgs const* args) {
+  ncclSymkRun_AllReduce_AGxLL_R_impl<Red, T>(args, /*multimem=*/true);
 }
--- a/Show More
+++ b/Show More