Merge remote-tracking branch 'nccl-tests/master' into develop

[ROCm/rccl-tests commit: 5625599dda]
2025-04-11 12:00:15 -05:00
@@ -1,4 +1,7 @@
 # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENCE.txt for license information
-/build
+build/
+*.gcov
+/coverage/
+__pycache__/
@@ -1,73 +1,190 @@
-# ########################################################################
-# Copyright 2022 Advanced Micro Devices, Inc.
-# ########################################################################
-#Adding pthread flag for linking
-set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
+# Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.

-cmake_minimum_required(VERSION 3.16.3 FATAL_ERROR)
+# CMake version minimum requirements
+#==================================================================================================
+cmake_minimum_required(VERSION 3.16 FATAL_ERROR)

-project(RCCL-tests VERSION 2.12.10 LANGUAGES CXX)
-
-# Get ROCm path from environment if available
-if (DEFINED ENV{ROCM_PATH})
-    set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to ROCm installation")
-else()
-    set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to ROCm installation")
+# CMake Toolchain file to define compilers and path to ROCm
+#==================================================================================================
+if (NOT CMAKE_TOOLCHAIN_FILE)
+  set(CMAKE_TOOLCHAIN_FILE "${CMAKE_CURRENT_SOURCE_DIR}/toolchain-linux.cmake")
+  message(STATUS "CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}")
 endif()

-# Set CMake/CPack variables
-list( APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/llvm)
-set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "Prefix install path")
-set(CPACK_PACKAGING_INSTALL_PREFIX "${ROCM_PATH}" CACHE PATH "Path to install to when packaged.")
-set(CMAKE_CXX_STANDARD 14)
+# RCCL Tests project
+#==================================================================================================
+project(rccl-tests LANGUAGES CXX)

-# Get additional packages required
-find_package(ROCM 0.7.3 CONFIG REQUIRED PATHS "${ROCM_PATH}")
-include(ROCMSetupVersion)
-include(ROCMCreatePackage)
-include(ROCMInstallTargets)
-include(ROCMCheckTargetIds)
-include(ROCMClients)
+# Build options
+#==================================================================================================
+option(USE_MPI                     "Build RCCL-tests with MPI support."           OFF)
+option(BUILD_LOCAL_GPU_TARGET_ONLY "Build only for GPUs detected on this machine" OFF)

-# Build variables
-option(USE_MPI "Build RCCL-tests with MPI support.")
+if (NOT CMAKE_BUILD_TYPE)
+  message(WARNING "CMAKE_BUILD_TYPE is not defined. Setting to Release")
+  set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Default build type")
+endif()

 # Default GPU architectures to build
 #==================================================================================================
 set(DEFAULT_GPUS
-      gfx803
-      gfx900:xnack-
-      gfx906:xnack-
-      gfx908:xnack-
-      gfx90a:xnack-
-      gfx90a:xnack+
-      gfx940
-      gfx941
+      gfx906
+      gfx908
+      gfx90a
      gfx942
+      gfx950
      gfx1030
      gfx1100
      gfx1101
-      gfx1102)
+      gfx1102
+      gfx1200
+      gfx1201)

-set(AMDGPU_TARGETS ${DEFAULT_GPUS} CACHE STRING "Target default GPUs if AMDGPU_TARGETS is not defined.")
-## Determine which GPU architectures to build for
-if (COMMAND rocm_check_target_ids)
-    message(STATUS "Checking for ROCm support for GPU targets:")
-    rocm_check_target_ids(SUPPORTED_GPUS TARGETS "${AMDGPU_TARGETS}")
-else()
-    message(WARNING "Unable to check for supported GPU targets. Falling back to default GPUs")
-    set(SUPPORTED_GPUS ${DEFAULT_GPUS})
+# Get additional packages required
+include(CheckIncludeFiles)
+include(CheckSymbolExists)
+include(cmake/Dependencies.cmake) # rocm-cmake, rocm_local_targets
+include(cmake/CheckSymbolExistsNoWarn.cmake)
+
+# Build only for local GPU architecture
+if (BUILD_LOCAL_GPU_TARGET_ONLY)
+  message(STATUS "Building only for local GPU target")
+  if (COMMAND rocm_local_targets)
+    rocm_local_targets(DEFAULT_GPUS)
+  else()
+    message(WARNING "Unable to determine local GPU targets. Falling back to default GPUs.")
+  endif()
 endif()
-set(GPU_TARGETS "${SUPPORTED_GPUS}" CACHE STRING "List of specific GPU architectures to build for.")
+
+# Determine which GPU architectures to build for
+set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if GPU_TARGETS is not defined.")
+
+# Check if clang compiler can offload to GPU_TARGETS
+if (COMMAND rocm_check_target_ids)
+  message(STATUS "Checking for ROCm support for GPU targets: " "${GPU_TARGETS}")
+  rocm_check_target_ids(SUPPORTED_GPUS TARGETS ${GPU_TARGETS})
+else()
+  message(WARNING "Unable to check for supported GPU targets. Falling back to default GPUs.")
+  set(SUPPORTED_GPUS ${DEFAULT_GPUS})
+endif()
+
+set(GPU_TARGETS "${SUPPORTED_GPUS}")
 message(STATUS "Compiling for ${GPU_TARGETS}")

-find_package(RCCL HINTS CONFIG REQUIRED PATHS "${ROCM_PATH}")
+## NOTE: Reload rocm-cmake in order to update GPU_TARGETS
+include(cmake/Dependencies.cmake) # Reloading to use desired GPU_TARGETS instead of defaults
+
+# Try to establish ROCM_PATH (for find_package)
+#==================================================================================================
+if(NOT DEFINED ROCM_PATH)
+  # Guess default location
+  set(ROCM_PATH "/opt/rocm")
+  message(WARNING "Unable to find ROCM_PATH: Falling back to ${ROCM_PATH}")
+else()
+  message(STATUS "ROCM_PATH found: ${ROCM_PATH}")
+endif()
+set(ENV{ROCM_PATH} ${ROCM_PATH})
+
+if("${CMAKE_CXX_COMPILER}" MATCHES ".*amdclang\\+\\+")
+  message(STATUS "Compiling with amdclang++")
+  set(COMPILER_EXE_NAME amdclang++)
+  set(COMPILER_GREP_STRING "AMD clang version")
+  set(COMPILER_AWK_CMD "awk -F\" \" '{ printf $4}'")
+elseif("${CMAKE_CXX_COMPILER}" MATCHES ".*clang\\+\\+")
+  message(STATUS "Compiling with clang++")
+  set(COMPILER_EXE_NAME clang++)
+  set(COMPILER_GREP_STRING "AMD clang version")
+  set(COMPILER_AWK_CMD "awk -F\" \" '{ printf $4}'")
+elseif("${CMAKE_CXX_COMPILER}" MATCHES ".*hipcc$")
+  message(STATUS "Compiling with hipcc")
+  set(COMPILER_EXE_NAME hipcc)
+  set(COMPILER_GREP_STRING "HIP version")
+  set(COMPILER_AWK_CMD "awk -F\" \" '{ printf $3}' | awk -F\"-\" '{ printf $1}'")
+else()
+  message(FATAL_ERROR "RCCL-Tests can be built only with hipcc or amdclang++")
+endif()
+
+# Set CMAKE flags
+#==================================================================================================
+set(CMAKE_INSTALL_PREFIX "${ROCM_PATH}" CACHE PATH "")
+set(CMAKE_CXX_STANDARD   14)   # We use C++14 features, this will add compile option: -std=c++14
+set(CMAKE_CXX_EXTENSIONS OFF)  # Without this line, it will add -std=gnu++14 instead, which has some issues.
+set(CPACK_PACKAGING_INSTALL_PREFIX "${ROCM_PATH}" CACHE PATH "Path to install to when packaged.")
+if(ROCM_PATH)
+  #list(APPEND CMAKE_PREFIX_PATH  # Temporary workaround 
+  list(PREPEND CMAKE_PREFIX_PATH  # Add ROCM_PATH to CMake search paths (for finding HIP / HSA
+              ${ROCM_PATH}
+              ${ROCM_PATH}/hip
+              ${ROCM_PATH}/llvm)
+endif()
+
+# Check for required dependencies
+#==================================================================================================
+## Check for Threads
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+find_package(Threads REQUIRED)
+
+##Adding pthread flag for linking
+#set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
+
+## Check for HIP
+find_package(hip REQUIRED)
+message(STATUS "HIP compiler:     ${HIP_COMPILER}")
+message(STATUS "HIP runtime:      ${HIP_RUNTIME}")
+if(NOT "${HIP_COMPILER}" MATCHES "clang")
+  message(FATAL_ERROR "RCCL requires clang-based compiler (amdclang++ or hipcc)")
+endif()
+
+## Check for compiler version
+find_program(compiler_executable ${COMPILER_EXE_NAME})
+message(STATUS "${COMPILER_EXE_NAME} executable: ${compiler_executable}")
+execute_process(
+  COMMAND         bash "-c" "${compiler_executable} --version | grep \"${COMPILER_GREP_STRING}\" | ${COMPILER_AWK_CMD}"
+  OUTPUT_VARIABLE compiler_version_string)
+message(STATUS "${COMPILER_EXE_NAME} version:    ${compiler_version_string}")
+
+## Check for HIP version
+find_program(hipconfig_executable hipconfig)
+message(STATUS "hipconfig executable: ${hipconfig_executable}")
+execute_process(
+  COMMAND         bash "-c" "${hipconfig_executable} -v | awk -F\"-\" '{ printf $1 }'"
+  OUTPUT_VARIABLE hip_version_string)
+message(STATUS "${COMPILER_EXE_NAME} HIP version:    ${hip_version_string}")
+
+##Check for ROCm version
+set(EXPLICIT_ROCM_VERSION "" CACHE STRING "Explicit ROCM version to compile to (auto detect if empty)")
+if(EXPLICIT_ROCM_VERSION)
+  set(rocm_version_string "${EXPLICIT_ROCM_VERSION}")
+elseif(ROCM_PATH)
+  message(STATUS "Reading ROCM version from ${ROCM_PATH}/.info/version")
+  file(READ "${ROCM_PATH}/.info/version" rocm_version_string)
+else()
+  message(FATAL_ERROR "Could not determine ROCM version (set EXPLICIT_ROCM_VERSION or set ROCM_PATH to a valid installation)")
+endif()
+string(REGEX MATCH "([0-9]+)\\.([0-9]+)\\.([0-9]+)" rocm_version_matches ${rocm_version_string})
+if (rocm_version_matches)
+  set(ROCM_MAJOR_VERSION ${CMAKE_MATCH_1})
+  set(ROCM_MINOR_VERSION ${CMAKE_MATCH_2})
+  set(ROCM_PATCH_VERSION ${CMAKE_MATCH_3})
+
+  message(STATUS "ROCm version: ${ROCM_MAJOR_VERSION}.${ROCM_MINOR_VERSION}.${ROCM_PATCH_VERSION}")
+
+  # Convert the version components to int for comparison
+  math(EXPR ROCM_VERSION "(10000 * ${ROCM_MAJOR_VERSION}) + (100 * ${ROCM_MINOR_VERSION}) + ${ROCM_PATCH_VERSION}")
+  add_definitions("-DROCM_VERSION=${ROCM_VERSION}")
+else()
+  message(WARNING "Failed to extract ROCm version.")
+endif()
+
+## Check for RCCL
+find_package(RCCL CONFIG REQUIRED HINTS "${CMAKE_PREFIX_PATH}" PATHS "${ROCM_PATH}")
 if (RCCL_FOUND)
    message(STATUS "RCCL version : ${RCCL_VERSION}")
    message(STATUS "RCCL include path : ${RCCL_INCLUDE_DIRS}")
    message(STATUS "RCCL libraries : ${RCCL_LIBRARIES}")
 endif()

+## Check for MPI (if enabled)
 if (USE_MPI)
    find_package(MPI REQUIRED)
    if (MPI_FOUND)
@@ -11,7 +11,7 @@ override BUILDDIR := $(abspath $(BUILDDIR))

 default: src.build

-TARGETS=$(filter-out src/hypercube.cu, $(wildcard src/*))
+TARGETS=src

 all:   ${TARGETS:%=%.build}
 clean: ${TARGETS:%=%.clean}
@@ -1,72 +1,108 @@
 # RCCL Tests

-These tests check both the performance and the correctness of RCCL operations. They can be compiled against [RCCL](https://github.com/ROCmSoftwarePlatform/rccl).
+These tests check both the performance and the correctness of RCCL operations. They can be compiled against [RCCL](https://github.com/ROCm/rccl).

 ## Build

 To build the tests, just type `make`.

-If HIP is not installed in /opt/rocm, you may specify HIP\_HOME. Similarly, if RCCL is not installed in /usr, you may specify NCCL\_HOME and CUSTOM\_RCCL\_LIB.
+If HIP is not installed in `/opt/rocm`, you may specify `HIP_HOME`. Similarly, if RCCL (`librccl.so`) is not installed in `/opt/rocm/lib/`, you may specify `NCCL_HOME` and `CUSTOM_RCCL_LIB`.

 ```shell
-$ make HIP_HOME=/path/to/hip NCCL_HOME=/path/to/rccl CUSTOM_RCCL_LIB=/path/to/rccl/lib/librccl.so
+$ make HIP_HOME=/path/to/hip NCCL_HOME=/path/to/rccl
 ```

-RCCL tests rely on MPI to work on multiple processes, hence multiple nodes. If you want to compile the tests with MPI support, you need to set MPI=1 and set MPI\_HOME to the path where MPI is installed.
+RCCL Tests rely on MPI to work on multiple processes, hence multiple nodes.
+
+> [!TIP]
+> To compile RCCL tests with MPI support, you need to set `MPI=1` and set `MPI_HOME` to the path where MPI is installed.

 ```shell
 $ make MPI=1 MPI_HOME=/path/to/mpi HIP_HOME=/path/to/hip NCCL_HOME=/path/to/rccl
 ```

-RCCL tests can also be built using cmake. A typical sequence will be:
+RCCL Tests can also be built using cmake. A typical sequence will be:

 ```shell
 $ mkdir build
 $ cd build
-$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/path/to/rccl ..
+$ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=/path/to/rocm ..
 $ make
 ```

-When using the cmake build procedure, please make sure that RCCL has also been built using cmake (i.e. not using the install.sh script), since cmake will check
-for cmake target and config files that are created during the RCCL build.
+When using the cmake build procedure for building RCCL-Tests with custom/user-built `librccl.so`, please make sure that RCCL has been installed (i.e. using `make install`) and not pointing to the RCCL `build` directory, since cmake will check for cmake target and config files. This is not necessary as one can modify `LD_LIBRARY_PATH` to point to the custom/user-built `librccl.so` when running RCCL Tests.

-Using the cmake method also has the advantage that the build is automatically checking for MPI installations. The tests can be compiled with MPI support by adding the `-DUSE_MPI=ON` flag to the cmake command line. A user can request to use a particular MPI library by setting the environment variable `MPI_HOME` or add the path of the MPI library to the cmake prefix path with `-DCMAKE_PREFIX_PATH`.
+Using the cmake method also has the advantage that it automatically checks for MPI installation during the build. The tests can be compiled with MPI support by adding the `-DUSE_MPI=ON` flag to the cmake command line.

+> [!TIP]
+> Users can choose to link against a particular MPI library by using one of these options:
+> * setting the environment variable `MPI_HOME`.
+> * by adding the path to the MPI library to the cmake prefix path with `-DCMAKE_PREFIX_PATH`.
+> * including the paths to MPI `bin` and `lib` in the `PATH` and `LD_LIBRARY_PATH` environment variables, respectively.
+
+e.g.,
+```shell
+$ mkdir build
+$ cd build
+$ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="/path/to/mpi;/path/to/rocm" -DUSE_MPI=ON ..
+$ make
+```
+
+By default, for both Makefile and `cmake` based builds, RCCL Tests will link against all supported GPU targets (defined in `src/Makefile` and as `DEFAULT_GPUS` in `CMakeLists.txt`).
+
+To target specific GPU(s), and potentially reduce build time, use:
+* `GPU_TARGETS` as a `,` separated string listing GPU(s) to target for Makefile based build.
+e.g. build RCCL-Tests using Makefile only for `gfx942` and `gfx950`. e.g.,
+    ```shell
+    $ GPU_TARGETS="gfx942,gfx950" make MPI=1 MPI_HOME=/path/to/mpi NCCL_HOME=/opt/rocm
+    ```
+* `-DGPU_TARGETS` as a `;` separated string listing GPU(s) to target for `cmake` based build.
+e.g. build RCCL-Tests using CMake for `gfx90a`, `gfx942` and `gfx1200`. e.g.,
+    ```shell
+    $ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="/path/to/mpi;/path/to/rocm" -DUSE_MPI=ON -DGPU_TARGETS="gfx90a;gfx942;gfx1200;" ..
+    ```
+* For CMake builds, we also have another flag `DBUILD_LOCAL_GPU_TARGET_ONLY` that queries and builds for the local GPU target only (similar to RCCL).
+    ```shell
+    $ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="/path/to/mpi;/path/to/rocm" -DUSE_MPI=ON -DBUILD_LOCAL_GPU_TARGET_ONLY=ON ..
+    ```
+
+`-DBUILD_LOCAL_GPU_TARGET_ONLY` will not work with `docker build`-based setups, as the docker build engine is unable to query the local GPU architecture. Please use `-DGPU_TARGETS` for CMake-based builds or `GPU_TARGETS` for Makefile-based builds when building RCCL-Tests using a Dockerfile and `docker build`.

 ## Usage

-RCCL tests can run on multiple processes, multiple threads, and multiple HIP devices per thread. The number of process is managed by MPI and is therefore not passed to the tests as argument. The total number of ranks (=HIP devices) will be equal to (number of processes)\*(number of threads)\*(number of GPUs per thread).
+RCCL Tests can run on multiple processes, multiple threads, and multiple HIP devices per thread. The number of process is managed by MPI and is therefore not passed to the tests as argument. The total number of ranks (=HIP devices) will be equal to (number of processes)\*(number of threads)\*(number of GPUs per thread).

 ### Quick examples

-Run on 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
+Run on single node with 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
 ```shell
 $ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
 ```

-Run with MPI on 10 processes (potentially on multiple nodes) with 4 GPUs each, for a total of 40 GPUs:
+Run 64 MPI processes on nodes with 8 GPUs each, for a total of 64 GPUs spread across 8 nodes :
+(NB: The rccl-tests binaries must be compiled with `MPI=1` for this case)
 ```shell
-$ mpirun -np 10 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
+$ mpirun -np 64 -N 8 ./build/all_reduce_perf -b 8 -e 8G -f 2 -g 1
 ```

-For performance-oriented runs, on both single-node and multi-node, we suggest using 1 MPI process per GPU and `-g 1`. So, a run on 8 GPUs looks like :
-```shell
-$ mpirun -np 8 --bind-to numa ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 1
-```
-Running with 1 MPI process per GPU ensures a 1:1 mapping for CPUs and GPUs, which can be beneficial for smaller message sizes and better represents the real-world use of RCCL in Deep Learning frameworks like Pytorch and TensorFlow.
+> [!TIP]
+> For performance-oriented runs, on both single-node and multi-node, we suggest using 1 MPI process per GPU and `-g 1`. So, a run on 8 GPUs looks like :
+> ```shell
+> $ mpirun -np 8 --bind-to numa ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 1
+> ```
+> Running with 1 MPI process per GPU ensures a 1:1 mapping for CPUs and GPUs, which can be beneficial for smaller message sizes and better represents the real-world use of RCCL in Deep Learning frameworks like Pytorch and TensorFlow.

 ### Performance

 See the [Performance](doc/PERFORMANCE.md) page for explanation about numbers, and in particular the "busbw" column.

-### Environment variables
-On some older versions of ROCm before 6.4.0, setting `HSA_NO_SCRATCH_RECLAIM=1`
- as part of the environment might be necessary to achieve better performance.  When running without MPI, a command similar to the following one should be sufficient:
+#### Environment variables
+On some earlier versions of ROCm (before ROCm 6.4.0), setting `HSA_NO_SCRATCH_RECLAIM=1` as part of the environment is necessary to achieve better performance on MI300 GPUs. When running without MPI, a command similar to the following one should be sufficient:
 ```shell
 HSA_NO_SCRATCH_RECLAIM=1 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
 ```

-For MPI, you might need to use a command similar to the following:
+For MPI (using MPICH), you need to use a command similar to the following:
 ```shell
 mpirun.mpich -np 8 -env NCCL_DEBUG=VERSION -env HSA_NO_SCRATCH_RECLAIM=1 ./build/all_reduce_perf -b 8M -e 128M -i 8388608 -g 1 -d bfloat16
 ```
@@ -89,37 +125,58 @@ All tests support the same set of arguments :
  * `-d,--datatype <nccltype/all>` Specify which datatype to use. Default : Float.
  * `-r,--root <root/all>` Specify which root to use. Only for operations with a root like broadcast or reduce. Default : 0.
  * `-y,--memory_type <coarse/fine/host/managed>` Default: Coarse
-  * `-s,--stress_cycles <number of cycles>` Default: 1
  * `-u,--cumask <d0,d1,d2,d3>` Default: None
 * Performance
  * `-n,--iters <iteration count>` number of iterations. Default : 20.
  * `-w,--warmup_iters <warmup iteration count>` number of warmup iterations (not timed). Default : 5.
  * `-m,--agg_iters <aggregation count>` number of operations to aggregate together in each iteration. Default : 1.
+  * `-N,--run_cycles <cycle count>` run & print each cycle. Default : 1; 0=infinite.
  * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1.
 * Test operation
  * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0.
  * `-c,--check <check iteration count>` perform count iterations, checking correctness of results on each iteration. This can be quite slow on large numbers of GPUs. Default : 1.
-  * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
-  * `-G,--cudagraph <num graph launches>` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0.
+  * `-z,--blocking <0/1>` Make RCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
+  * `-G,--hipgraph <num graph launches>` Capture iterations as a HIP graph and then replay specified number of times. Default : 0.
+  * `-C,--report_cputime <0/1>]` Report CPU time instead of latency. Default : 0.
+  * `-R,--local_register <1/0>` enable local buffer registration on send/recv buffers. Default : 0.
+  * `-T,--timeout <time in seconds>` timeout each test after specified number of seconds. Default : disabled.
  * `-F,--cache_flush <cache flush after every -F iteration>` Enable cache flush after every -F iteration. Default : 0 (No cache flush).
+  * `-q,--delay <delay>` Delay between out-of-place and in-place runs (in microseconds). Default: 10.
+* Parsing RCCL-Tests output
+  * `-Z,--output_format <csv|json>` Parse RCCL-Tests output as a CSV or JSON. Default : disabled.
+  * `-x,--output_file <output file name>` RCCL-Tests output file name. Default : disabled.
+
+### Running multiple operations in parallel
+
+RCCL Tests allow to partition the set of GPUs into smaller sets, each executing the same operation in parallel. 
+To split the GPUs, RCCL will compute a "color" for each rank, based on the `NCCL_TESTS_SPLIT` environment variable, then all ranks
+with the same color will end up in the same group. The resulting group is printed next to each GPU at the beginning of the test.
+
+`NCCL_TESTS_SPLIT` takes the following syntax: `<operation><value>`. Operation can be `AND`, `OR`, `MOD` or `DIV`. The `&`, `|`, `%`, and `/` symbols are also supported. The value can be either decimal, hexadecimal (prefixed by `0x`) or binary (prefixed by `0b`).
+
+`NCCL_TESTS_SPLIT_MASK="<value>"` is equivalent to `NCCL_TESTS_SPLIT="&<value>"`.
+
+Here are a few examples:
+ - `NCCL_TESTS_SPLIT="AND 0x7"` or `NCCL_TESTS_SPLIT="MOD 8`: On systems with 8 GPUs, run 8 parallel operations, each with 1 GPU per node (purely communicating on the network)
+ - `NCCL_TESTS_SPLIT="OR 0x7"` or `NCCL_TESTS_SPLIT="DIV 8"`: On systems with 8 GPUs, run one operation per node, purely intra-node.
+ - `NCCL_TESTS_SPLIT="AND 0x1"` or `NCCL_TESTS_SPLIT="MOD 2"`: Run two operations, each operation using every other rank.
+
+Note that the reported bandwidth is per group, hence to get the total bandwidth used by all groups, one must multiply by the number of groups.

 ## Unit tests

-Unit tests for rccl-tests are implemented with pytest (python3 is also required).  Several notes for the unit tests:
+Unit tests for rccl-tests are implemented with pytest (python3 is also required). Several notes for the unit tests:

-1. The LD_LIBRARY_PATH environment variable will need to be set to include /path/to/rccl-install/lib/ in order to run the unit tests.
-2. The HSA_FORCE_FINE_GRAIN_PCIE environment variable will need to be set to 1 in order to run the unit tests which use fine-grained memory type.
+1. The `LD_LIBRARY_PATH` environment variable will need to be set to include `/path/to/rccl-install/lib/` in order to run the unit tests.
+2. The `HSA_FORCE_FINE_GRAIN_PCIE` environment variable will need to be set to 1 in order to run the unit tests which use fine-grained memory type.

-The unit tests can be invoked within the rccl-tests root, or in the test subfolder.  An example call to the unit tests:
+The unit tests can be invoked within the rccl-tests root, or in the test subfolder. An example call to the unit tests:
 ```shell
 $ LD_LIBRARY_PATH=/path/to/rccl-install/lib/ HSA_FORCE_FINE_GRAIN_PCIE=1 python3 -m pytest
 ```

 ## Copyright

-RCCL tests are provided under the BSD license.
-
-All source code and accompanying documentation is copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
-
-All modifications are copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.

+All modifications are copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
@@ -0,0 +1,40 @@
+# MIT License
+#
+# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# These overrides are due to CMake CHECK_SYMBOL_EXISTS modifying CMAKE_CXX_FLAGS to do a test compile,
+# while ROCMChecks gives a warning if this variable is modified manually without a target.
+
+# We now choose to disable ROCMChecks for this one case.
+
+set(DISABLE_ROCM_CHECK OFF)
+
+function(rocm_check_toolchain_var var access value list_file)
+  if(NOT DISABLE_ROCM_CHECK)
+    _rocm_check_toolchain_var("${var}" "${access}" "${value}" "${list_file}")
+  endif()
+endfunction()
+
+macro(CHECK_SYMBOL_EXISTS)
+  set(DISABLE_ROCM_CHECK ON)
+  _check_symbol_exists(${ARGN})
+  set(DISABLE_ROCM_CHECK OFF)
+endmacro()
@@ -0,0 +1,124 @@
+# MIT License
+#
+# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Dependencies
+
+# HIP dependency is handled earlier in the project cmake file
+# when VerifyCompiler.cmake is included.
+
+# GIT
+
+# Test dependencies
+
+
+# Find or download/install rocm-cmake project
+set(PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern)
+find_package(ROCmCMakeBuildTools 0.7.3 QUIET CONFIG PATHS "${ROCM_PATH}")
+if(NOT ROCmCMakeBuildTools_FOUND)
+    set(rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download")
+    file(
+        DOWNLOAD https://github.com/ROCm/rocm-cmake/archive/${rocm_cmake_tag}.zip
+        ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip
+        STATUS rocm_cmake_download_status LOG rocm_cmake_download_log
+    )
+    list(GET rocm_cmake_download_status 0 rocm_cmake_download_error_code)
+    if(rocm_cmake_download_error_code)
+        message(FATAL_ERROR "Error: downloading "
+            "https://github.com/ROCm/rocm-cmake/archive/${rocm_cmake_tag}.zip failed "
+            "error_code: ${rocm_cmake_download_error_code} "
+            "log: ${rocm_cmake_download_log} "
+        )
+    endif()
+
+    execute_process(
+        COMMAND ${CMAKE_COMMAND} -E tar xzf ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip
+        WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}
+        RESULT_VARIABLE rocm_cmake_unpack_error_code
+    )
+    execute_process( COMMAND ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_EXTERN_DIR}/rocm-cmake .
+      WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag} )
+    execute_process( COMMAND ${CMAKE_COMMAND} --build rocm-cmake-${rocm_cmake_tag} --target install
+      WORKING_DIRECTORY ${PROJECT_EXTERN_DIR})
+
+    if(rocm_cmake_unpack_error_code)
+        message(FATAL_ERROR "Error: unpacking ${CMAKE_CURRENT_BINARY_DIR}/rocm-cmake-${rocm_cmake_tag}.zip failed")
+    endif()
+    find_package(ROCmCMakeBuildTools 0.7.3 REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake )
+endif()
+
+# Find available local ROCM targets
+# NOTE: This will eventually be part of ROCm-CMake and should be removed at that time
+function(rocm_local_targets VARIABLE)
+  set(${VARIABLE} "NOTFOUND" PARENT_SCOPE)
+  find_program(_rocm_agent_enumerator rocm_agent_enumerator HINTS ocm/bin ENV ROCM_PATH)
+  if(NOT _rocm_agent_enumerator STREQUAL "_rocm_agent_enumerator-NOTFOUND")
+    execute_process(
+      COMMAND "${_rocm_agent_enumerator}"
+      RESULT_VARIABLE _found_agents
+      OUTPUT_VARIABLE _rocm_agents
+      ERROR_QUIET
+      )
+    if (_found_agents EQUAL 0)
+      string(REPLACE "\n" ";" _rocm_agents "${_rocm_agents}")
+      unset(result)
+      foreach (agent IN LISTS _rocm_agents)
+        if (NOT agent STREQUAL "gfx000")
+          list(APPEND result "${agent}")
+        endif()
+      endforeach()
+      if(result)
+        list(REMOVE_DUPLICATES result)
+        set(${VARIABLE} "${result}" PARENT_SCOPE)
+      endif()
+    endif()
+  endif()
+endfunction()
+
+# Iterate over the "source" list and check if there is a duplicate file name
+# NOTE: This is due to compiler bug '--save-temps' and can be removed when fix availabe
+function(add_file_unique FILE_LIST FILE)
+  get_filename_component(FILE_NAME "${FILE}" NAME)
+
+  # Iterate over whatever is in the list so far
+  foreach(curr_file IN LISTS ${FILE_LIST})
+    get_filename_component(curr_file_name ${curr_file} NAME)
+
+    # Check if duplicate
+    if(${FILE_NAME} STREQUAL ${curr_file_name})
+      get_filename_component(DIR_PATH "${FILE}" DIRECTORY)
+      get_filename_component(FILE_NAME_WE "${FILE}" NAME_WE)
+      get_filename_component(FILE_EXT "${FILE}" EXT)
+
+      # Construct a new file name by adding _tmp
+      set(HIP_FILE "${DIR_PATH}/${FILE_NAME_WE}_tmp${FILE_EXT}" PARENT_SCOPE)
+    endif()
+  endforeach()
+endfunction()
+
+include(ROCMSetupVersion)
+include(ROCMCreatePackage)
+include(ROCMInstallTargets)
+include(ROCMPackageConfigHelpers)
+include(ROCMInstallSymlinks)
+include(ROCMCheckTargetIds)
+include(ROCMClients)
+include(ROCMHeaderWrapper)
@@ -140,5 +140,6 @@ To obtain a bus bandwidth which should be independent of the number of ranks _n_
 * AllGather : (_n_-1)/_n_
 * Broadcast : 1
 * Reduce : 1
+* AlltoAll: (_n_-1)/_n_

 The bus bandwidth should reflect the speed of the hardware bottleneck : NVLink, PCI, QPI, or network.
@@ -128,7 +128,7 @@ add_library(rccl_common OBJECT ${HIP_COMMON_SOURCES})
 add_dependencies(rccl_common hipify git_version_check)
 target_link_libraries(rccl_common roc::rccl hip::device)
 if(USE_MPI)
-    target_link_libraries(rccl_common MPI::MPI_CXX)
+  target_link_libraries(rccl_common MPI::MPI_CXX)
 endif()

 # Compile tests
@@ -1,32 +1,96 @@
 #
 # Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
-# Modifications are Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
+# Modifications are Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
 #
 # See LICENSE.txt for license information
 #

 ROCM_PATH ?= /opt/rocm
-MPI_HOME ?= /usr/lib/openmpi
+MPI_HOME ?= /usr/lib/x86_64-linux-gnu
 PREFIX ?= /usr/local
 VERBOSE ?= 0
 DEBUG ?= 0
 NCCL_HOME ?= ""
+CUSTOM_RCCL_LIB ?= ""

-HIPCC = $(ROCM_PATH)/bin/hipcc
+HIPCC = $(ROCM_PATH)/bin/amdclang++
+HIPCONFIG = $(ROCM_PATH)/bin/hipconfig
 CXX = $(HIPCC)

 HIPCUFLAGS := -std=c++14
 LDFLAGS    :=
 HIPLDFLAGS :=

+HIP_VERSION = $(strip $(shell which $(HIPCONFIG) >/dev/null && $(HIPCONFIG) --version))
+HIP_MAJOR = $(shell echo $(HIP_VERSION) | cut -d "." -f 1)
+HIP_MINOR = $(shell echo $(HIP_VERSION) | cut -d "." -f 2)
+
+# Better define GPU_TARGETS in your environment to the minimal set
+# of archs to reduce compile time.
+# Currently, supports gfx906,gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1200,gfx1201
+ifndef GPU_TARGETS
+GPU_TARGETS = gfx906 gfx908 gfx90a
+  ifeq ($(shell test "0$(HIP_MAJOR)" -eq 6; echo $$?),0)
+    # Include gfx942 support if we're using ROCm 6.0 or above
+    GPU_TARGETS += gfx942
+    ifeq ($(shell test "0$(HIP_MINOR)" -ge 5; echo $$?),0)
+    # Include gfx950 support if we're using ROCm 6.5 or above
+    GPU_TARGETS += gfx950
+    endif
+  endif
+GPU_TARGETS += gfx1030 gfx1100 gfx1101 gfx1102 gfx1200 gfx1201
+endif
+
+GPU_TARGETS_FLAGS = $(foreach target,$(GPU_TARGETS),"--offload-arch=$(target)")
+
+#CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
+#CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
+#CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
+#
+## Better define NVCC_GENCODE in your environment to the minimal set
+## of archs to reduce compile time.
+#ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
+## Include Blackwell support if we're using CUDA12.8 or above
+#NVCC_GENCODE ?=	-gencode=arch=compute_80,code=sm_80 \
+#		-gencode=arch=compute_90,code=sm_90 \
+#		-gencode=arch=compute_100,code=sm_100 \
+#		-gencode=arch=compute_120,code=sm_120 \
+#		-gencode=arch=compute_120,code=compute_120
+#else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 12; echo $$?),0)
+#NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
+#                -gencode=arch=compute_61,code=sm_61 \
+#                -gencode=arch=compute_70,code=sm_70 \
+#		-gencode=arch=compute_80,code=sm_80 \
+#		-gencode=arch=compute_90,code=sm_90 \
+#		-gencode=arch=compute_90,code=compute_90
+#else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
+#NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
+#                -gencode=arch=compute_61,code=sm_61 \
+#                -gencode=arch=compute_70,code=sm_70 \
+#		-gencode=arch=compute_80,code=sm_80 \
+#		-gencode=arch=compute_80,code=compute_80
+#else
+#NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
+#                -gencode=arch=compute_50,code=sm_50 \
+#                -gencode=arch=compute_60,code=sm_60 \
+#                -gencode=arch=compute_61,code=sm_61 \
+#                -gencode=arch=compute_70,code=sm_70 \
+#                -gencode=arch=compute_70,code=compute_70
+#endif
+
 ifneq ($(NCCL_HOME), "")
 HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
-HIPLDFLAGS   += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME) -L$(NCCL_HOME)/lib
+HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME) -L$(NCCL_HOME)/lib
 endif
+
 HIPCUFLAGS += -I$(ROCM_PATH)/include
 HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
+HIPCUFLAGS += -x hip -D__HIP_PLATFORM_AMD__ -D__HIPCC__ $(GPU_TARGETS_FLAGS)
 LDFLAGS    += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
-HIPLDFLAGS += -L$(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt -pthread
+ifneq ($(CUSTOM_RCCL_LIB), "")
+HIPLDFLAGS += -L$(CUSTOM_RCCL_LIB)
+endif
+HIPLDFLAGS += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lamdhip64 -lstdc++ -lrt -pthread

 ifeq ($(DEBUG), 0)
 HIPCUFLAGS += -O3
@@ -46,20 +110,20 @@ HIPIFY_DIR ?= $(BUILDDIR)/hipify
 .PRECIOUS: $(HIPIFY_DIR)/%.cu.cpp $(HIPIFY_DIR)/%.h

 ifeq ($(MPI), 1)
-HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi
-HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi
+HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/openmpi -I${MPI_HOME}/openmpi/include -I${MPI_HOME}/openmpi/include/openmpi
+HIPLDFLAGS += -L${MPI_HOME}/lib -L${MPI_HOME}/openmpi/lib -lmpi
 else ifeq ($(MPICH), 1)
-HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich -I/usr/include/x86_64-linux-gnu/mpich
-HIPLDFLAGS += -L/usr/lib -lmpich
+HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/mpich/include -I/usr/include/x86_64-linux-gnu/mpich
+HIPLDFLAGS += -L${MPI_HOME}/lib -L${MPI_HOME}/mpich/lib -lmpich
 endif

 LIBRARIES += rccl
-HIPLDFLAGS   += $(LIBRARIES:%=-l%)
+HIPLDFLAGS += $(LIBRARIES:%=-l%)

 DST_DIR := $(BUILDDIR)
 SRC_FILES := $(wildcard *.cu)
 OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
-BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv alltoallv
+BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv alltoallv hypercube
 BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)

 GIT_VERSION_FILE := ${DST_DIR}/src/git_version.cpp
@@ -94,8 +158,8 @@ ${HIPIFY_DIR}/%.h: %.h
 ${DST_DIR}/%.o: ${HIPIFY_DIR}/%.cu.cpp ${HIPIFY_DIR}/common.h $(TEST_VERIFIABLE_HDRS) $(GIT_VERSION_FILE)
 	@printf "Compiling  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
-	echo "$(HIPCC) -o $@ $(HIPCUFLAGS) -I. -c $<"
-	$(HIPCC) -o $@ $(HIPCUFLAGS) -I. -c $<
+	echo "$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<"
+	$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<

 ${DST_DIR}/timer.o: timer.cc timer.h
 	@printf "Compiling  %-35s > %s\n" $< $@
@@ -105,6 +169,6 @@ ${DST_DIR}/timer.o: timer.cc timer.h
 ${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) $(DST_DIR)/src/git_version.cpp
 	@printf "Linking  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
-	echo "$(HIPCC) -o $@ $(HIPCUFLAGS) $^ ${HIPLDFLAGS}"
-	$(HIPCC) -o $@ $(HIPCUFLAGS) $^ ${HIPLDFLAGS}
+	echo "$(HIPCC) -o $@ $^ $(HIPLDFLAGS)"
+	$(HIPCC) -o $@ $^ $(HIPLDFLAGS)

@@ -8,10 +8,8 @@
 #include "cuda_runtime.h"
 #include "common.h"

-#define ALIGN 4
-
-void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
-  size_t base = (count/(ALIGN*nranks))*ALIGN;
+void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  size_t base = (count/nranks) & -(16/eltSize);
  *sendcount = base;
  *recvcount = base*nranks;
  *sendInplaceOffset = base;
@@ -61,7 +59,7 @@ struct testColl allGatherTest = {

 void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  AllGatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  AllGatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }

 testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
@@ -8,7 +8,7 @@
 #include "cuda_runtime.h"
 #include "common.h"

-void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
  *sendcount = count;
  *recvcount = count;
  *sendInplaceOffset = 0;
@@ -56,7 +56,7 @@ struct testColl allReduceTest = {

 void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }

 testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
@@ -8,12 +8,12 @@
 #include "cuda_runtime.h"
 #include "common.h"

-void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
-  *sendcount = (count/nranks)*nranks;
-  *recvcount = (count/nranks)*nranks;
+void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  *paramcount = (count/nranks) & -(16/eltSize);
+  *sendcount = nranks*(*paramcount);
+  *recvcount = *sendcount;
  *sendInplaceOffset = 0;
  *recvInplaceOffset = 0;
-  *paramcount = count/nranks;
 }

 testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
@@ -61,7 +61,7 @@ struct testColl alltoAllTest = {

 void AlltoAllGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  AlltoAllGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  AlltoAllGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }

 testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
@@ -10,7 +10,7 @@

 #define USE_RCCL_GATHER_SCATTER

-void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
  if (count < nranks*nranks/2) {
    *sendcount = 0;
    *recvcount = 0;
@@ -18,11 +18,11 @@ void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *par
    *recvInplaceOffset = 0;
    *paramcount = 0;
  } else {
-    *sendcount = (count/nranks)*nranks;
-    *recvcount = (count/nranks)*nranks;
+    *paramcount = (count/nranks) & -(16/eltSize);
+    *sendcount = nranks*(*paramcount);
+    *recvcount = *sendcount;
    *sendInplaceOffset = 0;
    *recvInplaceOffset = 0;
-    *paramcount = count/nranks;
  }
 }

@@ -161,7 +161,7 @@ struct testColl alltoAllTest = {

 void AlltoAllvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  AlltoAllvGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  AlltoAllvGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }

 testResult_t AlltoAllvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
@@ -8,7 +8,7 @@
 #include "cuda_runtime.h"
 #include "common.h"

-void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
  *sendcount = count;
  *recvcount = count;
  *sendInplaceOffset = 0;
@@ -65,7 +65,7 @@ struct testColl broadcastTest = {

 void BroadcastGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  BroadcastGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  BroadcastGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }

 testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
@@ -16,6 +16,8 @@
 #include <type_traits>
 #include <getopt.h>
 #include <libgen.h>
+#include <string.h>
+#include <ctype.h>
 #include "cuda.h"
 #include <vector>
 #include <utility>
@@ -90,13 +92,13 @@ static int datacheck = 1;
 static int warmup_iters = 5;
 static int iters = 20;
 static int agg_iters = 1;
+static int run_cycles = 1;
 static int ncclop = ncclSum;
 static int nccltype = ncclFloat;
 static int ncclroot = 0;
 static int parallel_init = 0;
 static int blocking_coll = 0;
 static int memorytype = 0;
-static int stress_cycles = 1;
 static uint32_t cumask[4];
 static int streamnull = 0;
 static int timeout = 0;
@@ -121,6 +123,7 @@ Reporter::Reporter(std::string fileName, std::string outputFormat) : _outputForm
      _out = std::ofstream(fileName, std::ios_base::out);
      _outputValid = true;
      if (_outputFormat == "csv") {
+        _out << "numCycle, ";
        _out << "collective, ";
 #ifdef MPI_SUPPORT
        _out << "ranks, rankspernode, gpusperrank, ";
@@ -133,10 +136,11 @@ Reporter::Reporter(std::string fileName, std::string outputFormat) : _outputForm
  }
 }

-void Reporter::setParameters(const char* name, const char* typeName, const char* opName) {
+void Reporter::setParameters(const size_t numCycle, const char* name, const char* typeName, const char* opName) {
  if (!isMainThread() || !_outputValid)
    return;

+  _numCycle = numCycle;
  _collectiveName = name;
  _typeName = typeName;
  _opName = opName;
@@ -150,6 +154,7 @@ void Reporter::addResult(int gpusPerRank, int ranksPerNode, int totalRanks, size
  std::string wrongEltsStr = (wrongElts == -1) ? "N/A" : std::to_string(wrongElts);
  int nodes = totalRanks / ranksPerNode;

+  outputValuesKeys.push_back(makeValueKeyPair(_numCycle, "numCycle"));
  outputValuesKeys.push_back(makeValueKeyPair(_collectiveName, "name"));
 #ifdef MPI_SUPPORT
  outputValuesKeys.push_back(makeValueKeyPair(nodes, "nodes"));
@@ -614,8 +619,8 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
  Barrier(args);

 #if HIP_VERSION >= 50221310
-  cudaGraph_t graphs[args->nGpus];
-  cudaGraphExec_t graphExec[args->nGpus];
+  std::vector<cudaGraph_t> graphs(args->nGpus);
+  std::vector<cudaGraphExec_t> graphExec(args->nGpus);
  if (cudaGraphLaunches >= 1) {
    // Begin cuda graph capture
    for (int i=0; i<args->nGpus; i++) {
@@ -642,11 +647,11 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
  if (cudaGraphLaunches >= 1) {
    // End cuda graph capture
    for (int i=0; i<args->nGpus; i++) {
-      CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
+      CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs.data()+i));
    }
    // Instantiate cuda graph
    for (int i=0; i<args->nGpus; i++) {
-      CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
+      CUDACHECK(cudaGraphInstantiate(graphExec.data()+i, graphs[i], NULL, NULL, 0));
    }
    // Resync CPU, restart timing, launch cuda graph
    Barrier(args);
@@ -705,11 +710,11 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
      if (cudaGraphLaunches >= 1) {
        // End cuda graph capture
        for (int i=0; i<args->nGpus; i++) {
-          CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
+          CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs.data()+i));
        }
        // Instantiate cuda graph
        for (int i=0; i<args->nGpus; i++) {
-          CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
+          CUDACHECK(cudaGraphInstantiate(graphExec.data()+i, graphs[i], NULL, NULL, 0));
        }
        // Launch cuda graph
        for (int i=0; i<args->nGpus; i++) {
@@ -774,7 +779,7 @@ void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) {
  size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset;

  count = size / wordSize(type);
-  args->collTest->getCollByteCount(&sendCount, &recvCount, &paramCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks);
+  args->collTest->getCollByteCount(&sendCount, &recvCount, &paramCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, wordSize(type), (size_t)nranks);

  args->nbytes = paramCount * wordSize(type);
  args->sendBytes = sendCount * wordSize(type);
@@ -790,8 +795,8 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
  // Warm-up for large size
  setupArgs(args->maxbytes, type, args);
 #if HIP_VERSION >= 50221310
-  cudaGraph_t graphs[args->nGpus];
-  cudaGraphExec_t graphExec[args->nGpus];
+  std::vector<cudaGraph_t> graphs(args->nGpus);
+  std::vector<cudaGraphExec_t> graphExec(args->nGpus);
  if (cudaGraphLaunches >= 1) {
    // Begin cuda graph capture
    for (int i=0; i<args->nGpus; i++) {
@@ -811,11 +816,11 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
  if (cudaGraphLaunches >= 1) {
    // End cuda graph capture
    for (int i=0; i<args->nGpus; i++) {
-      CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
+      CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs.data()+i));
    }
    // Instantiate cuda graph
    for (int i=0; i<args->nGpus; i++) {
-      CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
+      CUDACHECK(cudaGraphInstantiate(graphExec.data()+i, graphs[i], NULL, NULL, 0));
    }
    // Resync CPU, restart timing, launch cuda graph
    Barrier(args);
@@ -861,11 +866,11 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
  if (cudaGraphLaunches >= 1) {
    // End cuda graph capture
    for (int i=0; i<args->nGpus; i++) {
-      CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
+      CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs.data()+i));
    }
    // Instantiate cuda graph
    for (int i=0; i<args->nGpus; i++) {
-      CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
+      CUDACHECK(cudaGraphInstantiate(graphExec.data()+i, graphs[i], NULL, NULL, 0));
    }
    // Resync CPU, restart timing, launch cuda graph
    Barrier(args);
@@ -889,26 +894,31 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
  }
 #endif

-  if (args->reporter) {
-    args->reporter->setParameters(args->collTest->name, typeName, opName);
-  }
+  // Benchmark
+  long repeat = run_cycles;
+  size_t iter = 0;

-  for (size_t iter = 0; iter < stress_cycles; iter++) {
-    if (iter > 0) PRINT("# Testing %lu cycle.\n", iter+1);
-    // Benchmark
-    for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
-        setupArgs(size, type, args);
-        char rootName[100];
-        sprintf(rootName, "%6i", root);
-        PRINT("%12li  %12li  %8s  %6s  %6s", std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
-        if (enable_out_of_place) {
-          TESTCHECK(BenchTime(args, type, op, root, 0));
-          usleep(delay_inout_place);
-        }
-        TESTCHECK(BenchTime(args, type, op, root, 1));
-        PRINT("\n");
+  do {
+    if (run_cycles > 1) PRINT("# Testing %lu cycle.\n", iter+1);
+    if (args->reporter) {
+      args->reporter->setParameters(iter, args->collTest->name, typeName, opName);
    }
-  }
+    for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
+      setupArgs(size, type, args);
+      char rootName[100];
+      sprintf(rootName, "%6i", root);
+      PRINT("%12li  %12li  %8s  %6s  %6s", std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
+      if (enable_out_of_place) {
+        TESTCHECK(BenchTime(args, type, op, root, 0));
+        usleep(delay_inout_place);
+      }
+      TESTCHECK(BenchTime(args, type, op, root, 1));
+      PRINT("\n");
+    }
+    --repeat;
+    ++iter;
+  } while(repeat != 0);
+
  return testSuccess;
 }

@@ -1052,26 +1062,27 @@ int main(int argc, char* argv[]) {
    {"iters", required_argument, 0, 'n'},
    {"agg_iters", required_argument, 0, 'm'},
    {"warmup_iters", required_argument, 0, 'w'},
+    {"run_cycles", required_argument, 0, 'N'},
    {"parallel_init", required_argument, 0, 'p'},
    {"check", required_argument, 0, 'c'},
    {"op", required_argument, 0, 'o'},
    {"datatype", required_argument, 0, 'd'},
    {"root", required_argument, 0, 'r'},
    {"blocking", required_argument, 0, 'z'},
-    {"memory_type", required_argument, 0, 'y'}, //RCCL
-    {"stress_cycles", required_argument, 0, 's'}, //RCCL
-    {"cumask", required_argument, 0, 'u'},        //RCCL
-    {"stream_null", required_argument, 0, 'y'}, //NCCL
-    {"timeout", required_argument, 0, 'T'},     //NCCL
+    {"stream_null", required_argument, 0, 'y'},
+    {"timeout", required_argument, 0, 'T'},
    {"cudagraph", required_argument, 0, 'G'},
    {"report_cputime", required_argument, 0, 'C'},
    {"average", required_argument, 0, 'a'},
-    {"out_of_place", required_argument, 0, 'O'},
-    {"cache_flush", required_argument, 0, 'F'},
-    {"rotating_tensor", required_argument, 0, 'E'},
    {"local_register", required_argument, 0, 'R'},
-    {"output_file", required_argument, 0, 'x'},
-    {"output_format", required_argument, 0, 'Z'},
+    {"memory_type", required_argument, 0, 'y'},       //RCCL
+    {"cumask", required_argument, 0, 'u'},            //RCCL
+    {"out_of_place", required_argument, 0, 'O'},      //RCCL
+    {"delay_inout_place", required_argument, 0, 'q'}, //RCCL
+    {"cache_flush", required_argument, 0, 'F'},       //RCCL
+    {"rotating_tensor", required_argument, 0, 'E'},   //RCCL
+    {"output_file", required_argument, 0, 'x'},       //RCCL
+    {"output_format", required_argument, 0, 'Z'},     //RCCL
    {"help", no_argument, 0, 'h'},
    {}
  };
@@ -1079,7 +1090,7 @@ int main(int argc, char* argv[]) {
  while(1) {
    int c;

-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:O:F:E:R:a:y:s:u:h:q:x:Z:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:G:C:a:R:Y:u:O:q:F:E:x:Z:h", longopts, &longindex);

    if (c == -1)
      break;
@@ -1108,7 +1119,12 @@ int main(int argc, char* argv[]) {
        maxBytes = (size_t)parsed;
        break;
      case 'i':
-        stepBytes = strtol(optarg, NULL, 0);
+        parsed = parsesize(optarg);
+        if (parsed < 0) {
+          fprintf(stderr, "invalid size specified for 'stepBytes'\n");
+          return -1;
+        }
+        stepBytes = (size_t)parsed;
        break;
      case 'f':
        stepFactor = strtol(optarg, NULL, 0);
@@ -1126,12 +1142,15 @@ int main(int argc, char* argv[]) {
      case 'w':
        warmup_iters = (int)strtol(optarg, NULL, 0);
        break;
-      case 'c':
-        datacheck = (int)strtol(optarg, NULL, 0);
+      case 'N':
+        run_cycles = (int)strtol(optarg, NULL, 0);
        break;
      case 'p':
        parallel_init = (int)strtol(optarg, NULL, 0);
        break;
+      case 'c':
+        datacheck = (int)strtol(optarg, NULL, 0);
+        break;
      case 'o':
        ncclop = ncclstringtoop(optarg);
        break;
@@ -1144,22 +1163,6 @@ int main(int argc, char* argv[]) {
      case 'z':
        blocking_coll = strtol(optarg, NULL, 0);
        break;
-      case 'Y':
-        memorytype = ncclstringtomtype(optarg);
-        break;
-      case 's':
-        stress_cycles = strtol(optarg, NULL, 0);
-        break;
-      case 'u':
-        {
-          int nmasks = 0;
-          char *mask = strtok(optarg, ",");
-          while (mask != NULL && nmasks < 4) {
-            cumask[nmasks++] = strtol(mask, NULL, 16);
-            mask = strtok(NULL, ",");
-          };
-        }
-	break;
      case 'y':
        streamnull = strtol(optarg, NULL, 0);
        break;
@@ -1176,9 +1179,37 @@ int main(int argc, char* argv[]) {
      case 'C':
        report_cputime = strtol(optarg, NULL, 0);
        break;
+      case 'a':
+        average = (int)strtol(optarg, NULL, 0);
+        break;
+      case 'R':
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
+        if ((int)strtol(optarg, NULL, 0)) {
+          local_register = 1;
+        }
+#else
+        printf("Option -R (register) is not supported before NCCL 2.19. Ignoring\n");
+#endif
+        break;
+      case 'Y':
+        memorytype = ncclstringtomtype(optarg);
+        break;
+      case 'u':
+        {
+          int nmasks = 0;
+          char *mask = strtok(optarg, ",");
+          while (mask != NULL && nmasks < 4) {
+            cumask[nmasks++] = strtol(mask, NULL, 16);
+            mask = strtok(NULL, ",");
+          };
+        }
+	break;
      case 'O':
        enable_out_of_place = strtol(optarg, NULL, 0);
        break;
+      case 'q':
+        delay_inout_place = (int)strtol(optarg, NULL, 10);
+	break;
      case 'F':
        enable_cache_flush = strtol(optarg, NULL, 0);
        if (enable_cache_flush > 0) {
@@ -1190,20 +1221,6 @@ int main(int argc, char* argv[]) {
      case 'E':
        enable_rotating_tensor = strtol(optarg, NULL, 0);
        break;
-      case 'a':
-        average = (int)strtol(optarg, NULL, 0);
-        break;
-      case 'q':
-        delay_inout_place = (int)strtol(optarg, NULL, 10);
-      case 'R':
-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
-        if ((int)strtol(optarg, NULL, 0)) {
-          local_register = 1;
-        }
-#else
-        printf("Option -R (register) is not supported before NCCL 2.19. Ignoring\n");
-#endif
-        break;
      case 'x':
        output_file = optarg;
        break;
@@ -1223,6 +1240,7 @@ int main(int argc, char* argv[]) {
            "[-n,--iters <iteration count>] \n\t"
            "[-m,--agg_iters <aggregated iteration count>] \n\t"
            "[-w,--warmup_iters <warmup iteration count>] \n\t"
+            "[-N,--run_cycles <cycle count> run & print each cycle (default: 1; 0=infinite)] \n\t"
            "[-p,--parallel_init <0/1>] \n\t"
            "[-c,--check <check iteration count>] \n\t"
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
@@ -1235,19 +1253,18 @@ int main(int argc, char* argv[]) {
            "[-d,--datatype <nccltype/all>] \n\t"
            "[-r,--root <root/all>] \n\t"
            "[-z,--blocking <0/1>] \n\t"
-            "[-Y,--memory_type <coarse/fine/host/managed>] \n\t"
-            "[-s,--stress_cycles <number of cycles>] \n\t"
-            "[-u,--cumask <d0,d1,d2,d3>] \n\t"
            "[-y,--stream_null <0/1>] \n\t"
            "[-T,--timeout <time in seconds>] \n\t"
            "[-G,--cudagraph <num graph launches>] \n\t"
            "[-C,--report_cputime <0/1>] \n\t"
+            "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
+            "[-R,--local_register <1/0> enable local buffer registration on send/recv buffers (default: disable)] \n\t"
+            "[-Y,--memory_type <coarse/fine/host/managed>] \n\t"
+            "[-u,--cumask <d0,d1,d2,d3>] \n\t"
 	    "[-O,--out_of_place <0/1>] \n\t"
+            "[-q,--delay <delay between out-of-place and in-place in microseconds>] \n\t"
 	    "[-F,--cache_flush <number of iterations between instruction cache flush>] \n\t"
 	    "[-E,--rotating_tensor <0/1>] \n\t"
-            "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
-            "[-q,--delay <delay between out-of-place and in-place in microseconds>] \n\t"
-            "[-R,--local_register <1/0> enable local buffer registration on send/recv buffers (default: disable)] \n\t"
            "[-x,--output_file <output file name>] \n\t"
            "[-Z,--output_format <output format <csv|json>] \n\t"
            "[-h,--help]\n",
@@ -1283,6 +1300,26 @@ int main(int argc, char* argv[]) {
  return 0;
 }

+#ifdef MPI_SUPPORT
+// parse int for base 2/10/16, will ignore first whitespaces
+static bool parseInt(char *s, int *num) {
+  char *p = NULL;
+  if (!s || !num)
+    return false;
+  while (*s && isspace(*s)) ++s;
+  if (!*s) return false;
+
+  if (strncasecmp(s, "0b", 2) == 0)
+    *num = (int)strtoul(s + 2, &p, 2);
+  else
+    *num = (int)strtoul(s, &p, 0);
+
+  if (p == s)
+    return false;
+  return true;
+}
+#endif
+
 testResult_t run() {
  int totalProcs = 1, proc = 0, ncclProcs = 1, ncclProc = 0, color = 0;
  int localRank = 0;
@@ -1293,18 +1330,41 @@ testResult_t run() {
 #ifdef MPI_SUPPORT
  MPI_Comm_size(MPI_COMM_WORLD, &totalProcs);
  MPI_Comm_rank(MPI_COMM_WORLD, &proc);
-  uint64_t hostHashs[totalProcs];
+  std::vector<uint64_t> hostHashs(totalProcs);
  hostHashs[proc] = getHostHash(hostname);
-  MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD);
+  MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs.data(), sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD);
  for (int p=0; p<totalProcs; p++) {
    if (p == proc) break;
    if (hostHashs[p] == hostHashs[proc]) localRank++;
  }

-  char* str = getenv("NCCL_TESTS_SPLIT_MASK");
-  uint64_t mask = str ? strtoul(str, NULL, 16) : 0;
+  char *splitMaskEnv = NULL;
+  if ((splitMaskEnv = getenv("NCCL_TESTS_SPLIT_MASK"))) {
+    color = proc & strtoul(splitMaskEnv, NULL, 16);
+  } else if ((splitMaskEnv = getenv("NCCL_TESTS_SPLIT"))) {
+    if (
+      (strncasecmp(splitMaskEnv, "AND", strlen("AND")) == 0 && parseInt(splitMaskEnv + strlen("AND"), &color)) ||
+      (strncasecmp(splitMaskEnv, "&", strlen("&")) == 0 && parseInt(splitMaskEnv + strlen("&"), &color))
+    )
+        color = proc & color;
+    if (
+      (strncasecmp(splitMaskEnv, "OR", strlen("OR")) == 0 && parseInt(splitMaskEnv + strlen("OR"), &color)) ||
+      (strncasecmp(splitMaskEnv, "|", strlen("|")) == 0 && parseInt(splitMaskEnv + strlen("|"), &color))
+    )
+        color = proc | color;
+    if (
+      (strncasecmp(splitMaskEnv, "MOD", strlen("MOD")) == 0 && parseInt(splitMaskEnv + strlen("MOD"), &color)) ||
+      (strncasecmp(splitMaskEnv, "%", strlen("%")) == 0 && parseInt(splitMaskEnv + strlen("%"), &color))
+    )
+        color = proc % color;
+    if (
+      (strncasecmp(splitMaskEnv, "DIV", strlen("DIV")) == 0 && parseInt(splitMaskEnv + strlen("DIV"), &color)) ||
+      (strncasecmp(splitMaskEnv, "/", strlen("/")) == 0 && parseInt(splitMaskEnv + strlen("/"), &color))
+    )
+        color = proc / color;
+  }
+
  MPI_Comm mpi_comm;
-  color = proc & mask;
  MPI_Comm_split(MPI_COMM_WORLD, color, proc, &mpi_comm);
  MPI_Comm_size(mpi_comm, &ncclProcs);
  MPI_Comm_rank(mpi_comm, &ncclProc);
@@ -1340,11 +1400,13 @@ testResult_t run() {
    int rank = proc*nThreads*nGpus+i;
    cudaDeviceProp prop;
    CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
-    char busIdStr[] = "00000000:00:00.0";
-    CUDACHECK(cudaDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev));
-    len += snprintf(line+len, MAX_LINE>len ? MAX_LINE-len : 0, "#   Rank %2d Pid %6d on %10s device %2d [%s] %s\n",
-      rank, getpid(), hostname, cudaDev, busIdStr, prop.name);
-	  maxMem = std::min(maxMem, prop.totalGlobalMem);
+    //char busIdStr[] = "00000000:00:00.0";
+    //CUDACHECK(cudaDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev));
+    //len += snprintf(line+len, MAX_LINE>len ? MAX_LINE-len : 0, "#  Rank %2d Group %2d Pid %6d on %10s device %2d [%04x:%s:%02x] %s\n",
+    //                rank, color, getpid(), hostname, cudaDev, prop.pciDomainID, busIdStr, prop.pciDeviceID, prop.name);
+    len += snprintf(line+len, MAX_LINE>len ? MAX_LINE-len : 0, "#  Rank %2d Group %2d Pid %6d on %10s device %2d [%04x:%02x:%02x] %s\n",
+                    rank, color, getpid(), hostname, cudaDev, prop.pciDomainID, prop.pciBusID, prop.pciDeviceID, prop.name);
+    maxMem = std::min(maxMem, prop.totalGlobalMem);
  }
 #if MPI_SUPPORT
  char *lines = (proc == 0) ? (char *)malloc(totalProcs*MAX_LINE) : NULL;
@@ -1376,11 +1438,11 @@ testResult_t run() {
  MPI_Barrier(MPI_COMM_WORLD); // Ensure Bcast is complete for HCOLL
 #endif

-  int gpus[nGpus*nThreads];
-  cudaStream_t streams[nGpus*nThreads];
-  void* sendbuffs[nGpus*nThreads];
-  void* recvbuffs[nGpus*nThreads];
-  void* expected[nGpus*nThreads];
+  std::vector<int> gpus(nGpus*nThreads);
+  std::vector<cudaStream_t> streams(nGpus*nThreads);
+  std::vector<void*> sendbuffs(nGpus*nThreads);
+  std::vector<void*> recvbuffs(nGpus*nThreads);
+  std::vector<void*> expected(nGpus*nThreads);
  size_t sendBytes, recvBytes;

  ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)ncclProcs*nGpus*nThreads);
@@ -1390,11 +1452,11 @@ testResult_t run() {
  for (int i=0; i<nGpus*nThreads; i++) {
    gpus[i] = ((gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i)%numDevices;
    CUDACHECK(cudaSetDevice(gpus[i]));
-    TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes));
+    TESTCHECK(AllocateBuffs(sendbuffs.data()+i, sendBytes, recvbuffs.data()+i, recvBytes, expected.data()+i, (size_t)maxBytes));
    if (streamnull)
      streams[i] = NULL;
    else
-      CUDACHECK(cudaStreamCreateWithFlags(streams+i, cudaStreamNonBlocking));
+      CUDACHECK(cudaStreamCreateWithFlags(streams.data()+i, cudaStreamNonBlocking));
  }

  //if parallel init is not selected, use main thread to initialize NCCL
@@ -1405,7 +1467,7 @@ testResult_t run() {
 #endif
  if (!parallel_init) {
     if (ncclProcs == 1) {
-       NCCLCHECK(ncclCommInitAll(comms, nGpus*nThreads, gpus));
+       NCCLCHECK(ncclCommInitAll(comms, nGpus*nThreads, gpus.data()));
     } else {
       NCCLCHECK(ncclGroupStart());
       for (int i=0; i<nGpus*nThreads; i++) {
@@ -1418,17 +1480,17 @@ testResult_t run() {
     sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*nThreads*nGpus) : NULL;
     recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*nThreads*nGpus) : NULL;
     for (int i=0; i<nGpus*nThreads; i++) {
-       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], sendbuffs[i], sendBytes, &sendRegHandles[i]));
-       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], recvbuffs[i], recvBytes, &recvRegHandles[i]));
+       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], &sendbuffs[i], maxBytes, &sendRegHandles[i]));
+       if (local_register) NCCLCHECK(ncclCommRegister(comms[i], &recvbuffs[i], maxBytes, &recvRegHandles[i]));
     }
 #endif
  }

-  int errors[nThreads];
-  double bw[nThreads];
+  std::vector<int> errors(nThreads);
+  std::vector<double> bw(nThreads);
  double* delta;
  CUDACHECK(hipHostMalloc(&delta, sizeof(double)*nThreads*NUM_BLOCKS, cudaHostAllocPortable | cudaHostAllocMapped));
-  int bw_count[nThreads];
+  std::vector<int> bw_count(nThreads);
  for (int t=0; t<nThreads; t++) {
    bw[t] = 0.0;
    errors[t] = bw_count[t] = 0;
@@ -1453,8 +1515,8 @@ testResult_t run() {
  }
  Reporter reporter(output_file, output_format);

-  struct testThread threads[nThreads];
-  memset(threads, 0, sizeof(struct testThread)*nThreads);
+  std::vector<testThread> threads(nThreads);
+  memset(threads.data(), 0, sizeof(struct testThread)*nThreads);

  for (int t=nThreads-1; t>=0; t--) {
    threads[t].args.minbytes=minBytes;
@@ -1469,26 +1531,26 @@ testResult_t run() {
    threads[t].args.nThreads=nThreads;
    threads[t].args.thread=t;
    threads[t].args.nGpus=nGpus;
-    threads[t].args.gpus=gpus+t*nGpus;
-    threads[t].args.sendbuffs = sendbuffs+t*nGpus;
-    threads[t].args.recvbuffs = recvbuffs+t*nGpus;
-    threads[t].args.expected = expected+t*nGpus;
+    threads[t].args.gpus=gpus.data()+t*nGpus;
+    threads[t].args.sendbuffs = sendbuffs.data()+t*nGpus;
+    threads[t].args.recvbuffs = recvbuffs.data()+t*nGpus;
+    threads[t].args.expected = expected.data()+t*nGpus;
    threads[t].args.ncclId = ncclId;
    threads[t].args.comms=comms+t*nGpus;
-    threads[t].args.streams=streams+t*nGpus;
+    threads[t].args.streams=streams.data()+t*nGpus;
    threads[t].args.enable_out_of_place=enable_out_of_place;
    threads[t].args.enable_cache_flush = enable_cache_flush;
    threads[t].args.enable_rotating_tensor = enable_rotating_tensor;
-    threads[t].args.errors=errors+t;
-    threads[t].args.bw=bw+t;
-    threads[t].args.bw_count=bw_count+t;
+    threads[t].args.errors=errors.data()+t;
+    threads[t].args.bw=bw.data()+t;
+    threads[t].args.bw_count=bw_count.data()+t;

    threads[t].args.reportErrors = datacheck;
    threads[t].args.reporter = &reporter;

    threads[t].func = parallel_init ? threadInit : threadRunTests;
    if (t)
-      TESTCHECK(threadLaunch(threads+t));
+      TESTCHECK(threadLaunch(threads.data()+t));
    else
      TESTCHECK(threads[t].func(&threads[t].args));
  }
@@ -92,7 +92,7 @@ struct testColl {
  void (*getCollByteCount)(
      size_t *sendcount, size_t *recvcount, size_t *paramcount,
      size_t *sendInplaceOffset, size_t *recvInplaceOffset,
-      size_t count, int nranks);
+      size_t count, size_t eltSize, int nranks);
  testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type,
      ncclRedOp_t op, int root, int rep, int in_place);
  void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
@@ -110,7 +110,7 @@ class Reporter {
  public:
    Reporter(std::string fileName, std::string outputFormat);
    ~Reporter() { if (_outputValid) { _out.close(); } };
-    void setParameters(const char* name, const char* typeName, const char* opName);// {
+    void setParameters(const size_t numCycle, const char* name, const char* typeName, const char* opName);
    void addResult(int gpusPerRank, int ranksPerNode, int totalRanks, size_t numBytes, int inPlace, double timeUsec, double algBw, double busBw, int64_t wrongElts = -1);

  private:
@@ -121,6 +121,7 @@ class Reporter {
    bool _outputValid = false;
    std::ofstream _out;
    std::string _outputFormat;
+    size_t _numCycle = 0;
    std::string _collectiveName;
    std::string _typeName;
    std::string _opName;
@@ -294,7 +295,7 @@ extern ncclRedOp_t test_ops[];
 extern const char *test_opnames[];

 static int ncclstringtotype(char *str) {
-    for (int t=0; t<ncclNumTypes; t++) {
+    for (int t=0; t<test_typenum; t++) {
      if (strcmp(str, test_typenames[t]) == 0) {
        return t;
      }
@@ -8,12 +8,12 @@
 #include "cuda_runtime.h"
 #include "common.h"

-void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
-  *sendcount = count/nranks;
-  *recvcount = (count/nranks)*nranks;
-  *sendInplaceOffset = count/nranks;
+void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  *sendcount = (count/nranks) & -(16/eltSize);
+  *recvcount = (*sendcount)*nranks;
+  *sendInplaceOffset = *sendcount;
  *recvInplaceOffset = 0;
-  *paramcount = count/nranks;
+  *paramcount = *sendcount;
 }

 testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
@@ -74,7 +74,7 @@ struct testColl gatherTest = {

 void GatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  GatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  GatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }

 testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
@@ -10,8 +10,8 @@

 #define ALIGN 4

-void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
-  size_t base = (count/(ALIGN*nranks))*ALIGN;
+void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  size_t base = (count/nranks) & -(16/eltSize);
  *sendcount = base;
  *recvcount = base*nranks;
  *sendInplaceOffset = base;
@@ -78,7 +78,7 @@ struct testColl hyperCubeTest = {

 void HyperCubeGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  HyperCubeGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  HyperCubeGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }

 testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
@@ -8,7 +8,7 @@
 #include "cuda_runtime.h"
 #include "common.h"

-void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
  *sendcount = count;
  *recvcount = count;
  *sendInplaceOffset = 0;
@@ -55,7 +55,7 @@ struct testColl reduceTest = {

 void ReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  ReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  ReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }

 testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
@@ -8,10 +8,8 @@
 #include "cuda_runtime.h"
 #include "common.h"

-#define ALIGN 4
-
-void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
-  size_t base = (count/(ALIGN*nranks))*ALIGN;
+void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  size_t base = (count/nranks) & -(16/eltSize);
  *sendcount = base*nranks;
  *recvcount = base;
  *sendInplaceOffset = 0;
@@ -60,7 +58,7 @@ struct testColl reduceScatterTest = {

 void ReduceScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  ReduceScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  ReduceScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }

 testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
@@ -8,12 +8,12 @@
 #include "cuda_runtime.h"
 #include "common.h"

-void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
-  *sendcount = (count/nranks)*nranks;
-  *recvcount = count/nranks;
+void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  *recvcount = (count/nranks) & -(16/eltSize);
+  *sendcount = (*recvcount)*nranks;
  *sendInplaceOffset = 0;
-  *recvInplaceOffset = count/nranks;
-  *paramcount = count/nranks;
+  *recvInplaceOffset = *recvcount;
+  *paramcount = *recvcount;
 }

 testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
@@ -70,7 +70,7 @@ struct testColl scatterTest = {

 void ScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  ScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  ScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }

 testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
@@ -8,7 +8,7 @@
 #include "cuda_runtime.h"
 #include "common.h"

-void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
+void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
  *sendcount = count;
  *recvcount = count;
  *sendInplaceOffset = 0;
@@ -69,7 +69,7 @@ struct testColl sendRecvTest = {

 void SendRecvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
-  SendRecvGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
+  SendRecvGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
 }

 testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
@@ -0,0 +1,29 @@
+
+if (DEFINED ENV{ROCM_PATH})
+  set(rocm_bin "$ENV{ROCM_PATH}/bin")
+else()
+  set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to the ROCm installation.")
+  set(rocm_bin "/opt/rocm/bin")
+endif()
+
+if (NOT DEFINED ENV{CXX})
+  set(CMAKE_CXX_COMPILER "${rocm_bin}/amdclang++" CACHE PATH "Path to the C++ compiler")
+else()
+  set(CMAKE_CXX_COMPILER "$ENV{CXX}" CACHE PATH "Path to the C++ compiler")
+endif()
+
+if (NOT DEFINED ENV{CXXFLAGS})
+  set(CMAKE_CXX_FLAGS_DEBUG "-g -O1")
+  set(CMAKE_CXX_FLAGS_RELEASE "-O3")
+endif()
+
+if (NOT DEFINED ENV{CC})
+  set(CMAKE_C_COMPILER "${rocm_bin}/amdclang" CACHE PATH "Path to the C compiler")
+else()
+  set(CMAKE_C_COMPILER "$ENV{CC}" CACHE PATH "Path to the C compiler")
+endif()
+
+if (NOT DEFINED ENV{CFLAGS})
+  set(CMAKE_C_FLAGS_DEBUG "-g -O1")
+  set(CMAKE_C_FLAGS_RELEASE "-O3")
+endif()
@@ -1,6 +1,6 @@
 #
 # Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
-# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+# Modifications are Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -13,13 +13,13 @@ BUILDDIR := $(abspath ../../build)
 DST_DIR := $(BUILDDIR)/test/verifiable

 ROCM_PATH ?= /opt/rocm
-MPI_HOME ?= /usr/lib/openmpi
+MPI_HOME ?= /usr/lib/x86_64-linux-gnu
 PREFIX ?= /usr/local
 VERBOSE ?= 0
 DEBUG ?= 0
 NCCL_HOME ?= ""

-HIPCC = $(ROCM_PATH)/bin/hipcc
+HIPCC = $(ROCM_PATH)/bin/amdclang++
 CXX = $(HIPCC)

 HIPCUFLAGS := -std=c++14
@@ -28,12 +28,13 @@ HIPLDFLAGS :=

 ifneq ($(NCCL_HOME), "")
 HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
-HIPLDFLAGS   += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME)
+HIPLDFLAGS   += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME) -L$(NCCL_HOME)/lib
 endif
+
 HIPCUFLAGS += -I$(ROCM_PATH)/include
 HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
 LDFLAGS    += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
-HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
+HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lamdhip64 -lstdc++ -lrt

 ifeq ($(DEBUG), 0)
 HIPCUFLAGS += -O3
@@ -46,15 +47,15 @@ ifeq ($(VERBOSE), 0)
 endif

 ifeq ($(MPI), 1)
-HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi
-HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi
+HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/openmpi -I${MPI_HOME}/openmpi/include -I${MPI_HOME}/openmpi/include/openmpi
+HIPLDFLAGS += -L${MPI_HOME}/lib -L${MPI_HOME}/openmpi/lib -lmpi
 else ifeq ($(MPICH), 1)
-HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich -I/usr/include/x86_64-linux-gnu/mpich
-HIPLDFLAGS += -L/usr/lib -lmpich
+HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/mpich/include -I/usr/include/x86_64-linux-gnu/mpich
+HIPLDFLAGS += -L${MPI_HOME}/lib -L${MPI_HOME}/mpich/lib -lmpich
 endif

 LIBRARIES += rccl
-HIPLDFLAGS   += $(LIBRARIES:%=-l%)
+HIPLDFLAGS += $(LIBRARIES:%=-l%)

 all: $(DST_DIR)/verifiable.o $(DST_DIR)/self_test