Add 'projects/rccl-tests/' from commit '6405c76e6826663bbb67bd40aeee8c70aa5d3094'

git-subtree-dir: projects/rccl-tests git-subtree-mainline: 42d84317cf git-subtree-split: 6405c76e68
2025-12-11 20:46:38 +00:00
@@ -0,0 +1,6 @@
+* @wenkaidu @gilbertlee-amd @PedramAlizadeh @nusislam @nileshnegi @KawtharShafie @AtlantaPepsi @mberenjk @corey-derochie-amd @mustafabar @thananon @JhaShweta1 @BertanDogancay @rahulvaidya20 @isaki001 @PJAvinash @AbandiGa @Nikhil-Nunna @haripriya-amd @atulkulk @alex-breslow-amd @ddebonis-amd @amd-mengshwu @Kapil-Shyam-Pawar @weilewei @nawrinsu @speriaswamy-amd
+
+# Documentation files
+doc/ @ROCm/rocm-documentation
+*.md @ROCm/rocm-documentation
+*.rst @ROCm/rocm-documentation
@@ -0,0 +1,16 @@
+## Details
+___Do not mention proprietary info or link to internal work items in this PR.___
+
+**Work item:** _"Internal", or link to GitHub issue (if applicable)._
+
+**What were the changes?**  
+_One sentence describing the work done._
+
+**Why were the changes made?**  
+_Explain the motivation behind the work. Provide any publicly-available historical context._
+
+**How was the outcome achieved?**  
+_Technical details behind the work. Explain any publicly-available hardware peculiarities._
+
+**Additional Documentation:**  
+_What else should the reviewer know?_
@@ -0,0 +1,7 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENCE.txt for license information
+build/
+*.gcov
+/coverage/
+__pycache__/
@@ -0,0 +1,43 @@
+// This file is for internal AMD use.
+// If you are interested in running your own Jenkins, please raise a github issue for assistance.
+
+// Builds RCCL from source and then builds rccl-tests against that RCCL build.
+//
+// platform : object whose runCommand(...) executes the generated bash script.
+// project  : object exposing paths.build_prefix / paths.project_build_prefix and
+//            paths.construct_build_prefix().
+// jobName  : name of the Jenkins job; kept for interface compatibility, it is not
+//            used when composing the script.
+def runCompileCommand(platform, project, jobName)
+{
+    project.paths.construct_build_prefix()
+
+    // The script clones RCCL under build_prefix, runs its install.sh, then runs this
+    // project's install.sh with --rccl_home pointing at the RCCL release build directory.
+    // NOTE(review): the first exitIfNotSuccess() expansion comes after `cd ../..`; if the
+    // helper inspects the last exit status it would see the `cd`, not `./install.sh -l`.
+    // Confirm against the rocJENKINS implementation before reordering.
+    def command = """#!/usr/bin/env bash
+                set -x
+                cd ${project.paths.build_prefix}
+                git clone --recursive https://github.com/ROCm/rccl.git
+                cd rccl
+                ./install.sh -l
+                cd ../..
+                ${auxiliary.exitIfNotSuccess()}
+                
+                cd ${project.paths.project_build_prefix}
+                export RCCL_DIR=\$(pwd)/../rccl/build/release
+                ./install.sh --rccl_home \$RCCL_DIR
+                ${auxiliary.exitIfNotSuccess()}
+            """
+
+    platform.runCommand(this, command)
+}
+
+// Runs the pytest suite from the project build directory and writes a JUnit XML report.
+//
+// platform : object providing jenkinsLabel and runCommand(...).
+// project  : object exposing paths.project_build_prefix.
+def runTestCommand (platform, project)
+{
+    // Evaluated as in the original flow; the value is not referenced by the script below.
+    String sudo = auxiliary.sudo(platform.jenkinsLabel)
+
+    // pytest is upgraded first, then run with MPI, host and fine tests deselected via -k.
+    def testScript = """#!/usr/bin/env bash
+                set -x
+                cd ${project.paths.project_build_prefix}
+                python3 -m pip install --upgrade pytest
+                python3 -m pytest --version
+                python3 -m pytest -k "not MPI and not host and not fine" --verbose --junitxml=./testreport.xml
+            """
+
+    platform.runCommand(this, testScript)
+}
+
+return this
@@ -0,0 +1,64 @@
+#!/usr/bin/env groovy
+// This shared library is available at https://github.com/ROCm/rocJENKINS/
+@Library('rocJenkins@pong') _
+
+// This file is for internal AMD use.
+// If you are interested in running your own Jenkins, please raise a github issue for assistance.
+
+import com.amd.project.*
+import com.amd.docker.*
+import java.nio.file.Path
+
+// Closure that runs the static-analysis build of rccl-tests for one Jenkins job.
+//   nodeDetails : node description forwarded to dockerNodes
+//   jobName     : name of the job being run
+def runCI = { nodeDetails, jobName ->
+
+    def rcclTestsProject = new rocProject('rccl-tests', 'StaticAnalysis')
+
+    // Define test architectures, optional rocm version argument is available
+    def dockerNodeSet = new dockerNodes(nodeDetails, jobName, rcclTestsProject)
+
+    // Format checking is off and static analysis is on for this pipeline.
+    boolean runFormatCheck = false
+    boolean runStaticAnalysis = true
+
+    // The three null arguments are positional slots left unset by this job
+    // (presumably the compile/test/package commands -- TODO confirm against rocJENKINS).
+    buildProject(rcclTestsProject, runFormatCheck, dockerNodeSet.dockerArray, null, null, null, runStaticAnalysis)
+}
+
+// Pipeline entry point: picks the job properties and node set matching the Jenkins job
+// this build belongs to, then runs runCI for it.
+ci: { 
+    // Job name derived from the build URL; used as the lookup key below.
+    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
+
+    // Per-job Jenkins properties. cron('0 1 * * 0') = minute 0, hour 1, day-of-week 0.
+    def propertyList = [
+        "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]
+    ]
+    // NOTE(review): presumably merges in shared/default entries -- confirm in rocJENKINS.
+    propertyList = auxiliary.appendPropertyList(propertyList)
+
+    // Per-job node description passed to runCI as nodeDetails.
+    def jobNameList = [
+        "compute-rocm-dkms-no-npi-hipclang":([ubuntu22:['cpu']])
+    ]
+    // NOTE(review): presumably merges in shared/default entries -- confirm in rocJENKINS.
+    jobNameList = auxiliary.appendJobNameList(jobNameList)
+
+    // Apply the properties of the entry whose key equals the current job name.
+    propertyList.each
+    {
+        jobName, property->
+        if (urlJobName == jobName)
+            properties(auxiliary.addCommonProperties(property))
+    }
+
+    // Run CI in a stage named after the job for the entry whose key equals the current job name.
+    jobNameList.each
+    {
+        jobName, nodeDetails->
+        if (urlJobName == jobName)
+            stage(jobName) {
+                runCI(nodeDetails, jobName)
+            }
+    }
+
+    // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
+    // Fallback: same cron trigger and node description as the listed job above.
+    if(!jobNameList.keySet().contains(urlJobName))
+    {
+        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * 0')])]))
+        stage(urlJobName) {
+            runCI([ubuntu22:['cpu']], urlJobName)
+        }
+    }
+}
@@ -0,0 +1,210 @@
+# Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
+
+# CMake version minimum requirements
+#==================================================================================================
+cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
+
+# CMake Toolchain file to define compilers and path to ROCm
+# Only applied when the caller has not already provided CMAKE_TOOLCHAIN_FILE. This is done
+# before project() below, which is where the languages get enabled.
+#==================================================================================================
+if (NOT CMAKE_TOOLCHAIN_FILE)
+  set(CMAKE_TOOLCHAIN_FILE "${CMAKE_CURRENT_SOURCE_DIR}/toolchain-linux.cmake")
+  message(STATUS "CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}")
+endif()
+
+# RCCL Tests project
+#==================================================================================================
+project(rccl-tests LANGUAGES CXX)
+
+# Build options
+#==================================================================================================
+option(USE_MPI                     "Build RCCL-tests with MPI support."           OFF)
+option(BUILD_LOCAL_GPU_TARGET_ONLY "Build only for GPUs detected on this machine" OFF)
+
+# Default to a Release build when the caller did not choose a build type.
+if (NOT CMAKE_BUILD_TYPE)
+  message(WARNING "CMAKE_BUILD_TYPE is not defined. Setting to Release")
+  set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Default build type")
+endif()
+
+# Default GPU architectures to build
+#==================================================================================================
+set(DEFAULT_GPUS
+      gfx906
+      gfx908
+      gfx90a
+      gfx942
+      gfx950
+      gfx1030
+      gfx1100
+      gfx1101
+      gfx1102
+      gfx1200
+      gfx1201)
+
+# Get additional packages required
+include(CheckIncludeFiles)
+include(CheckSymbolExists)
+include(cmake/Dependencies.cmake) # rocm-cmake, rocm_local_targets
+include(cmake/CheckSymbolExistsNoWarn.cmake)
+
+# Build only for local GPU architecture
+# rocm_local_targets is called with DEFAULT_GPUS as its argument (presumably the output variable,
+# replacing the list above with the detected targets -- TODO confirm against rocm-cmake).
+if (BUILD_LOCAL_GPU_TARGET_ONLY)
+  message(STATUS "Building only for local GPU target")
+  if (COMMAND rocm_local_targets)
+    rocm_local_targets(DEFAULT_GPUS)
+  else()
+    message(WARNING "Unable to determine local GPU targets. Falling back to default GPUs.")
+  endif()
+endif()
+
+# Determine which GPU architectures to build for
+# A GPU_TARGETS value already in the cache (e.g. from -DGPU_TARGETS=...) is kept, because
+# set(... CACHE ...) without FORCE does not overwrite an existing cache entry.
+set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if GPU_TARGETS is not defined.")
+
+# Check if clang compiler can offload to GPU_TARGETS
+# NOTE(review): the fallback branch uses DEFAULT_GPUS, not GPU_TARGETS, so a user-supplied
+# GPU_TARGETS is not carried over when rocm_check_target_ids is unavailable.
+if (COMMAND rocm_check_target_ids)
+  message(STATUS "Checking for ROCm support for GPU targets: " "${GPU_TARGETS}")
+  rocm_check_target_ids(SUPPORTED_GPUS TARGETS ${GPU_TARGETS})
+else()
+  message(WARNING "Unable to check for supported GPU targets. Falling back to default GPUs.")
+  set(SUPPORTED_GPUS ${DEFAULT_GPUS})
+endif()
+
+# Normal (non-cache) variable holding the final target list used for the rest of the configure.
+set(GPU_TARGETS "${SUPPORTED_GPUS}")
+message(STATUS "Compiling for ${GPU_TARGETS}")
+
+## NOTE: Reload rocm-cmake in order to update GPU_TARGETS
+include(cmake/Dependencies.cmake) # Reloading to use desired GPU_TARGETS instead of defaults
+
+# Try to establish ROCM_PATH (for find_package)
+# NOTE(review): cmake/Dependencies.cmake is included before this point; confirm whether it
+# needs ROCM_PATH to be set at that time.
+#==================================================================================================
+if(NOT DEFINED ROCM_PATH)
+  # Guess default location
+  set(ROCM_PATH "/opt/rocm")
+  message(WARNING "Unable to find ROCM_PATH: Falling back to ${ROCM_PATH}")
+else()
+  message(STATUS "ROCM_PATH found: ${ROCM_PATH}")
+endif()
+# Export to the environment of this configure run.
+set(ENV{ROCM_PATH} ${ROCM_PATH})
+
+# Select the compiler name plus the grep/awk snippets used later to extract its version.
+# The amdclang++ pattern is tested first because the generic clang++ pattern also matches it.
+if("${CMAKE_CXX_COMPILER}" MATCHES ".*amdclang\\+\\+")
+  message(STATUS "Compiling with amdclang++")
+  set(COMPILER_EXE_NAME amdclang++)
+  set(COMPILER_GREP_STRING "AMD clang version")
+  set(COMPILER_AWK_CMD "awk -F\" \" '{ printf $4}'")
+elseif("${CMAKE_CXX_COMPILER}" MATCHES ".*clang\\+\\+")
+  message(STATUS "Compiling with clang++")
+  set(COMPILER_EXE_NAME clang++)
+  set(COMPILER_GREP_STRING "AMD clang version")
+  set(COMPILER_AWK_CMD "awk -F\" \" '{ printf $4}'")
+elseif("${CMAKE_CXX_COMPILER}" MATCHES ".*hipcc$")
+  message(STATUS "Compiling with hipcc")
+  set(COMPILER_EXE_NAME hipcc)
+  set(COMPILER_GREP_STRING "HIP version")
+  # Takes the third space-separated field, then keeps the part before the first '-'.
+  set(COMPILER_AWK_CMD "awk -F\" \" '{ printf $3}' | awk -F\"-\" '{ printf $1}'")
+else()
+  message(FATAL_ERROR "RCCL-Tests can be built only with hipcc or amdclang++")
+endif()
+
+# Set CMAKE flags
+# Install and packaging prefixes default to ROCM_PATH (cache entries, so a user value wins).
+#==================================================================================================
+set(CMAKE_INSTALL_PREFIX "${ROCM_PATH}" CACHE PATH "")
+set(CMAKE_CXX_STANDARD   14)   # We use C++14 features, this will add compile option: -std=c++14
+set(CMAKE_CXX_EXTENSIONS OFF)  # Without this line, it will add -std=gnu++14 instead, which has some issues.
+set(CPACK_PACKAGING_INSTALL_PREFIX "${ROCM_PATH}" CACHE PATH "Path to install to when packaged.")
+if(ROCM_PATH)
+  #list(APPEND CMAKE_PREFIX_PATH  # Temporary workaround
+  list(PREPEND CMAKE_PREFIX_PATH  # Add ROCM_PATH to CMake search paths (for finding HIP / HSA)
+              ${ROCM_PATH}
+              ${ROCM_PATH}/hip
+              ${ROCM_PATH}/llvm)
+endif()
+
+# Check for required dependencies
+#==================================================================================================
+## Check for Threads
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+find_package(Threads REQUIRED)
+
+## Check for HIP
+find_package(hip REQUIRED)
+message(STATUS "HIP compiler:     ${HIP_COMPILER}")
+message(STATUS "HIP runtime:      ${HIP_RUNTIME}")
+if(NOT "${HIP_COMPILER}" MATCHES "clang")
+  message(FATAL_ERROR "RCCL requires clang-based compiler (amdclang++ or hipcc)")
+endif()
+
+## Check for compiler version
+# Runs `<compiler> --version`, keeps the line matching COMPILER_GREP_STRING and extracts the
+# version field with COMPILER_AWK_CMD. The result is only printed below.
+# NOTE(review): the find_program result is not checked before it is used in the command.
+find_program(compiler_executable ${COMPILER_EXE_NAME})
+message(STATUS "${COMPILER_EXE_NAME} executable: ${compiler_executable}")
+execute_process(
+  COMMAND         bash "-c" "${compiler_executable} --version | grep \"${COMPILER_GREP_STRING}\" | ${COMPILER_AWK_CMD}"
+  OUTPUT_VARIABLE compiler_version_string)
+message(STATUS "${COMPILER_EXE_NAME} version:    ${compiler_version_string}")
+
+## Check for HIP version
+# Runs `hipconfig -v` and keeps the part before the first '-'. The result is only printed below.
+find_program(hipconfig_executable hipconfig)
+message(STATUS "hipconfig executable: ${hipconfig_executable}")
+execute_process(
+  COMMAND         bash "-c" "${hipconfig_executable} -v | awk -F\"-\" '{ printf $1 }'"
+  OUTPUT_VARIABLE hip_version_string)
+message(STATUS "${COMPILER_EXE_NAME} HIP version:    ${hip_version_string}")
+
+## Check for ROCm version
+# Resolution order:
+#   1. EXPLICIT_ROCM_VERSION, when set to a non-empty value;
+#   2. the contents of ${ROCM_PATH}/.info/version, when that file exists;
+#   3. otherwise configuration stops with a FATAL_ERROR.
+# On success ROCM_MAJOR_VERSION / ROCM_MINOR_VERSION / ROCM_PATCH_VERSION are set and
+# ROCM_VERSION (major*10000 + minor*100 + patch) is passed to the compiler as -DROCM_VERSION.
+set(EXPLICIT_ROCM_VERSION "" CACHE STRING "Explicit ROCM version to compile to (auto detect if empty)")
+if(EXPLICIT_ROCM_VERSION)
+  set(rocm_version_string "${EXPLICIT_ROCM_VERSION}")
+elseif(ROCM_PATH AND EXISTS "${ROCM_PATH}/.info/version")
+  # The EXISTS guard makes the FATAL_ERROR branch below reachable: a ROCM_PATH that is not a
+  # valid installation now reports the message that names the fix, instead of failing in file(READ).
+  message(STATUS "Reading ROCM version from ${ROCM_PATH}/.info/version")
+  file(READ "${ROCM_PATH}/.info/version" rocm_version_string)
+else()
+  message(FATAL_ERROR "Could not determine ROCM version (set EXPLICIT_ROCM_VERSION or set ROCM_PATH to a valid installation)")
+endif()
+# The input is quoted so that an empty version string reaches the warning branch below
+# rather than making string(REGEX MATCH) fail for lack of an input argument.
+string(REGEX MATCH "([0-9]+)\\.([0-9]+)\\.([0-9]+)" rocm_version_matches "${rocm_version_string}")
+if (rocm_version_matches)
+  set(ROCM_MAJOR_VERSION ${CMAKE_MATCH_1})
+  set(ROCM_MINOR_VERSION ${CMAKE_MATCH_2})
+  set(ROCM_PATCH_VERSION ${CMAKE_MATCH_3})
+
+  message(STATUS "ROCm version: ${ROCM_MAJOR_VERSION}.${ROCM_MINOR_VERSION}.${ROCM_PATCH_VERSION}")
+
+  # Convert the version components to int for comparison
+  math(EXPR ROCM_VERSION "(10000 * ${ROCM_MAJOR_VERSION}) + (100 * ${ROCM_MINOR_VERSION}) + ${ROCM_PATCH_VERSION}")
+  add_definitions("-DROCM_VERSION=${ROCM_VERSION}")
+else()
+  # ROCM_VERSION stays undefined in this case; the build continues.
+  message(WARNING "Failed to extract ROCm version.")
+endif()
+
+## Check for RCCL
+# Config-mode lookup only: an RCCL package config file must be discoverable through
+# CMAKE_PREFIX_PATH or ROCM_PATH.
+find_package(RCCL CONFIG REQUIRED HINTS "${CMAKE_PREFIX_PATH}" PATHS "${ROCM_PATH}")
+if (RCCL_FOUND)
+    message(STATUS "RCCL version : ${RCCL_VERSION}")
+    message(STATUS "RCCL include path : ${RCCL_INCLUDE_DIRS}")
+    message(STATUS "RCCL libraries : ${RCCL_LIBRARIES}")
+endif()
+
+## Check for MPI (if enabled)
+# When USE_MPI is ON, MPI is required and the MPI_SUPPORT compile definition is added.
+# NOTE(review): because of REQUIRED, the "no MPI library found" branch looks unreachable.
+if (USE_MPI)
+    find_package(MPI REQUIRED)
+    if (MPI_FOUND)
+        message(STATUS "MPI include path : ${MPI_CXX_INCLUDE_PATH}")
+        message(STATUS "MPI libraries : ${MPI_CXX_LIBRARIES}")
+        add_definitions(-DMPI_SUPPORT)
+    else()
+        message ("-- no MPI library found")
+    endif()
+else()
+    message ("-- MPI support disabled")
+endif()
+
+set(ROCM_USE_DEV_COMPONENT OFF)  # This repo doesn't have a dev component
+
+# Add all of the tests
+add_subdirectory(src)
+
+# Project version passed to rocm_setup_version.
+rocm_setup_version(VERSION "2.14.1")
+
+# Create ROCm standard packages
+rocm_create_package(
+    NAME rccl-tests
+    DESCRIPTION "Tests for the ROCm Communication Collectives Library"
+    MAINTAINER "RCCL Maintainer <rccl-maintainer@amd.com>"
+)
@@ -0,0 +1,28 @@
+
+ Copyright (c) 2016-2017, NVIDIA CORPORATION.  All rights reserved.
+ Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+  * Neither the name of NVIDIA CORPORATION, nor the names of their
+    contributors may be used to endorse or promote products derived
+    from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
@@ -0,0 +1,23 @@
+#
+# Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENCE.txt for license information
+#
+
+# Output directory; defaults to ./build and is always converted to an absolute path so the
+# same location is used by the sub-make invoked with -C below.
+BUILDDIR ?= build
+override BUILDDIR := $(abspath $(BUILDDIR))
+
+# `default` is listed as well so that a stray file of that name cannot mask the default goal.
+.PHONY: all clean default
+
+# First rule in the file, hence the goal of a plain `make`.
+default: src.build
+
+# Sub-directories that carry their own Makefile.
+TARGETS=src
+
+all:   ${TARGETS:%=%.build}
+clean: ${TARGETS:%=%.clean}
+
+# <dir>.build / <dir>.clean run the `build` / `clean` goal of <dir>'s Makefile ($* is the stem).
+%.build:
+	${MAKE} -C $* build BUILDDIR=${BUILDDIR}
+
+%.clean:
+	${MAKE} -C $* clean BUILDDIR=${BUILDDIR}
@@ -0,0 +1,66 @@
+Notices and Licenses file
+_______________________________________________________________
+
+Dependencies on nvidia-nccl-tests v2.0.0 (BSD3)
+Copyright (c) 2016-2017, NVIDIA CORPORATION.
+Modifications Copyright (c) 2019 Advanced Micro Devices, Inc.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+ * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+ * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
+   Laboratory, the U.S. Department of Energy, nor the names of their
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The U.S. Department of Energy funded the development of this software
+under subcontract 7078610 with Lawrence Berkeley National Laboratory.
+
+
+nvidia-nccl-tests v2.0.0 (BSD2)
+Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+ * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+ * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
+   Laboratory, the U.S. Department of Energy, nor the names of their
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The U.S. Department of Energy funded the development of this software
+under subcontract 7078610 with Lawrence Berkeley National Laboratory.
@@ -0,0 +1,189 @@
+# RCCL Tests
+
+These tests check both the performance and the correctness of RCCL operations. They can be compiled against [RCCL](https://github.com/ROCm/rccl).
+
+## Build
+
+To build the tests, just type `make` or `make -j`
+
+If HIP is not installed in `/opt/rocm`, you may specify `HIP_HOME`. Similarly, if RCCL (`librccl.so`) is not installed in `/opt/rocm/lib/`, you may specify `NCCL_HOME` and `CUSTOM_RCCL_LIB`.
+
+```shell
+$ make HIP_HOME=/path/to/hip NCCL_HOME=/path/to/rccl
+```
+
+RCCL Tests rely on MPI to work on multiple processes, hence multiple nodes.
+
+> [!TIP]
+> To compile RCCL tests with MPI support, you need to set `MPI=1` and set `MPI_HOME` to the path where MPI is installed.
+
+```shell
+$ make MPI=1 MPI_HOME=/path/to/mpi HIP_HOME=/path/to/hip NCCL_HOME=/path/to/rccl
+```
+
+RCCL Tests can also be built using cmake. A typical sequence will be:
+
+```shell
+$ mkdir build
+$ cd build
+$ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=/path/to/rocm ..
+$ make
+```
+
+When using the cmake build procedure to build RCCL-Tests against a custom/user-built `librccl.so`, please make sure that RCCL has been installed (i.e. using `make install`) and that cmake is pointed at the install location rather than the RCCL `build` directory, since cmake checks for the cmake target and config files. Installing a custom build is not strictly necessary, however: one can instead modify `LD_LIBRARY_PATH` to point to the custom/user-built `librccl.so` when running RCCL Tests.
+
+Using the cmake method also has the advantage that it automatically checks for MPI installation during the build. The tests can be compiled with MPI support by adding the `-DUSE_MPI=ON` flag to the cmake command line.
+
+> [!TIP]
+> Users can choose to link against a particular MPI library by using one of these options:
+> * setting the environment variable `MPI_HOME`.
+> * by adding the path to the MPI library to the cmake prefix path with `-DCMAKE_PREFIX_PATH`.
+> * including the paths to MPI `bin` and `lib` in the `PATH` and `LD_LIBRARY_PATH` environment variables, respectively.
+
+e.g.,
+```shell
+$ mkdir build
+$ cd build
+$ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="/path/to/mpi;/path/to/rocm" -DUSE_MPI=ON ..
+$ make
+```
+
+By default, for both Makefile and `cmake` based builds, RCCL Tests will be built for all supported GPU targets (defined in `src/Makefile` and as `DEFAULT_GPUS` in `CMakeLists.txt`).
+
+To target specific GPU(s), and potentially reduce build time, use:
+* `GPU_TARGETS` as a `,` separated string listing GPU(s) to target for Makefile based build.
+For example, to build RCCL-Tests using the Makefile only for `gfx942` and `gfx950`:
+    ```shell
+    $ GPU_TARGETS="gfx942,gfx950" make MPI=1 MPI_HOME=/path/to/mpi NCCL_HOME=/opt/rocm
+    ```
+* `-DGPU_TARGETS` as a `;` separated string listing GPU(s) to target for `cmake` based build.
+For example, to build RCCL-Tests using CMake for `gfx90a`, `gfx942` and `gfx1200`:
+    ```shell
+    $ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="/path/to/mpi;/path/to/rocm" -DUSE_MPI=ON -DGPU_TARGETS="gfx90a;gfx942;gfx1200;" ..
+    ```
+* For CMake builds, there is also the `-DBUILD_LOCAL_GPU_TARGET_ONLY` flag, which queries and builds for the local GPU target only (similar to RCCL).
+    ```shell
+    $ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="/path/to/mpi;/path/to/rocm" -DUSE_MPI=ON -DBUILD_LOCAL_GPU_TARGET_ONLY=ON ..
+    ```
+
+`-DBUILD_LOCAL_GPU_TARGET_ONLY` will not work with `docker build`-based setups, as the docker build engine is unable to query the local GPU architecture. Please use `-DGPU_TARGETS` for CMake-based builds or `GPU_TARGETS` for Makefile-based builds when building RCCL-Tests using a Dockerfile and `docker build`.
+
+## Usage
+
+RCCL Tests can run on multiple processes, multiple threads, and multiple HIP devices per thread. The number of processes is managed by MPI and is therefore not passed to the tests as an argument. The total number of ranks (=HIP devices) will be equal to (number of processes)\*(number of threads)\*(number of GPUs per thread).
+
+### Quick examples
+
+Run on single node with 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
+
+```shell
+$ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
+```
+
+Run 64 MPI processes on nodes with 8 GPUs each, for a total of 64 GPUs spread across 8 nodes :
+(NB: The rccl-tests binaries must be compiled with `MPI=1` for this case)
+
+```shell
+$ mpirun -np 64 -N 8 ./build/all_reduce_perf -b 8 -e 8G -f 2 -g 1
+```
+
+> [!TIP]
+> For performance-oriented runs, on both single-node and multi-node, we suggest using 1 MPI process per GPU and `-g 1`. So, a run on 8 GPUs looks like :
+> ```shell
+> $ mpirun -np 8 --bind-to numa ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 1
+> ```
+> Running with 1 MPI process per GPU ensures a 1:1 mapping for CPUs and GPUs, which can be beneficial for smaller message sizes and better represents the real-world use of RCCL in Deep Learning frameworks like PyTorch and TensorFlow.
+
+### Performance
+
+See the [Performance](doc/PERFORMANCE.md) page for explanation about numbers, and in particular the "busbw" column.
+
+#### Environment variables
+On some earlier versions of ROCm (before ROCm 6.4.0), setting `HSA_NO_SCRATCH_RECLAIM=1` as part of the environment is necessary to achieve better performance on MI300 GPUs. When running without MPI, a command similar to the following one should be sufficient:
+```shell
+HSA_NO_SCRATCH_RECLAIM=1 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
+```
+
+For MPI (using MPICH), you need to use a command similar to the following:
+```shell
+mpirun.mpich -np 8 -env NCCL_DEBUG=VERSION -env HSA_NO_SCRATCH_RECLAIM=1 ./build/all_reduce_perf -b 8M -e 128M -i 8388608 -g 1 -d bfloat16
+```
+
+### Arguments
+
+All tests support the same set of arguments :
+
+* Number of GPUs
+  * `-t,--nthreads <num threads>` number of threads per process. Default : 1.
+  * `-g,--ngpus <GPUs per thread>` number of gpus per thread. Default : 1.
+* Sizes to scan
+  * `-b,--minbytes <min size in bytes>` minimum size to start with. Default : 32M.
+  * `-e,--maxbytes <max size in bytes>` maximum size to end at. Default : 32M.
+  * Increments can be either fixed or a multiplication factor. Only one of those should be used
+    * `-i,--stepbytes <increment size>` fixed increment between sizes. Default : 1M.
+    * `-f,--stepfactor <increment factor>` multiplication factor between sizes. Default : disabled.
+* RCCL operations arguments
+  * `-o,--op <sum/prod/min/max/avg/all>` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum.
+  * `-d,--datatype <nccltype/all>` Specify which datatype to use. Default : Float.
+  * `-r,--root <root/all>` Specify which root to use. Only for operations with a root like broadcast or reduce. Default : 0.
+  * `-y,--memory_type <coarse/fine/host/managed>` Default: Coarse
+  * `-u,--cumask <d0,d1,d2,d3>` Default: None
+* Performance
+  * `-n,--iters <iteration count>` number of iterations. Default : 20.
+  * `-w,--warmup_iters <warmup iteration count>` number of warmup iterations (not timed). Default : 5.
+  * `-m,--agg_iters <aggregation count>` number of operations to aggregate together in each iteration. Default : 1.
+  * `-N,--run_cycles <cycle count>` run & print each cycle. Default : 1; 0=infinite.
+  * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1.
+* Test operation
+  * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0.
+  * `-c,--check <check iteration count>` perform count iterations, checking correctness of results on each iteration. This can be quite slow on large numbers of GPUs. Default : 1.
+  * `-z,--blocking <0/1>` Make RCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
+  * `-G,--hipgraph <num graph launches>` Capture iterations as a HIP graph and then replay specified number of times. Default : 0.
+  * `-C,--report_cputime <0/1>` Report CPU time instead of latency. Default : 0.
+  * `-R,--local_register <0/1/2>` enable local (1) or symmetric (2) buffer registration on send/recv buffers. Default : 0.
+  * `-T,--timeout <time in seconds>` timeout each test after specified number of seconds. Default : disabled.
+  * `-F,--cache_flush <cache flush after every -F iteration>` Enable cache flush after every -F iteration. Default : 0 (No cache flush).
+  * `-O,--out_of_place <0=in-place only, 1=out-of-place only>`. Default: both.
+  * `-q,--delay <delay>` Delay between out-of-place and in-place runs (in microseconds). Default: 10.
+* Parsing RCCL-Tests output
+  * `-Z,--output_format <csv|json>` Parse RCCL-Tests output as a CSV or JSON. Default : disabled.
+  * `-x,--output_file <output file name>` RCCL-Tests output file name. Default : disabled.
+  * `-M,--output_algo_proto_channels <0/1>` Report Algorithm/Protocol/Channels for each message size. Default : 0.
+
+### Running multiple operations in parallel
+
+RCCL Tests allow partitioning the set of GPUs into smaller sets, each executing the same operation in parallel.
+To split the GPUs, RCCL will compute a "color" for each rank, based on the `NCCL_TESTS_SPLIT` environment variable, then all ranks
+with the same color will end up in the same group. The resulting group is printed next to each GPU at the beginning of the test.
+
+`NCCL_TESTS_SPLIT` takes the following syntax: `<operation><value>`. Operation can be `AND`, `OR`, `MOD` or `DIV`. The `&`, `|`, `%`, and `/` symbols are also supported. The value can be either decimal, hexadecimal (prefixed by `0x`) or binary (prefixed by `0b`).
+
+`NCCL_TESTS_SPLIT_MASK="<value>"` is equivalent to `NCCL_TESTS_SPLIT="&<value>"`.
+
+Here are a few examples:
+
+ - `NCCL_TESTS_SPLIT="AND 0x7"` or `NCCL_TESTS_SPLIT="MOD 8"`: On systems with 8 GPUs, run 8 parallel operations, each with 1 GPU per node (purely communicating over the inter-node network)
+
+- `NCCL_TESTS_SPLIT="OR 0x7"` or `NCCL_TESTS_SPLIT="DIV 8"`: On systems with 8 GPUs, run one operation per node, purely intra-node.
+
+- `NCCL_TESTS_SPLIT="AND 0x1"` or `NCCL_TESTS_SPLIT="MOD 2"`: Run two operations, each operation using every other rank.
+
+Note that the reported bandwidth is per group, hence to get the total bandwidth used by all groups, one must multiply by the number of groups.
+
+## Unit tests
+
+Unit tests for rccl-tests are implemented with pytest (python3 is also required). Several notes for the unit tests:
+
+1. The `LD_LIBRARY_PATH` environment variable will need to be set to include `/path/to/rccl-install/lib/` in order to run the unit tests.
+2. The `HSA_FORCE_FINE_GRAIN_PCIE` environment variable will need to be set to 1 in order to run the unit tests which use fine-grained memory type.
+
+The unit tests can be invoked within the rccl-tests root, or in the test subfolder. An example call to the unit tests:
+```shell
+$ LD_LIBRARY_PATH=/path/to/rccl-install/lib/ HSA_FORCE_FINE_GRAIN_PCIE=1 python3 -m pytest
+```
+
+## Copyright
+
+NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved.
+
+All modifications are copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
@@ -0,0 +1,40 @@
+# MIT License
+#
+# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# These overrides are due to CMake CHECK_SYMBOL_EXISTS modifying CMAKE_CXX_FLAGS to do a test compile,
+# while ROCMChecks gives a warning if this variable is modified manually without a target.
+
+# We now choose to disable ROCMChecks for this one case.
+
+# Switch consulted by the rocm_check_toolchain_var override below; OFF means the check runs.
+set(DISABLE_ROCM_CHECK OFF)
+
+# Override of rocm_check_toolchain_var: forwards to _rocm_check_toolchain_var unless
+# DISABLE_ROCM_CHECK is ON.
+# NOTE(review): presumably relies on CMake keeping the previous definition of a redefined
+# command available under an underscore-prefixed name -- confirm; this requires the original
+# to be defined before this file is included.
+function(rocm_check_toolchain_var var access value list_file)
+  if(NOT DISABLE_ROCM_CHECK)
+    _rocm_check_toolchain_var("${var}" "${access}" "${value}" "${list_file}")
+  endif()
+endfunction()
+
+# Override of CHECK_SYMBOL_EXISTS: turns the ROCm check off around the forwarded call to
+# _check_symbol_exists, then turns it back on. Being a macro, the set() calls take effect in
+# the caller's scope.
+macro(CHECK_SYMBOL_EXISTS)
+  set(DISABLE_ROCM_CHECK ON)
+  _check_symbol_exists(${ARGN})
+  set(DISABLE_ROCM_CHECK OFF)
+endmacro()
@@ -0,0 +1,108 @@
+# MIT License
+#
+# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Dependencies
+
+# HIP dependency is handled earlier in the project cmake file
+# when VerifyCompiler.cmake is included.
+
+# GIT
+
+# Test dependencies
+include(FetchContent)
+
+# Find or download/install rocm-cmake project
+# Lookup order:
+#   1. an installed ROCmCMakeBuildTools package (version 0.11.0 requested),
+#   2. the older "ROCM" package name (version 0.7.3 requested),
+#   3. a checkout of rocm-cmake fetched from GitHub at tag ${rocm_cmake_tag}.
+find_package(ROCmCMakeBuildTools 0.11.0 CONFIG QUIET PATHS "${ROCM_PATH}")
+if(NOT ROCmCMakeBuildTools_FOUND)
+    find_package(ROCM 0.7.3 CONFIG QUIET PATHS "${ROCM_PATH}") # deprecated fallback
+    if(NOT ROCM_FOUND)
+        message(STATUS "ROCmCMakeBuildTools not found. Fetching...")
+        set(PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern)
+        # Cache entry, so the tag can be overridden with -Drocm_cmake_tag=<tag>
+        set(rocm_cmake_tag "rocm-6.4.0" CACHE STRING "rocm-cmake tag to download")
+        # NOTE(review): SOURCE_SUBDIR names a directory that presumably does not
+        # exist in the checkout, so that the sources are only downloaded and not
+        # added to this build - confirm.
+        FetchContent_Declare(
+            rocm-cmake
+            GIT_REPOSITORY https://github.com/ROCm/rocm-cmake.git
+            GIT_TAG ${rocm_cmake_tag}
+            SOURCE_SUBDIR "DISABLE ADDING TO BUILD"
+        )
+        FetchContent_MakeAvailable(rocm-cmake)
+        # Only the freshly fetched source tree is searched here (NO_DEFAULT_PATH)
+        find_package(ROCmCMakeBuildTools CONFIG REQUIRED NO_DEFAULT_PATH PATHS "${rocm-cmake_SOURCE_DIR}")
+    endif()
+endif()
+
+# Find available local ROCM targets
+# NOTE: This will eventually be part of ROCm-CMake and should be removed at that time
+#
+# rocm_local_targets(<VARIABLE>)
+#   Runs rocm_agent_enumerator and stores the de-duplicated list of reported
+#   agents (every non-empty entry except "gfx000") in <VARIABLE> in the caller's
+#   scope. <VARIABLE> is set to "NOTFOUND" when the tool cannot be found, exits
+#   with a non-zero status, or reports no usable agent.
+function(rocm_local_targets VARIABLE)
+  set(${VARIABLE} "NOTFOUND" PARENT_SCOPE)
+  # Search the default ROCm location and $ROCM_PATH. PATH_SUFFIXES makes the
+  # ROCM_PATH hint work when it names the ROCm root rather than its bin/ folder.
+  find_program(_rocm_agent_enumerator rocm_agent_enumerator
+    HINTS /opt/rocm/bin ENV ROCM_PATH
+    PATH_SUFFIXES bin)
+  if(NOT _rocm_agent_enumerator STREQUAL "_rocm_agent_enumerator-NOTFOUND")
+    execute_process(
+      COMMAND "${_rocm_agent_enumerator}"
+      RESULT_VARIABLE _found_agents
+      OUTPUT_VARIABLE _rocm_agents
+      OUTPUT_STRIP_TRAILING_WHITESPACE
+      ERROR_QUIET
+      )
+    if (_found_agents EQUAL 0)
+      # One agent per output line -> CMake list
+      string(REPLACE "\n" ";" _rocm_agents "${_rocm_agents}")
+      unset(result)
+      foreach (agent IN LISTS _rocm_agents)
+        # Skip blank lines as well as the gfx000 entry
+        if (NOT agent STREQUAL "" AND NOT agent STREQUAL "gfx000")
+          list(APPEND result "${agent}")
+        endif()
+      endforeach()
+      if(result)
+        list(REMOVE_DUPLICATES result)
+        set(${VARIABLE} "${result}" PARENT_SCOPE)
+      endif()
+    endif()
+  endif()
+endfunction()
+
+# Iterate over the "source" list and check if there is a duplicate file name
+# NOTE: This is due to compiler bug '--save-temps' and can be removed when fix available
+#
+# add_file_unique(<FILE_LIST> <FILE>)
+#   FILE_LIST - name of the list variable holding the files collected so far
+#   FILE      - path of the file that is about to be added
+#   When a file with the same base name is already in the list, HIP_FILE is set
+#   in the caller's scope to "<dir>/<name>_tmp<ext>"; otherwise nothing is set.
+function(add_file_unique FILE_LIST FILE)
+  get_filename_component(FILE_NAME "${FILE}" NAME)
+
+  # Iterate over whatever is in the list so far
+  foreach(curr_file IN LISTS ${FILE_LIST})
+    get_filename_component(curr_file_name "${curr_file}" NAME)
+
+    # Check if duplicate. Both operands are quoted so that paths containing
+    # spaces work and a file name is never taken for a variable name.
+    if("${FILE_NAME}" STREQUAL "${curr_file_name}")
+      get_filename_component(DIR_PATH "${FILE}" DIRECTORY)
+      get_filename_component(FILE_NAME_WE "${FILE}" NAME_WE)
+      get_filename_component(FILE_EXT "${FILE}" EXT)
+
+      # Construct a new file name by adding _tmp
+      set(HIP_FILE "${DIR_PATH}/${FILE_NAME_WE}_tmp${FILE_EXT}" PARENT_SCOPE)
+      # Further matches would produce the very same name
+      break()
+    endif()
+  endforeach()
+endfunction()
+
+include(ROCMSetupVersion)
+include(ROCMCreatePackage)
+include(ROCMInstallTargets)
+include(ROCMPackageConfigHelpers)
+include(ROCMInstallSymlinks)
+include(ROCMCheckTargetIds)
+include(ROCMClients)
+include(ROCMHeaderWrapper)
@@ -0,0 +1,145 @@
+# Performance reported by RCCL tests
+
+RCCL tests report the average operation time in ms, and two bandwidths in GB/s : algorithm bandwidth and bus bandwidth. This page explains what those numbers mean and what you should expect depending on the hardware used.
+
+# Time
+
+Time is useful with small sizes, to measure the constant overhead (or latency) associated with operations.
+
+On large sizes, the time becomes linear with the size (since it is roughly equal to overhead + size / bw), so it no longer measures only the latency
+but also the size divided by the bandwidth.
+
+Therefore, on large sizes, it makes more sense to look at the bandwidth.
+
+# Bandwidth
+
+## Algorithm bandwidth
+
+Algorithm bandwidth is using the most commonly used formula for bandwidth : size (_S_) / time (_t_). It is useful to compute how much time any large operation would take by simply dividing the size of the operation by the algorithm bandwidth.
+
+`algbw = S/t`
+
+## Bus bandwidth
+
+While the algorithm bandwidth makes sense for point-to-point operations like Send/Receive, it is not always helpful for measuring the speed of collective operations, since the theoretical peak algorithm bandwidth is not equal to the hardware peak bandwidth and usually depends on the number of ranks.
+Most benchmarks only provide time measurements, which are hard to interpret for large sizes. Some others also provide the algorithm bandwidth, but that bandwidth varies with the number of ranks (and decreases as the number of ranks increases).
+
+To provide a number which reflects how optimally the hardware is used, RCCL tests introduce the notion of "Bus Bandwidth" ("busbw" column in the tests output).
+This number is obtained by applying a formula to the algorithm bandwidth to reflect the speed of the inter-GPU communication.
+Using this bus bandwidth, we can compare it with the hardware peak bandwidth, independently of the number of ranks used.
+
+The formula depends on the collective operation.
+
+### AllReduce
+
+An allreduce operation, for each element of the N arrays (input i_X and output o_X, each situated on rank X), is performing the following operation :
+
+`o_0 = o_1 = o_2 = ... = o_{n-1} = i_0 + i_1 + i_2 + ... + i_{n-1}`
+
+**Note : this is independent of the algorithm used (ring, tree, or other) as long as they use point-to-point operations (send/receive).**
+
+A ring would do that operation in an order which follows the ring :
+
+`i_0 + i_1 + ... + i_{n-1} -> o_{n-1} -> o_0 -> o_1 -> .. -> o_{n-2}`
+
+A tree would do it hierarchically :
+
+`(((((i_{n-1} + i_{n-2}) + (i_{n-3} + i_{n-4})) + ... + (i_1 + i_0)))) -> o_0 -> (o_{n/2} -> (o_{3n/4} ...))`
+
+In all cases, we need n-1 additions and n assignments for each element. Since every step is on a different rank except potentially one (the last input and the first output),
+we need 2(n-1) data transfers (x number of elements) to perform an allReduce operation.
+
+Considering that each rank has a bandwidth to the outside world of _B_, the time to perform an allReduce operation of _S_ elements is at best :
+
+ `t = (S*2*(n-1)) / (n*B)`
+
+Indeed, we have _S_ elements, 2*(n-1) operations per element, and _n_ links of bandwidth _B_ to perform them.
+Reordering the equation, we find that
+
+ `t = (S/B) * (2*(n-1)/n)`
+
+Therefore, to get an AllReduce bandwidth measurement which we can compare to the hardware peak bandwidth, we compute :
+
+ `B = S/t * (2*(n-1)/n) = algbw * (2*(n-1)/n)`
+
+### ReduceScatter
+
+The ReduceScatter operation requires only to perform the addition part of the allReduce operation :
+
+ `o_K = i_0 + i_1 + i_2 + ... + i_{n-1}`
+
+With K being the rank which is getting the final result (K=offset/recvsize).
+
+The perfect reduceScatter time with a rank bandwidth of B would therefore be :
+
+ `t = S*(n-1) / (B*n)`
+
+And the Bus Bandwidth is therefore computed as :
+
+ `B = S/t * (n-1)/n = algbw * (n-1)/n`
+
+Note that here, S is the size in bytes of the total array, which for RCCL is equal to `recvcount*sizeof(datatype)*n` as the `recvcount` argument is the count per rank.
+
+### AllGather
+
+The AllGather operation requires only to perform the assignment part of the allReduce operation :
+
+ `o_0 = o_1 = o_2 = ... = o_{n-1} = i_K`
+
+With K being the rank where the data originates from (K=offset/sendsize).
+
+The perfect allGather time with a rank bandwidth of B would therefore be :
+
+ `t = S*(n-1) / (B*n)`
+
+And the Bus Bandwidth is therefore computed as :
+
+ `B = S/t * (n-1)/n = algbw * (n-1)/n`
+
+Note that here, S is the size in bytes of the total array, which for RCCL is equal to `sendcount*sizeof(datatype)*n` as the `sendcount` argument is the count per rank.
+
+### Broadcast
+
+The broadcast operation representation is similar to allGather :
+
+ `o_0 = o_1 = o_2 = ... = o_{n-1} = i_R`
+
+R being the root of the operation.
+
+However, in this case, since the i_R input is not evenly distributed on the ranks, we cannot use all _n_ links to perform the transfer operations.
+Indeed, *all* data has to get out of the root rank, hence the bottleneck is on the root rank which only has B as capacity to get data out :
+
+ `t = S/B`
+
+And :
+
+ `B = S/t`
+
+### Reduce
+
+The reduce operation performs :
+
+ `o_R = i_0 + i_1 + i_2 + ... + i_{n-1}`
+
+R being the root of the operation.
+
+Similarly to broadcast, all data need to be sent to the root, hence :
+
+ `t = S/B`
+
+And :
+
+ `B = S/t`
+
+### Summary
+
+To obtain a bus bandwidth which should be independent of the number of ranks _n_, we apply a correction factor to the algorithm bandwidth :
+
+* AllReduce : 2*(_n_-1)/_n_
+* ReduceScatter : (_n_-1)/_n_
+* AllGather : (_n_-1)/_n_
+* Broadcast : 1
+* Reduce : 1
+* AlltoAll : (_n_-1)/_n_
+
+The bus bandwidth should reflect the speed of the hardware bottleneck : NVLink, PCI, QPI, or network.
@@ -0,0 +1,41 @@
+# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+# This Dockerfile provides a starting point for a ROCm installation of rccl
+
+# Parameters related to building rccl
+# Image to build on; declared without a default value
+ARG base_image
+
+FROM ${base_image}
+LABEL maintainer="rccl-maintainer@amd.com"
+
+USER root
+# UID given to the jenkins user created at the end of this file
+ARG user_uid
+
+# Install dependent packages
+RUN yum install -y --nogpgcheck \
+    sudo \
+    chrpath \
+    rock-dkms \
+    rocm-cmake \
+    centos-release-scl \
+    devtoolset-7 \
+    ca-certificates \
+    git \
+    cmake3 \
+    make \
+    libgomp \
+    clang \
+    clang-devel \
+    gcc-c++ \
+    pkgconfig \
+    numactl-libs 
+
+# Write a profile script that enables devtoolset-7 via scl_source
+RUN echo '#!/bin/bash' | tee /etc/profile.d/devtoolset7.sh && echo \
+    'source scl_source enable devtoolset-7' >>/etc/profile.d/devtoolset7.sh
+
+# docker pipeline runs containers with particular uid
+# create a jenkins user with this specific uid so it can use sudo privileges
+# Grant any member of the video group password-less sudo privileges
+# (the jenkins user is put into that group with -G video)
+RUN useradd --create-home -u ${user_uid} -o -G video --shell /bin/bash jenkins && \
+    echo '%video ALL=(ALL) NOPASSWD:ALL' | tee /etc/sudoers.d/sudo-nopasswd && \
+    chmod 400 /etc/sudoers.d/sudo-nopasswd
+
@@ -0,0 +1,43 @@
+# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+# Parameters related to building rccl
+# Image to build on; declared without a default value
+ARG base_image
+
+FROM ${base_image}
+LABEL maintainer="rccl-maintainer@amd.com"
+
+# UID given to the jenkins user created at the end of this file
+ARG user_uid
+
+# Install dependent packages
+# Dependencies:
+# * hcc-config.cmake: pkg-config
+# * tensile: python2.7, python-yaml
+# * rocblas-test: gfortran, googletest
+# * rocblas-bench: libboost-program-options-dev
+# * libhsakmt.so: libnuma1
+# The apt cache and package lists are removed in the same layer.
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    rock-dkms \
+    sudo \
+    ca-certificates \
+    chrpath \
+    git \
+    make \
+    cmake \
+    pkg-config \
+    python2.7 \
+    python-yaml \
+    python3-pytest \
+    rocm-cmake \
+    libboost-program-options-dev \
+    libnuma1 \
+    libomp-dev \
+    && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# docker pipeline runs containers with particular uid
+# create a jenkins user with this specific uid so it can use sudo privileges
+# Grant any member of the video group password-less sudo privileges
+# (the jenkins user is put into that group with -G video)
+RUN useradd --create-home -u ${user_uid} -o -G video --shell /bin/bash jenkins && \
+    mkdir -p /etc/sudoers.d/ && \
+    echo '%video ALL=(ALL) NOPASSWD:ALL' | tee /etc/sudoers.d/sudo-nopasswd
+
@@ -0,0 +1,8 @@
+# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+# Parameters related to building rccl
+# Image to build on; declared without a default value
+ARG base_image
+
+FROM ${base_image}
+LABEL maintainer="rccl-maintainer@amd.com"
+
+# Nothing is installed or configured on top of the base image yet
+#empty for now
@@ -0,0 +1,8 @@
+# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+# Parameters related to building rccl
+# Image to build on; declared without a default value
+ARG base_image
+
+FROM ${base_image}
+LABEL maintainer="rccl-maintainer@amd.com"
+
+# Nothing is installed or configured on top of the base image yet
+#empty for now
@@ -0,0 +1,154 @@
+#!/bin/bash
+# Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+
+# #################################################
+# helper functions
+# #################################################
+# Print the usage summary with every supported command-line option.
+function display_help()
+{
+    echo "RCCL-tests build & installation helper script"
+    echo "./install [-h|--help] "
+    echo "    [-h|--help] Prints this help message."
+    echo "    [-m|--mpi] Build RCCL-tests with MPI support. (see --mpi_home below.)"
+    echo "    [-t|--test] Run unit-tests after building RCCL-Tests."
+    echo "    [--rocm_home] Specify custom path for ROCm installation (default: /opt/rocm)"
+    echo "    [--rccl_home] Specify custom path for RCCL installation (default: /opt/rocm)"
+    echo "    [--mpi_home] Specify path to your MPI installation."
+    echo "    [--hip_compiler] Specify path to HIP compiler (default: /opt/rocm/bin/amdclang++)"
+    echo "    [--gpu_targets] Specify GPU targets (default:gfx906,gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1200,gfx1201)"
+}
+
+# #################################################
+# global variables
+# #################################################
+run_tests=false
+build_release=true
+mpi_enabled=false
+rocm_dir=${ROCM_PATH}
+rccl_dir=${rocm_dir}
+mpi_dir=""
+hip_compiler=${rocm_dir}/bin/amdclang++
+gpu_targets=""
+
+# #################################################
+# Parameter parsing
+# #################################################
+
+# check if we have a modern version of getopt that can handle whitespace and long parameters
+# (exit status 4 from "getopt -T" is what the script accepts as such a version)
+getopt -T
+if [[ $? -eq 4 ]]; then
+  GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,mpi,test,rocm_home:,rccl_home:,mpi_home:,hip_compiler:,gpu_targets: --options hmt -- "$@")
+else
+  echo "Need a new version of getopt"
+  exit 1
+fi
+
+# $? still holds the status of the getopt call above: the assignment was the
+# last command executed inside the if-block and reports the status of its
+# command substitution.
+if [[ $? -ne 0 ]]; then
+  echo "getopt invocation failed; could not parse the command line";
+  exit 1
+fi
+
+# Replace the positional parameters with the normalized getopt output
+eval set -- "${GETOPT_PARSE}"
+
+# Consume options until the "--" separator; flags shift by one, options with
+# a value shift by two.
+while true; do
+  case "${1}" in
+    -h|--help)
+       display_help
+       exit 0 ;;
+    -m|--mpi)
+       mpi_enabled=true
+       shift ;;
+    -t|--test)
+       run_tests=true
+       shift ;;
+    --rocm_home)
+       rocm_dir=${2}
+       shift 2 ;;
+    --rccl_home)
+       rccl_dir=${2}
+       shift 2 ;;
+    --mpi_home)
+       mpi_dir=${2}
+       shift 2 ;;
+    --hip_compiler)
+       hip_compiler=${2}
+       shift 2 ;;
+    --gpu_targets)
+       gpu_targets=${2}
+       shift 2 ;;
+    --) shift ; break ;;
+    *)  echo "Unexpected command line parameter received; aborting";
+    exit 1 ;;
+  esac
+done
+
+# Abort the install script with the given status when that status is non-zero.
+#   $1 - exit status of the command that was just run
+check_exit_code( )
+{
+  # A zero status means the command succeeded: nothing to do
+  (( $1 != 0 )) || return 0
+  exit $1
+}
+
+# Install the pre-commit hook
+#bash ./githooks/install
+
+# Build output directory, relative to the directory the script is started from
+build_dir=./build
+# #################################################
+# prep
+# #################################################
+# ensure a clean build environment
+rm -rf ${build_dir}
+
+# rocm_dir is empty when ROCM_PATH was not set in the environment (and no
+# non-empty --rocm_home was given); fall back to the standard location then
+if [[ -z ${rocm_dir} ]]; then
+  echo "[WARN] ROCM_PATH does not exist at ${rocm_dir}. Defaulting to /opt/rocm"
+  rocm_dir=/opt/rocm
+fi
+
+# Select the HIP compiler: the requested one first, then amdclang++ and finally
+# hipcc from the ROCm installation. The script aborts when none can be found.
+# Redirection order matters: stdout goes to /dev/null first, then stderr
+# follows it, so that the probe prints nothing at all.
+if ! command -v "${hip_compiler}" >/dev/null 2>&1 ; then
+  echo "[WARN] HIP Compiler does not exist at ${hip_compiler}. Please check the path."
+  echo "[WARN] - Falling back to ${rocm_dir}/bin/amdclang++"
+  hip_compiler=${rocm_dir}/bin/amdclang++
+
+  if ! command -v "${hip_compiler}" >/dev/null 2>&1 ; then
+    echo "[WARN] ${hip_compiler} does not exist. Please be advised."
+    echo "[WARN] - Falling back to ${rocm_dir}/bin/hipcc"
+    hip_compiler=${rocm_dir}/bin/hipcc
+
+    if ! command -v "${hip_compiler}" >/dev/null 2>&1 ; then
+      echo "[ERROR] ${hip_compiler} does not exist!. Please check your ROCm installation." >&2
+      echo "[ERROR] Cannot proceed with building rccl-tests!" >&2
+      exit 1
+    fi
+  fi
+fi
+echo "[INFO] Compiling with ${hip_compiler}"
+
+if [[ -n ${gpu_targets} ]]; then
+  GPU_TARGETS="GPU_TARGETS=${gpu_targets}"
+fi
+
+if ($mpi_enabled); then
+  if [[ ${mpi_dir} == "" ]]; then
+    echo "[ERROR] MPI flag enabled but path to MPI installation not specified.  See --mpi_home command line argument." >&2
+    exit 1
+  else
+    echo "[INFO] Compiling with MPI support (Using MPI from ${mpi_dir})"
+    echo
+    make NCCL_HOME=${rccl_dir} CUSTOM_RCCL_LIB=${rccl_dir}/lib/librccl.so MPI=1 MPI_HOME=${mpi_dir} HIPCC=${hip_compiler} ${GPU_TARGETS} -j$(nproc)
+  fi
+else
+  echo "[INFO] Compiling without MPI support (MPI support requires -m and --mpi_home)"
+  echo
+  make NCCL_HOME=${rccl_dir} CUSTOM_RCCL_LIB=${rccl_dir}/lib/librccl.so HIPCC=${hip_compiler} ${GPU_TARGETS} -j$(nproc)
+fi
+check_exit_code "$?"
+
+# Optionally, run tests if they're enabled.
+# The status of the pytest run becomes the exit status of the script.
+if [[ "${run_tests}" == true ]]; then
+  # Never let pytest collect from the wrong directory when test/ is missing
+  cd test || exit 1
+  if [[ "${mpi_enabled}" == true ]]; then
+    LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${rccl_dir}/lib:${mpi_dir}/lib PATH=$PATH:${mpi_dir}/bin python3 -m pytest
+  else
+    # Without MPI support, deselect every test whose name contains "MPI"
+    LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${rccl_dir}/lib python3 -m pytest -k "not MPI"
+  fi
+fi
@@ -0,0 +1,148 @@
+# ########################################################################
+# Copyright 2022-2024 Advanced Micro Devices, Inc.
+# ########################################################################
+
+# add_rccl_test(<TEST>)
+#   Hipifies ${SOURCE_DIR}/<TEST>.cu into ${HIPIFY_DIR}/<TEST>.cu.cpp, builds it
+#   as the executable <TEST>_perf (linked against rccl_common and placed in
+#   ${CMAKE_BINARY_DIR}), registers it as CTest test <TEST> and installs it.
+function(add_rccl_test TEST)
+    set(TEST_SOURCE "${TEST}.cu")
+    set_property(SOURCE ${TEST_SOURCE} PROPERTY LANGUAGE CXX)
+
+    # Check that file exists
+    if (NOT EXISTS "${SOURCE_DIR}/${TEST_SOURCE}")
+        message(FATAL_ERROR "Unable to find file listed in CMakeLists.txt: ${SOURCE_DIR}/${TEST_SOURCE}")
+    endif()
+
+    # Establish hipified copy of the source file.
+    # ".cpp" is appended to the file name (.cu -> .cu.cpp) so that the copy gets
+    # processed properly; only the file name is touched, so a ".cu" somewhere in
+    # the build path cannot be rewritten by accident.
+    set(HIP_FILE "${HIPIFY_DIR}/${TEST_SOURCE}.cpp")
+    get_filename_component(HIP_FILE_DIR "${HIP_FILE}" DIRECTORY)
+
+    # Create a custom command to create hipified source code
+    add_custom_command(
+        OUTPUT ${HIP_FILE}
+        COMMAND mkdir -p ${HIP_FILE_DIR} && ${hipify-perl_executable} -quiet-warnings ${SOURCE_DIR}/${TEST_SOURCE} -o ${HIP_FILE}
+        MAIN_DEPENDENCY ${TEST_SOURCE}
+        COMMENT "Hipifying ${TEST_SOURCE} -> ${HIP_FILE}"
+    )
+
+    set(TEST_TARGET "${TEST}_perf")
+    add_executable(${TEST_TARGET} ${HIP_FILE})
+    target_link_libraries(
+        ${TEST_TARGET}
+        PRIVATE
+            rccl_common
+    )
+    # Must be set before add_relative_test(), which reads this property
+    set_target_properties(
+        ${TEST_TARGET}
+        PROPERTIES
+            RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+            # LINKER_LANGUAGE CXX
+    )
+    add_relative_test(${TEST} ${TEST_TARGET})
+    rocm_install(TARGETS ${TEST_TARGET})
+    # TODO: copy/install DLLs on Windows
+    set_target_properties(
+        ${TEST_TARGET} PROPERTIES
+        INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib;${ROCM_PATH}/lib"
+    )
+endfunction()
+
+# add_relative_test(<test_name> <test_target>)
+#   Registers CTest test <test_name>, which launches the executable of
+#   <test_target> through a path relative to the current binary directory.
+function(add_relative_test test_name test_target)
+    # Directory of the executable; "." when the target does not define one
+    get_target_property(exe_dir ${test_target} RUNTIME_OUTPUT_DIRECTORY)
+    if(exe_dir STREQUAL "exe_dir-NOTFOUND")
+        set(exe_dir ".")
+    endif()
+    get_filename_component(exe_dir "${exe_dir}" ABSOLUTE BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
+
+    # File name of the executable: RUNTIME_OUTPUT_NAME, else OUTPUT_NAME,
+    # else the name of the target itself
+    get_target_property(exe_file ${test_target} RUNTIME_OUTPUT_NAME)
+    if(exe_file STREQUAL "exe_file-NOTFOUND")
+        get_target_property(exe_file ${test_target} OUTPUT_NAME)
+    endif()
+    if(exe_file STREQUAL "exe_file-NOTFOUND")
+        set(exe_file "${test_target}")
+    endif()
+
+    file(RELATIVE_PATH relative_exe "${CMAKE_CURRENT_BINARY_DIR}" "${exe_dir}/${exe_file}")
+    add_test(NAME "${test_name}" COMMAND "./${relative_exe}")
+endfunction()
+
+# Collect list of common source files
+#==================================================================================================
+set(COMMON_FILES
+  git_version.h
+  common.h
+  common.cu
+  nccl1_compat.h
+  rccl_compat.h
+  rccl_float8.h
+  timer.h
+  timer.cc
+  ../verifiable/verifiable.h
+  ../verifiable/verifiable.cu
+)
+
+# Hipify common files (copy of source generated into hipify directory)
+#==================================================================================================
+find_program(hipify-perl_executable hipify-perl)
+# Fail at configure time rather than with an obscure error in the middle of the build
+if(NOT hipify-perl_executable)
+  message(FATAL_ERROR "Unable to find hipify-perl, which is required to generate the HIP sources")
+endif()
+set(HIPIFY_DIR "${CMAKE_CURRENT_BINARY_DIR}/hipify")
+set(SOURCE_DIR "${CMAKE_SOURCE_DIR}/src")
+
+## Loop over each common file to hipify
+foreach(COMMON_FILE ${COMMON_FILES})
+  # Check that file exists
+  if (NOT EXISTS "${SOURCE_DIR}/${COMMON_FILE}")
+    message(FATAL_ERROR "Unable to find file listed in CMakeLists.txt: ${SOURCE_DIR}/${COMMON_FILE}")
+  endif()
+
+  # Establish hipified copy of the common file (all copies go directly into
+  # HIPIFY_DIR, any directory part of COMMON_FILE is dropped)
+  get_filename_component(HIP_FILE_NAME "${HIPIFY_DIR}/${COMMON_FILE}" NAME)
+
+  # Convert .cu files to .cpp so that they get processed properly.
+  # The replacement is applied to the file name only, so a ".cu" somewhere in
+  # the build path cannot be rewritten by accident.
+  string(REPLACE ".cu" ".cu.cpp" HIP_FILE_NAME "${HIP_FILE_NAME}")
+  set(HIP_FILE "${HIPIFY_DIR}/${HIP_FILE_NAME}")
+  list(APPEND HIP_COMMON_SOURCES ${HIP_FILE})
+
+  # Create a custom command to create hipified source code
+  add_custom_command(
+    OUTPUT ${HIP_FILE}
+    COMMAND mkdir -p ${HIPIFY_DIR} && ${hipify-perl_executable} -quiet-warnings ${SOURCE_DIR}/${COMMON_FILE} -o ${HIP_FILE}
+    MAIN_DEPENDENCY ${COMMON_FILE}
+    COMMENT "Hipifying ${COMMON_FILE} -> ${HIP_FILE}"
+  )
+endforeach()
+
+# Create an initial git_version.cpp file (that will be updated with latest git version)
+#==================================================================================================
+# The file is written empty at configure time so that it exists when it is
+# listed as a source of rccl_common below.
+file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp "")
+list(APPEND HIP_COMMON_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp)
+
+# Create a custom target that runs cmake/git_version.cmake in script mode; it is
+# executed whenever rccl_common is built (see add_dependencies below)
+add_custom_target(git_version_check
+  COMMENT "Updating git_version.cpp if necessary"
+  COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/git_version.cmake
+  VERBATIM
+)
+
+# Compile common object library
+#==================================================================================================
+# "hipify" groups the custom commands that generate the hipified common sources
+add_custom_target(hipify DEPENDS ${HIP_COMMON_SOURCES})
+add_library(rccl_common OBJECT ${HIP_COMMON_SOURCES})
+add_dependencies(rccl_common hipify git_version_check)
+target_link_libraries(rccl_common roc::rccl hip::device Threads::Threads dl)
+# MPI is only linked in when the build was configured with USE_MPI
+if(USE_MPI)
+  target_link_libraries(rccl_common MPI::MPI_CXX)
+endif()
+
+# Compile tests
+#==================================================================================================
+# Each call builds <name>_perf from src/<name>.cu (see add_rccl_test above)
+add_rccl_test(all_gather)
+add_rccl_test(all_reduce)
+add_rccl_test(alltoall)
+add_rccl_test(alltoallv)
+add_rccl_test(broadcast)
+add_rccl_test(gather)
+add_rccl_test(hypercube)
+add_rccl_test(reduce_scatter)
+add_rccl_test(reduce)
+add_rccl_test(scatter)
+add_rccl_test(sendrecv)
+add_rccl_test(all_reduce_bias)
@@ -0,0 +1,205 @@
+#
+# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
+# Modifications are Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+include common.mk
+
+# User-overridable settings (environment or make command line)
+ROCM_PATH ?= /opt/rocm
+MPI_HOME ?= /usr/lib/x86_64-linux-gnu
+PREFIX ?= /usr/local
+VERBOSE ?= 0
+DEBUG ?= 0
+# The default is the literal two-character string "" - it is the "not set"
+# marker the ifneq tests further down compare against
+NCCL_HOME ?= ""
+CUSTOM_RCCL_LIB ?= ""
+
+# Tools taken from the ROCm installation
+HIPCC ?= $(ROCM_PATH)/bin/amdclang++
+HIPCONFIG = $(ROCM_PATH)/bin/hipconfig
+HIPIFY_PL_EXE=$(ROCM_PATH)/bin/hipify-perl
+HIPIFY_PL_FLAGS = -quiet-warnings
+CXX = $(HIPCC)
+
+# Flag variables, extended step by step below
+HIPCUFLAGS := -std=c++14
+LDFLAGS    :=
+HIPLDFLAGS :=
+
+# Set to 1 to enable MPI support (multi-process/multi-node)
+MPI ?= 0
+# e.g. Set to _mpi when using MPI=1
+NAME_SUFFIX ?=
+# Set to 1 to create and use libverifiable.so to reduce binary size
+DSO ?= 0
+
+# Output of "hipconfig --version" (empty when hipconfig cannot be found), and
+# its first two dot-separated fields
+HIP_VERSION = $(strip $(shell which $(HIPCONFIG) >/dev/null && $(HIPCONFIG) --version))
+HIP_MAJOR = $(shell echo $(HIP_VERSION) | cut -d "." -f 1)
+HIP_MINOR = $(shell echo $(HIP_VERSION) | cut -d "." -f 2)
+
+# Better define GPU_TARGETS in your environment to the minimal set
+# of archs to reduce compile time.
+# Currently, supports gfx906,gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1200,gfx1201
+# The default list below is only built when GPU_TARGETS is not defined at all;
+# the "0" prefix keeps the numeric tests valid when the HIP version is empty.
+ifndef GPU_TARGETS
+GPU_TARGETS = gfx906 gfx908 gfx90a
+  ifeq ($(shell test "0$(HIP_MAJOR)" -ge 7; echo $$?),0)
+    GPU_TARGETS += gfx942 gfx950
+  else ifeq ($(shell test "0$(HIP_MAJOR)" -eq 6; echo $$?),0)
+    # Include gfx942 support if we're using ROCm 6.0 or above
+    GPU_TARGETS += gfx942
+    ifeq ($(shell test "0$(HIP_MINOR)" -ge 5; echo $$?),0)
+    # Include gfx950 support if we're using ROCm 6.5 or above
+    GPU_TARGETS += gfx950
+    endif
+  endif
+GPU_TARGETS += gfx1030 gfx1100 gfx1101 gfx1102 gfx1200 gfx1201
+endif
+
+# One quoted --offload-arch=<target> option per whitespace-separated entry
+GPU_TARGETS_FLAGS = $(foreach target,$(GPU_TARGETS),"--offload-arch=$(target)")
+
+#CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
+#CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
+#CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
+#
+## Better define NVCC_GENCODE in your environment to the minimal set
+## of archs to reduce compile time.
+#ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
+## Include Blackwell support if we're using CUDA12.8 or above
+#NVCC_GENCODE ?=	-gencode=arch=compute_80,code=sm_80 \
+#		-gencode=arch=compute_90,code=sm_90 \
+#		-gencode=arch=compute_100,code=sm_100 \
+#		-gencode=arch=compute_120,code=sm_120 \
+#		-gencode=arch=compute_120,code=compute_120
+#else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 12; echo $$?),0)
+#NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
+#                -gencode=arch=compute_61,code=sm_61 \
+#                -gencode=arch=compute_70,code=sm_70 \
+#		-gencode=arch=compute_80,code=sm_80 \
+#		-gencode=arch=compute_90,code=sm_90 \
+#		-gencode=arch=compute_90,code=compute_90
+#else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
+#NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
+#                -gencode=arch=compute_61,code=sm_61 \
+#                -gencode=arch=compute_70,code=sm_70 \
+#		-gencode=arch=compute_80,code=sm_80 \
+#		-gencode=arch=compute_80,code=compute_80
+#else
+#NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
+#                -gencode=arch=compute_50,code=sm_50 \
+#                -gencode=arch=compute_60,code=sm_60 \
+#                -gencode=arch=compute_61,code=sm_61 \
+#                -gencode=arch=compute_70,code=sm_70 \
+#                -gencode=arch=compute_70,code=compute_70
+#endif
+
+# Custom RCCL/NCCL location: add its include and library directories
+# (NCCL_HOME is compared against the literal "" default set above)
+ifneq ($(NCCL_HOME), "")
+HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
+HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME) -L$(NCCL_HOME)/lib
+endif
+
+HIPCUFLAGS += -I$(ROCM_PATH)/include
+HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
+HIPCUFLAGS += -x hip -D__HIP_PLATFORM_AMD__ -D__HIPCC__ $(GPU_TARGETS_FLAGS)
+LDFLAGS    += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
+# NOTE(review): the value is passed to -L, i.e. used as a library search
+# directory - confirm what callers are expected to put into CUSTOM_RCCL_LIB
+ifneq ($(CUSTOM_RCCL_LIB), "")
+HIPLDFLAGS += -L$(CUSTOM_RCCL_LIB)
+endif
+HIPLDFLAGS += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lamdhip64 -lstdc++ -lrt -pthread
+
+# Optimized build unless DEBUG is set to something other than 0
+ifeq ($(DEBUG), 0)
+HIPCUFLAGS += -O3
+else
+HIPCUFLAGS += -O0 -g -ggdb3
+endif
+
+# Recipes are not echoed unless VERBOSE is set to something other than 0
+ifeq ($(VERBOSE), 0)
+.SILENT:
+endif
+
+.PHONY: build clean
+
+BUILDDIR ?= ../build
+HIPIFY_DIR ?= $(BUILDDIR)/hipify
+
+# Keep the hipified sources and headers instead of letting make delete them
+# as intermediate files
+.PRECIOUS: $(HIPIFY_DIR)/%.cu.cpp $(HIPIFY_DIR)/%.h
+
+# MPI=1 selects the Open MPI paths and -lmpi; otherwise MPICH=1 selects the
+# MPICH paths and -lmpich. Both define MPI_SUPPORT.
+ifeq ($(MPI), 1)
+HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/openmpi -I${MPI_HOME}/openmpi/include -I${MPI_HOME}/openmpi/include/openmpi
+HIPLDFLAGS += -L${MPI_HOME}/lib -L${MPI_HOME}/openmpi/lib -lmpi
+else ifeq ($(MPICH), 1)
+HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/mpich/include -I/usr/include/x86_64-linux-gnu/mpich
+HIPLDFLAGS += -L${MPI_HOME}/lib -L${MPI_HOME}/mpich/lib -lmpich
+endif
+
+# Every entry of LIBRARIES becomes a -l<name> linker option
+LIBRARIES += rccl dl
+HIPLDFLAGS += $(LIBRARIES:%=-l%)
+
+# Output locations: one object per .cu file in this directory and one
+# <name>_perf$(NAME_SUFFIX) binary per entry of BIN_FILES_LIST
+DST_DIR := $(BUILDDIR)
+SRC_FILES := $(wildcard *.cu)
+OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
+BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv alltoallv hypercube all_reduce_bias
+BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf${NAME_SUFFIX})
+
+# Git information embedded into the binaries: short hash of HEAD, a "+" when
+# "git diff --quiet" reports differences, and the current branch name
+GIT_VERSION_FILE := ${DST_DIR}/src/git_version.cpp
+GIT_REV          := $(shell git log --pretty=format:'%h' -n 1)
+GIT_DIFF         := $(shell git diff --quiet --exit-code || echo +)
+GIT_BRANCH       := $(shell git rev-parse --abbrev-ref HEAD)
+
+build: ${BIN_FILES}
+
+clean:
+	rm -rf ${DST_DIR}
+
+# Inputs consumed by the included verifiable.mk
+TEST_VERIFIABLE_SRCDIR := ../verifiable
+TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable
+include ../verifiable/verifiable.mk
+
+# Rule to create git_version.cpp
+# (no prerequisites: the file is only written when it does not exist yet)
+$(GIT_VERSION_FILE):
+	@mkdir -p ${DST_DIR}/src
+	@echo 'const char* rcclTestsGitHash = "$(GIT_BRANCH):$(GIT_REV)$(GIT_DIFF)";' > $@
+
+# Hipify a test source: <name>.cu -> $(HIPIFY_DIR)/<name>.cu.cpp
+${HIPIFY_DIR}/%.cu.cpp: %.cu
+	@printf "Hipifying  %-35s > %s\n" $< $@
+	@mkdir -p ${HIPIFY_DIR}
+	${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@
+
+# Hipify a header: <name>.h -> $(HIPIFY_DIR)/<name>.h
+${HIPIFY_DIR}/%.h: %.h
+	@printf "Hipifying  %-35s > %s\n" $< $@
+	@mkdir -p ${HIPIFY_DIR}
+	${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@
+
+.PRECIOUS: ${DST_DIR}/%.o
+
+# Compile a hipified source into $(DST_DIR)/<name>.o
+${DST_DIR}/%.o: ${HIPIFY_DIR}/%.cu.cpp ${HIPIFY_DIR}/common.h $(TEST_VERIFIABLE_HDRS) $(GIT_VERSION_FILE)
+	@printf "Compiling  %-35s > %s\n" $< $@
+	@mkdir -p ${DST_DIR}
+	echo "$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<"
+	$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<
+
+# Objects whose name carries $(NAME_SUFFIX) (e.g. common_mpi.o) are compiled
+# from the hipified source of the un-suffixed name. The prerequisite has to
+# point into $(HIPIFY_DIR), which is where the hipify rule above writes the
+# .cu.cpp files. With an empty NAME_SUFFIX this rule would merely repeat the
+# one above, so it is only defined when a suffix is in use.
+ifneq ($(strip $(NAME_SUFFIX)),)
+${DST_DIR}/%$(NAME_SUFFIX).o: ${HIPIFY_DIR}/%.cu.cpp ${HIPIFY_DIR}/common.h $(TEST_VERIFIABLE_HDRS) $(GIT_VERSION_FILE)
+	@printf "Compiling  %-35s > %s\n" $< $@
+	@mkdir -p ${DST_DIR}
+	echo "$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<"
+	$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<
+endif
+
+# timer.cc is compiled directly, without going through hipify
+${DST_DIR}/timer.o: timer.cc timer.h
+	@printf "Compiling  %-35s > %s\n" $< $@
+	@mkdir -p ${DST_DIR}
+	$(CXX) $(CXXFLAGS) -o $@ -c $<
+
+# Link a test binary from its own object, the common object, the timer and
+# git_version.cpp (every prerequisite is handed to the compiler driver via $^).
+# DSO=1 links against libverifiable.so and adds an $ORIGIN-relative rpath;
+# otherwise the verifiable objects are linked in directly.
+ifeq ($(DSO), 1)
+${DST_DIR}/%_perf$(NAME_SUFFIX): ${DST_DIR}/%.o ${DST_DIR}/common$(NAME_SUFFIX).o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_LIBS) $(GIT_VERSION_FILE)
+	@printf "Linking  %-35s > %s\n" $< $@
+	@mkdir -p ${DST_DIR}
+	echo "$(HIPCC) -o $@ $^ $(HIPLDFLAGS)"
+	$(HIPCC) -o $@ $^ $(HIPLDFLAGS) -L$(TEST_VERIFIABLE_BUILDDIR) -lverifiable -Xlinker "--enable-new-dtags" -Xlinker "-rpath,\$$ORIGIN:\$$ORIGIN/verifiable"
+else
+${DST_DIR}/%_perf$(NAME_SUFFIX): ${DST_DIR}/%.o ${DST_DIR}/common$(NAME_SUFFIX).o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) $(GIT_VERSION_FILE)
+	@printf "Linking  %-35s > %s\n" $< $@
+	@mkdir -p ${DST_DIR}
+	echo "$(HIPCC) -o $@ $^ $(HIPLDFLAGS)"
+	$(HIPCC) -o $@ $^ $(HIPLDFLAGS)
+endif
+
+# Remove object files but keep the linked binaries. Declared phony so that a
+# file named "clean_intermediates" can never keep the recipe from running.
+.PHONY: clean_intermediates
+clean_intermediates:
+	rm -f ${DST_DIR}/*.o $(TEST_VERIFIABLE_OBJS)
+
@@ -0,0 +1,99 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+#include "rccl_compat.h"
+
+// Computes the element counts for an all-gather of `count` total elements
+// spread over `nranks` ranks.
+//   sendcount / paramcount / sendInplaceOffset : per-rank share
+//   recvcount                                  : per-rank share * nranks
+//   recvInplaceOffset                          : always 0
+// The per-rank share is (count/nranks) masked with -(16/eltSize), which rounds
+// it down to a multiple of 16/eltSize whenever that value is a power of two.
+void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  const size_t alignMask = -(16/eltSize);
+  const size_t perRank = (count/nranks) & alignMask;
+  *sendcount = perRank;
+  *recvcount = perRank*nranks;
+  *sendInplaceOffset = perRank;
+  *recvInplaceOffset = 0;
+  *paramcount = perRank;
+}
+
+// Prepares the buffers of every GPU driven by this thread for one all-gather
+// repetition: clears the receive buffer, fills this rank's input, and builds
+// the expected output slot by slot (one slot of sendBytes per rank).
+// Returns testSuccess, or the error produced by a failing CUDACHECK/TESTCHECK.
+testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
+  const int totalRanks = args->nProcs*args->nThreads*args->nGpus;
+
+  for (int gpu = 0; gpu < args->nGpus; ++gpu) {
+    CUDACHECK(cudaSetDevice(args->gpus[gpu]));
+    const int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + gpu);
+    CUDACHECK(cudaMemset(args->recvbuffs[gpu], 0, args->expectedBytes));
+
+    // In-place runs read the input from this rank's own slot of the receive buffer.
+    void* data;
+    if (in_place) {
+      data = ((char*)args->recvbuffs[gpu])+rank*args->sendBytes;
+    } else {
+      data = args->sendbuffs[gpu];
+    }
+    TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
+
+    // Slot `src` of the expected buffer is generated with the same
+    // 33*rep + src argument that rank `src` used for its own input.
+    for (int src = 0; src < totalRanks; ++src) {
+      TESTCHECK(InitData((char*)args->expected[gpu] + args->sendBytes*src, sendcount, 0, type, ncclSum, 33*rep + src, 1, 0));
+    }
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+  return testSuccess;
+}
+
+// Asks rcclTestsGetAlgoInfo which algorithm, protocol and channel count would
+// be used for an all-gather of `count` elements of `type`.
+// Returns testInternalError when rcclTestsGetAlgoInfo is NULL.
+testResult_t  AllGatherGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) {
+  if (rcclTestsGetAlgoInfo != NULL) {
+    NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFunc_t::ncclFuncAllGather , count, type , 0, 0, 1, algo, proto, nchannels));
+    return testSuccess;
+  }
+  return testInternalError;
+}
+
+
+// Converts a timed all-gather into bandwidth figures in GB/s (1e9 bytes/s).
+//   algBw = count * typesize * nranks bytes over `sec` seconds
+//   busBw = algBw * (nranks-1)/nranks
+void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  const double gatheredBytes = (double)(count * typesize * nranks);
+  const double bw = gatheredBytes / 1.0E9 / sec;
+  const double busScale = ((double)(nranks - 1))/((double)nranks);
+
+  *algBw = bw;
+  *busBw = bw * busScale;
+}
+
+// Launches ncclAllGather on `stream`. The op, root and bias parameters are not
+// forwarded to the call. Returns testSuccess unless NCCLCHECK bails out.
+testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
+  NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream));
+  return testSuccess;
+}
+
+// Callback table for the all-gather test; AllGatherRunTest installs it in
+// args->collTest. Entries, in order: display name, count calculator, data
+// initializer, bandwidth calculator, collective launcher, algo/proto/channel query.
+struct testColl allGatherTest = {
+  "AllGather",
+  AllGatherGetCollByteCount,
+  AllGatherInitData,
+  AllGatherGetBw,
+  AllGatherRunColl,
+  AllGatherGetAlgoProtoChannels
+};
+
+// Reports the send/recv counts for `count` and `nranks` by calling
+// AllGatherGetCollByteCount with an element size of 1; the other outputs of
+// that call are discarded.
+void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
+  AllGatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
+}
+
+// Runs the all-gather timing test for one data type, or for every entry of
+// test_types when `type` is -1. The reduction op is irrelevant here, so
+// TimeTest is always called with op 0 / "none" and root -1.
+testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &allGatherTest;
+
+  const bool sweepTypes = ((int)type == -1);
+  ncclDataType_t *run_types = sweepTypes ? test_types : &type;
+  const char **run_typenames = sweepTypes ? test_typenames : &typeName;
+  const int type_count = sweepTypes ? test_typenum : 1;
+
+  for (int t = 0; t < type_count; ++t) {
+    TESTCHECK(TimeTest(args, run_types[t], run_typenames[t], (ncclRedOp_t)0, "none", -1));
+  }
+  return testSuccess;
+}
+
+// Engine descriptor for this test binary: buffer-size callback and test runner.
+// NOTE(review): presumably consumed by the shared test driver - confirm.
+struct testEngine ncclTestEngine = {
+  AllGatherGetBuffSize,
+  AllGatherRunTest
+};
@@ -0,0 +1,111 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+#include "rccl_compat.h"
+
+// All-reduce moves `count` elements in and out on every rank: send, receive
+// and parameter counts all equal `count`, and both in-place offsets are 0.
+// eltSize and nranks do not influence the result.
+void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  *recvcount = *sendcount = count;
+  *recvInplaceOffset = *sendInplaceOffset = 0;
+  *paramcount = *sendcount;
+}
+
+// Prepares the buffers of every GPU driven by this thread for one all-reduce
+// repetition: clears the receive buffer, fills the input (the receive buffer
+// itself when in_place is set) and computes the expected reduced result.
+// Returns testSuccess, or the error produced by a failing CUDACHECK/TESTCHECK.
+testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
+  const int nranks = args->nProcs*args->nThreads*args->nGpus;
+
+  for (int gpu = 0; gpu < args->nGpus; ++gpu) {
+    CUDACHECK(cudaSetDevice(args->gpus[gpu]));
+    const int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + gpu);
+    CUDACHECK(cudaMemset(args->recvbuffs[gpu], 0, args->expectedBytes));
+
+    void* data;
+    if (in_place) {
+      data = args->recvbuffs[gpu];
+    } else {
+      data = args->sendbuffs[gpu];
+    }
+    TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
+    TESTCHECK(InitDataReduce(args->expected[gpu], recvcount, 0, type, op, rep, nranks));
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+  return testSuccess;
+}
+
+// Asks rcclTestsGetAlgoInfo which algorithm, protocol and channel count would
+// be used for an all-reduce of `count` elements of `type`.
+// Returns testInternalError when rcclTestsGetAlgoInfo is NULL.
+testResult_t  AllReduceGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) {
+  if (rcclTestsGetAlgoInfo != NULL) {
+    NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncAllReduce , count, type , 0, 0, 1, algo, proto, nchannels));
+    return testSuccess;
+  }
+  return testInternalError;
+}
+
+// Converts a timed all-reduce into bandwidth figures in GB/s (1e9 bytes/s).
+//   algBw = count * typesize bytes over `sec` seconds
+//   busBw = algBw * 2*(nranks-1)/nranks
+void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  const double payloadBytes = (double)(count * typesize);
+  const double bw = payloadBytes / 1.0E9 / sec;
+  const double busScale = ((double)(2*(nranks - 1)))/((double)nranks);
+
+  *algBw = bw;
+  *busBw = bw * busScale;
+}
+
+// Launches ncclAllReduce with reduction `op` on `stream`. The root and bias
+// parameters are not forwarded. Returns testSuccess unless NCCLCHECK bails out.
+testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
+  NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream));
+  return testSuccess;
+}
+
+// Callback table for the all-reduce test; AllReduceRunTest installs it in
+// args->collTest. Entries, in order: display name, count calculator, data
+// initializer, bandwidth calculator, collective launcher, algo/proto/channel query.
+struct testColl allReduceTest = {
+  "AllReduce",
+  AllReduceGetCollByteCount,
+  AllReduceInitData,
+  AllReduceGetBw,
+  AllReduceRunColl,
+  AllReduceGetAlgoProtoChannels
+};
+
+// Reports the send/recv counts for `count` and `nranks` by calling
+// AllReduceGetCollByteCount with an element size of 1; the other outputs of
+// that call are discarded.
+void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
+  AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
+}
+
+// Runs the all-reduce timing test over the requested data type(s) and
+// reduction op(s). A value of -1 for `type` or `op` selects every entry of
+// test_types or test_ops respectively; otherwise only the given value is run.
+testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &allReduceTest;
+
+  const bool sweepTypes = ((int)type == -1);
+  ncclDataType_t *run_types = sweepTypes ? test_types : &type;
+  const char **run_typenames = sweepTypes ? test_typenames : &typeName;
+  const int type_count = sweepTypes ? test_typenum : 1;
+
+  const bool sweepOps = ((int)op == -1);
+  ncclRedOp_t *run_ops = sweepOps ? test_ops : &op;
+  const char **run_opnames = sweepOps ? test_opnames : &opName;
+  const int op_count = sweepOps ? test_opnum : 1;
+
+  for (int t = 0; t < type_count; ++t) {
+    for (int o = 0; o < op_count; ++o) {
+#if defined(RCCL_FLOAT8)
+      // FP8 types are skipped for prod, avg and the "mulsum" op.
+      const bool isFp8 = (run_types[t] == ncclFloat8e4m3 || run_types[t] == ncclFloat8e5m2);
+      if (isFp8 && (run_ops[o] == ncclProd || run_ops[o] == ncclAvg || strcmp(run_opnames[o],"mulsum") == 0)) {
+        continue;
+      }
+#endif
+      TESTCHECK(TimeTest(args, run_types[t], run_typenames[t], run_ops[o], run_opnames[o], -1));
+    }
+  }
+  return testSuccess;
+}
+
+// Engine descriptor for this test binary: buffer-size callback and test runner.
+// NOTE(review): presumably consumed by the shared test driver - confirm.
+struct testEngine ncclTestEngine = {
+  AllReduceGetBuffSize,
+  AllReduceRunTest
+};
@@ -0,0 +1,123 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+#include <dlfcn.h>
+
+// Function-pointer type for ncclAllReduceWithBias. The symbol is looked up at
+// run time (see initOnceFunc) rather than referenced directly.
+typedef ncclResult_t (*PFN_ncclAllReduceWithBias)(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream, const void* acc);
+// Defines a global pointer pfn_<symbol> of type PFN_<symbol>, initialised to nullptr.
+#define DECLARE_RCCL_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr
+DECLARE_RCCL_PFN(ncclAllReduceWithBias);
+// Once-control used with pthread_once so the symbol lookup runs a single time.
+static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
+
+// One-time resolver for the optional ncclAllReduceWithBias entry point; run
+// through pthread_once. Leaves pfn_ncclAllReduceWithBias as nullptr when the
+// symbol cannot be found, which AllReduceRunColl reports as an error.
+static void initOnceFunc() {
+  // dlopen() requires RTLD_LAZY or RTLD_NOW in its flags; RTLD_NOLOAD alone is
+  // rejected as an invalid mode. RTLD_NOLOAD only returns a handle if the
+  // library is already resident, so no second copy is ever loaded.
+  void *librccl = dlopen("librccl.so", RTLD_NOLOAD | RTLD_LAZY);
+  // When the library is not resident under that name the handle is null and
+  // the lookup falls back to whatever dlsym does for a null handle (the
+  // default global search on glibc), matching the previous behaviour.
+  pfn_ncclAllReduceWithBias = (PFN_ncclAllReduceWithBias) dlsym(librccl, "ncclAllReduceWithBias");
+}
+
+// All-reduce (with bias) moves `count` elements in and out on every rank:
+// send, receive and parameter counts all equal `count`, and both in-place
+// offsets are 0. As a side effect the first call triggers the one-time lookup
+// of ncclAllReduceWithBias through pthread_once.
+void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  *recvcount = *sendcount = count;
+  *recvInplaceOffset = *sendInplaceOffset = 0;
+  *paramcount = *sendcount;
+  pthread_once(&initOnceControl, initOnceFunc);
+}
+
+// Prepares the buffers of every GPU driven by this thread for one biased
+// all-reduce repetition: clears the receive buffer, fills the input and the
+// bias buffer (the bias uses rep+0x12345678 where the input uses rep), then
+// builds the expected result as the reduction with the bias applied on top.
+// Returns testSuccess, or the error produced by a failing CUDACHECK/TESTCHECK.
+testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
+  const int nranks = args->nProcs*args->nThreads*args->nGpus;
+
+  for (int gpu = 0; gpu < args->nGpus; ++gpu) {
+    CUDACHECK(cudaSetDevice(args->gpus[gpu]));
+    const int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + gpu);
+    CUDACHECK(cudaMemset(args->recvbuffs[gpu], 0, args->expectedBytes));
+
+    void* data;
+    if (in_place) {
+      data = args->recvbuffs[gpu];
+    } else {
+      data = args->sendbuffs[gpu];
+    }
+    TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
+    TESTCHECK(InitData(args->bias[gpu], sendcount, 0, type, op, rep+0x12345678, nranks, rank));
+    TESTCHECK(InitDataReduce(args->expected[gpu], recvcount, 0, type, op, rep, nranks));
+    TESTCHECK(InitDataApplyBias(args->expected[gpu], args->bias[gpu], recvcount, 0, type, op));
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+  return testSuccess;
+}
+
+// Converts a timed all-reduce into bandwidth figures in GB/s (1e9 bytes/s).
+//   algBw = count * typesize bytes over `sec` seconds
+//   busBw = algBw * 2*(nranks-1)/nranks
+void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  const double payloadBytes = (double)(count * typesize);
+  const double bw = payloadBytes / 1.0E9 / sec;
+  const double busScale = ((double)(2*(nranks - 1)))/((double)nranks);
+
+  *algBw = bw;
+  *busBw = bw * busScale;
+}
+
+// Launches ncclAllReduceWithBias through the pointer resolved at run time.
+// Prints an error and returns testNcclError when the pointer is still null,
+// i.e. the symbol was not found. The root parameter is not forwarded.
+testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
+  if (pfn_ncclAllReduceWithBias != nullptr) {
+    NCCLCHECK((*pfn_ncclAllReduceWithBias)(sendbuff, recvbuff, count, type, op, comm, stream, bias));
+    return testSuccess;
+  }
+  fprintf(stderr, "[ERROR] This version of RCCL doesn't support ncclAllReduceWithBias\n");
+  return testNcclError;
+}
+
+// Callback table for the biased all-reduce test; AllReduceRunTest installs it
+// in args->collTest. Entries, in order: display name, count calculator, data
+// initializer, bandwidth calculator, collective launcher.
+// NOTE(review): sibling tests list a sixth entry (algo/proto/channel query);
+// it is omitted here, so that member is presumably left zero-initialized - confirm.
+struct testColl allReduceTest = {
+  "AllReduce",
+  AllReduceGetCollByteCount,
+  AllReduceInitData,
+  AllReduceGetBw,
+  AllReduceRunColl
+};
+
+// Reports the send/recv counts for `count` and `nranks` by calling
+// AllReduceGetCollByteCount with an element size of 1; the other outputs of
+// that call are discarded.
+void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
+  AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
+}
+
+// Runs the biased all-reduce timing test over the requested data type(s) and
+// reduction op(s). A value of -1 for `type` or `op` selects every entry of
+// test_types or test_ops respectively; otherwise only the given value is run.
+testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &allReduceTest;
+
+  const bool sweepTypes = ((int)type == -1);
+  ncclDataType_t *run_types = sweepTypes ? test_types : &type;
+  const char **run_typenames = sweepTypes ? test_typenames : &typeName;
+  const int type_count = sweepTypes ? test_typenum : 1;
+
+  const bool sweepOps = ((int)op == -1);
+  ncclRedOp_t *run_ops = sweepOps ? test_ops : &op;
+  const char **run_opnames = sweepOps ? test_opnames : &opName;
+  const int op_count = sweepOps ? test_opnum : 1;
+
+  for (int t = 0; t < type_count; ++t) {
+    for (int o = 0; o < op_count; ++o) {
+#if defined(RCCL_FLOAT8)
+      // FP8 types are skipped for prod, avg and the "mulsum" op.
+      const bool isFp8 = (run_types[t] == ncclFloat8e4m3 || run_types[t] == ncclFloat8e5m2);
+      if (isFp8 && (run_ops[o] == ncclProd || run_ops[o] == ncclAvg || strcmp(run_opnames[o],"mulsum") == 0)) {
+        continue;
+      }
+#endif
+      TESTCHECK(TimeTest(args, run_types[t], run_typenames[t], run_ops[o], run_opnames[o], -1));
+    }
+  }
+  return testSuccess;
+}
+
+// Engine descriptor for this test binary: buffer-size callback and test runner.
+// NOTE(review): presumably consumed by the shared test driver - confirm.
+struct testEngine ncclTestEngine = {
+  AllReduceGetBuffSize,
+  AllReduceRunTest
+};
@@ -0,0 +1,94 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+#include "rccl_compat.h"
+
+// Computes the element counts for an all-to-all of `count` total elements.
+//   paramcount : per-peer share, (count/nranks) masked with -(16/eltSize)
+//   sendcount  : nranks * paramcount
+//   recvcount  : same as sendcount
+// Both in-place offsets are 0.
+void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  const size_t alignMask = -(16/eltSize);
+  *paramcount = (count/nranks) & alignMask;
+  *sendcount = nranks*(*paramcount);
+  *recvcount = *sendcount;
+  *recvInplaceOffset = *sendInplaceOffset = 0;
+}
+
+// Prepares the buffers of every GPU driven by this thread for one all-to-all
+// repetition: clears the receive buffer, fills the input, and builds the
+// expected output as nranks consecutive parts of sendcount/nranks elements,
+// part j being generated with 33*rep + j at offset rank*partcount.
+// In-place all-to-all is not supported, so error reporting is switched off
+// for in-place runs.
+testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
+  const int nranks = args->nProcs*args->nThreads*args->nGpus;
+
+  for (int gpu = 0; gpu < args->nGpus; ++gpu) {
+    CUDACHECK(cudaSetDevice(args->gpus[gpu]));
+    const int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + gpu);
+    CUDACHECK(cudaMemset(args->recvbuffs[gpu], 0, args->expectedBytes));
+
+    void* data;
+    if (in_place) {
+      data = args->recvbuffs[gpu];
+    } else {
+      data = args->sendbuffs[gpu];
+    }
+    TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
+
+    for (int peer = 0; peer < nranks; ++peer) {
+      const size_t partcount = sendcount/nranks;
+      char* part = (char*)args->expected[gpu] + peer*partcount*wordSize(type);
+      TESTCHECK(InitData(part, partcount, rank*partcount, type, ncclSum, 33*rep + peer, 1, 0));
+    }
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+  // We don't support in-place alltoall
+  if (in_place) {
+    args->reportErrors = 0;
+  } else {
+    args->reportErrors = 1;
+  }
+  return testSuccess;
+}
+
+// Converts a timed all-to-all into bandwidth figures in GB/s (1e9 bytes/s).
+//   algBw = count * nranks * typesize bytes over `sec` seconds
+//   busBw = algBw * (nranks-1)/nranks
+void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  const double exchangedBytes = (double)(count * nranks * typesize);
+  const double bw = exchangedBytes / 1.0E9 / sec;
+  const double busScale = ((double)(nranks-1))/((double)(nranks));
+
+  *algBw = bw;
+  *busBw = bw * busScale;
+}
+
+// Launches ncclAllToAll on `stream`. The op, root and bias parameters are not
+// forwarded to the call. Returns testSuccess unless NCCLCHECK bails out.
+testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
+  NCCLCHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream));
+  return testSuccess;
+}
+
+// Callback table for the all-to-all test; AlltoAllRunTest installs it in
+// args->collTest. Entries, in order: display name, count calculator, data
+// initializer, bandwidth calculator, collective launcher, and NULL in the
+// slot where sibling tests place their algo/proto/channel query.
+struct testColl alltoAllTest = {
+  "AlltoAll",
+  AlltoAllGetCollByteCount,
+  AlltoAllInitData,
+  AlltoAllGetBw,
+  AlltoAllRunColl,
+  NULL
+};
+
+// Reports the send/recv counts for `count` and `nranks` by calling
+// AlltoAllGetCollByteCount with an element size of 1; the other outputs of
+// that call are discarded.
+void AlltoAllGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
+  AlltoAllGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
+}
+
+// Runs the all-to-all timing test for one data type, or for every entry of
+// test_types when `type` is -1. TimeTest is always called with op 0 / "none"
+// and root -1.
+testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &alltoAllTest;
+
+  const bool sweepTypes = ((int)type == -1);
+  ncclDataType_t *run_types = sweepTypes ? test_types : &type;
+  const char **run_typenames = sweepTypes ? test_typenames : &typeName;
+  const int type_count = sweepTypes ? test_typenum : 1;
+
+  for (int t = 0; t < type_count; ++t) {
+    TESTCHECK(TimeTest(args, run_types[t], run_typenames[t], (ncclRedOp_t)0, "none", -1));
+  }
+  return testSuccess;
+}
+
+// Engine descriptor for this test binary: buffer-size callback and test runner.
+// NOTE(review): presumably consumed by the shared test driver - confirm.
+struct testEngine ncclTestEngine = {
+  AlltoAllGetBuffSize,
+  AlltoAllRunTest
+};
@@ -0,0 +1,194 @@
+/*************************************************************************
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+#include "rccl_compat.h"
+
+#define USE_RCCL_GATHER_SCATTER
+
+// Computes the element counts for the alltoallv test.
+// When `count` is below nranks*nranks/2 every output is 0. Otherwise
+//   paramcount : (count/nranks) masked with -(16/eltSize)
+//   sendcount  : nranks * paramcount
+//   recvcount  : same as sendcount
+// Both in-place offsets are always 0.
+void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  const bool tooSmall = count < nranks*nranks/2;
+  if (tooSmall) {
+    *paramcount = 0;
+  } else {
+    *paramcount = (count/nranks) & -(16/eltSize);
+  }
+  // A zero paramcount makes both derived counts zero as well.
+  *sendcount = nranks*(*paramcount);
+  *recvcount = *sendcount;
+  *sendInplaceOffset = 0;
+  *recvInplaceOffset = 0;
+}
+
+// Prepares the buffers of every GPU driven by this thread for one alltoallv
+// repetition: clears the receive buffer, fills the input, and builds the
+// expected output segment by segment (one variable-length segment per sender).
+// The count/displacement arithmetic below has to mirror what AlltoAllvRunColl
+// sends, so the two must be changed together.
+// In-place alltoallv is not supported; error reporting is disabled for
+// in-place runs. Returns testSuccess, or the error from a failing
+// CUDACHECK/TESTCHECK.
+testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
+  int nranks = args->nProcs*args->nThreads*args->nGpus;
+
+  for (int i=0; i<args->nGpus; i++) {
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
+    TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep+rank, 1, 0));
+
+    // Disabled debug aid: dumps the freshly initialised send data to stdout.
+#if 0
+    int *dataHost = (int *)malloc(args->sendBytes);
+    cudaMemcpy(dataHost, data, args->sendBytes, cudaMemcpyDeviceToHost);
+    printf(" Rank [%d] Original: ", rank);
+    for(int j=0; j<sendcount; j++) {
+	    printf("%d:%d ", j, dataHost[j]);
+    }
+    printf("\n");
+    free(dataHost);
+#endif
+
+    // rdisp walks through the expected buffer, one segment per sender j.
+    size_t rdisp = 0;
+    size_t data_count = sendcount*2/nranks;
+    size_t chunksize = data_count/nranks;
+    for (int j=0; j<nranks; j++) {
+      // Elements this rank receives from sender j: ((j+rank)%nranks) chunks,
+      // plus the remainder term when that multiplier is 0.
+      size_t scount = 0, rcount = ((j+rank)%nranks)*chunksize;
+      if ((j+rank)%nranks == 0)
+        rcount += (sendcount-chunksize*(nranks-1)*nranks/2);
+      // sdisp = sum of what sender j sends to ranks 0..rank-1, i.e. the offset
+      // of this rank's portion inside sender j's data.
+      size_t sdisp = 0;
+      for (int k=0; k<nranks; k++) {
+        scount = ((k+j)%nranks)*chunksize;
+        if ((k+j)%nranks == 0)
+          scount += (sendcount-chunksize*(nranks-1)*nranks/2);
+        if (k == rank)
+          break;
+        sdisp += scount;
+      }
+      TESTCHECK(InitData(((char*)args->expected[i])+rdisp*wordSize(type), rcount, sdisp, type, ncclSum, 33*rep+j, 1, 0));
+      rdisp += rcount;
+    }
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+  // We don't support in-place alltoall
+  args->reportErrors = in_place ? 0 : 1;
+  return testSuccess;
+}
+
+// Converts a timed alltoallv into bandwidth figures in GB/s (1e9 bytes/s).
+//   algBw = count * nranks * typesize bytes over `sec` seconds
+//   busBw = algBw * (nranks-1)/nranks
+void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  const double exchangedBytes = (double)(count * nranks * typesize);
+  const double bw = exchangedBytes / 1.0E9 / sec;
+  const double busScale = ((double)(nranks-1))/((double)(nranks));
+
+  *algBw = bw;
+  *busBw = bw * busScale;
+}
+
+// Issues the actual communication for one rank once its per-peer counts and
+// displacements (nranks entries each, in elements) are known. Kept separate
+// from AlltoAllvRunColl so that the early returns hidden inside NCCLCHECK
+// cannot skip the caller's cleanup of those arrays.
+static testResult_t AlltoAllvExchange(void* sendbuff, void* recvbuff, size_t* sendcounts, size_t* sdispls, size_t* recvcounts, size_t* rdispls, ncclDataType_t type, int nranks, ncclComm_t comm, cudaStream_t stream) {
+  // Compare major first: a plain "NCCL_MINOR < 7" test would wrongly reject
+  // any release with a major above 2 and a minor below 7.
+#if NCCL_MAJOR < 2 || (NCCL_MAJOR == 2 && NCCL_MINOR < 7)
+  printf("NCCL 2.7 or later is needed for alltoallv. This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR);
+  return testNcclError;
+#else
+#if defined(RCCL_ALLTOALLV) && defined(USE_RCCL_GATHER_SCATTER)
+  NCCLCHECK(ncclAllToAllv(sendbuff, sendcounts, sdispls, recvbuff, recvcounts, rdispls, type, comm, stream));
+#else
+  // Fallback: one grouped send/recv pair per peer, skipping empty transfers.
+  NCCLCHECK(ncclGroupStart());
+  for (int r=0; r<nranks; r++) {
+    if (sendcounts[r] != 0) {
+      NCCLCHECK(ncclSend(
+          ((char*)sendbuff) + sdispls[r] * wordSize(type),
+          sendcounts[r],
+          type,
+          r,
+          comm,
+          stream));
+    }
+    if (recvcounts[r] != 0) {
+      NCCLCHECK(ncclRecv(
+          ((char*)recvbuff) + rdispls[r] * wordSize(type),
+          recvcounts[r],
+          type,
+          r,
+          comm,
+          stream));
+    }
+  }
+  NCCLCHECK(ncclGroupEnd());
+#endif
+  return testSuccess;
+#endif
+}
+
+// Runs one alltoallv with an uneven distribution: this rank exchanges
+// ((peer+rank)%nranks)*chunksize elements with each peer, where chunksize is
+// count*2/nranks, and the peer whose multiplier is 0 additionally gets the
+// remainder so that the counts sum to count*nranks.
+// Returns testSuccess immediately when count is 0, testNcclError when the
+// bookkeeping arrays cannot be allocated, and otherwise the result of the
+// exchange. The op, root and bias parameters are not used.
+testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
+  int nranks;
+  NCCLCHECK(ncclCommCount(comm, &nranks));
+  int rank;
+  NCCLCHECK(ncclCommUserRank(comm, &rank));
+
+  if (count == 0) return testSuccess;
+
+  // Only this rank's row of counts/displacements is ever used, so nranks
+  // entries per array are sufficient.
+  size_t *sendcounts = (size_t *)malloc(nranks*sizeof(size_t));
+  size_t *recvcounts = (size_t *)malloc(nranks*sizeof(size_t));
+  size_t *sdispls = (size_t *)malloc(nranks*sizeof(size_t));
+  size_t *rdispls = (size_t *)malloc(nranks*sizeof(size_t));
+
+  testResult_t result;
+  if (sendcounts == nullptr || recvcounts == nullptr || sdispls == nullptr || rdispls == nullptr) {
+    printf("failed to allocate buffers for alltoallv\n");
+    result = testNcclError;
+  } else {
+    size_t disp = 0;
+    size_t chunksize = count*2/nranks;
+    for (int i = 0; i < nranks; i++) {
+      size_t scount = ((i+rank)%nranks)*chunksize;
+      if ((i+rank)%nranks == 0)
+        scount += (count*nranks-chunksize*(nranks-1)*nranks/2);
+      sendcounts[i] = recvcounts[i] = scount;
+      sdispls[i] = rdispls[i] = disp;
+      disp += scount;
+      //printf("%d->%d: sendcounts/recvcounts %lx sdispls/rdispls %lx\n", rank, i, sendcounts[i]*wordSize(type), sdispls[i]*wordSize(type));
+    }
+    result = AlltoAllvExchange(sendbuff, recvbuff, sendcounts, sdispls, recvcounts, rdispls, type, nranks, comm, stream);
+  }
+
+  // Single cleanup point for every outcome; free(nullptr) is a no-op, so a
+  // partially failed allocation is handled as well.
+  free(sendcounts);
+  free(recvcounts);
+  free(sdispls);
+  free(rdispls);
+  return result;
+}
+
+// Callback table for the alltoallv test; AlltoAllvRunTest installs it in
+// args->collTest. Entries, in order: display name, count calculator, data
+// initializer, bandwidth calculator, collective launcher, and NULL in the
+// slot where sibling tests place their algo/proto/channel query.
+struct testColl alltoAllTest = {
+  "AlltoAllv",
+  AlltoAllvGetCollByteCount,
+  AlltoAllvInitData,
+  AlltoAllvGetBw,
+  AlltoAllvRunColl,
+  NULL
+};
+
+// Reports the send/recv counts for `count` and `nranks` by calling
+// AlltoAllvGetCollByteCount with an element size of 1; the other outputs of
+// that call are discarded.
+void AlltoAllvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
+  AlltoAllvGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
+}
+
+// Runs the alltoallv timing test for one data type, or for every entry of
+// test_types when `type` is -1. TimeTest is always called with op 0 / "none"
+// and root -1.
+testResult_t AlltoAllvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &alltoAllTest;
+  ncclDataType_t *run_types;
+  const char **run_typenames;
+  int type_count;
+
+  if ((int)type != -1) {
+    type_count = 1;
+    run_types = &type;
+    run_typenames = &typeName;
+  } else {
+    // Bound the sweep by test_typenum, as every other collective test does.
+    // ncclNumTypes is the size of the library's type enum and is not
+    // guaranteed to equal the number of entries in test_types/test_typenames,
+    // so using it here could index past the end of those tables.
+    type_count = test_typenum;
+    run_types = test_types;
+    run_typenames = test_typenames;
+  }
+
+  for (int i=0; i<type_count; i++) {
+    TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", -1));
+  }
+  return testSuccess;
+}
+
+// Engine descriptor for this test binary: buffer-size callback and test runner.
+// NOTE(review): presumably consumed by the shared test driver - confirm.
+struct testEngine ncclTestEngine = {
+  AlltoAllvGetBuffSize,
+  AlltoAllvRunTest
+};
@@ -0,0 +1,114 @@
+/*************************************************************************
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+#include "rccl_compat.h"
+
+// Broadcast moves `count` elements on every rank: send, receive and parameter
+// counts all equal `count`, and both in-place offsets are 0. eltSize and
+// nranks do not influence the result.
+void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  *recvcount = *sendcount = count;
+  *recvInplaceOffset = *sendInplaceOffset = 0;
+  *paramcount = *sendcount;
+}
+
+// Prepares the buffers of every GPU driven by this thread for one broadcast
+// repetition: clears the receive buffer, fills the input on the root rank
+// only, and fills the expected buffer on every rank with the same arguments
+// the root used for its input.
+// Returns testSuccess, or the error produced by a failing CUDACHECK/TESTCHECK.
+testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
+
+  for (int gpu = 0; gpu < args->nGpus; ++gpu) {
+    CUDACHECK(cudaSetDevice(args->gpus[gpu]));
+    const int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + gpu);
+    CUDACHECK(cudaMemset(args->recvbuffs[gpu], 0, args->expectedBytes));
+
+    void* data;
+    if (in_place) {
+      data = args->recvbuffs[gpu];
+    } else {
+      data = args->sendbuffs[gpu];
+    }
+    if (rank == root) {
+      TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
+    }
+    TESTCHECK(InitData(args->expected[gpu], recvcount, 0, type, ncclSum, rep, 1, 0));
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+  return testSuccess;
+}
+
+// Asks rcclTestsGetAlgoInfo which algorithm, protocol and channel count would
+// be used for a broadcast of `count` elements of `type`.
+// Returns testInternalError when rcclTestsGetAlgoInfo is NULL.
+testResult_t  BroadcastGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) {
+  if (rcclTestsGetAlgoInfo != NULL) {
+    NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncBroadcast , count, type , 0, 0, 1, algo, proto, nchannels));
+    return testSuccess;
+  }
+  return testInternalError;
+}
+
+// Converts a timed broadcast into bandwidth figures in GB/s (1e9 bytes/s).
+//   algBw = count * typesize bytes over `sec` seconds
+//   busBw = algBw (scale factor 1); nranks does not influence the result.
+void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  const double payloadBytes = (double)(count * typesize);
+  const double bw = payloadBytes / 1.0E9 / sec;
+  const double busScale = 1;
+
+  *algBw = bw;
+  *busBw = bw * busScale;
+}
+
+// Launches a broadcast from `root` on `stream`.
+// With NCCL 2.2 or newer the out-of-place ncclBroadcast API is used. Older
+// versions only offer the in-place ncclBcast, which is given the send buffer
+// on the root and the receive buffer on every other rank.
+// The op and bias parameters are not used. Returns testSuccess unless
+// NCCLCHECK bails out.
+testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
+  int rank;
+  NCCLCHECK(ncclCommUserRank(comm, &rank));
+  // Compare major first: "MAJOR >= 2 && MINOR >= 2" would fall back to the
+  // legacy path for any X.0 / X.1 release with X > 2.
+#if NCCL_MAJOR > 2 || (NCCL_MAJOR == 2 && NCCL_MINOR >= 2)
+  NCCLCHECK(ncclBroadcast(sendbuff, recvbuff, count, type, root, comm, stream));
+#else
+  if (rank == root) {
+      NCCLCHECK(ncclBcast(sendbuff, count, type, root, comm, stream));
+  } else {
+      NCCLCHECK(ncclBcast(recvbuff, count, type, root, comm, stream));
+  }
+#endif
+  return testSuccess;
+}
+
+// Callback table for the broadcast test; BroadcastRunTest installs it in
+// args->collTest. Entries, in order: display name, count calculator, data
+// initializer, bandwidth calculator, collective launcher, algo/proto/channel query.
+struct testColl broadcastTest = {
+  "Broadcast",
+  BroadcastGetCollByteCount,
+  BroadcastInitData,
+  BroadcastGetBw,
+  BroadcastRunColl,
+  BroadcastGetAlgoProtoChannels
+};
+
+// Reports the send/recv counts for `count` and `nranks` by calling
+// BroadcastGetCollByteCount with an element size of 1; the other outputs of
+// that call are discarded.
+void BroadcastGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
+  BroadcastGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
+}
+
+// Runs the broadcast timing test over the requested data type(s) and root(s).
+// A `type` of -1 selects every entry of test_types; a `root` of -1 selects
+// every rank from 0 to nProcs*nThreads*nGpus-1. TimeTest is always called
+// with op 0 / "none".
+testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &broadcastTest;
+
+  const bool sweepTypes = ((int)type == -1);
+  ncclDataType_t *run_types = sweepTypes ? test_types : &type;
+  const char **run_typenames = sweepTypes ? test_typenames : &typeName;
+  const int type_count = sweepTypes ? test_typenum : 1;
+
+  int begin_root = root;
+  int end_root = root;
+  if (root == -1) {
+    begin_root = 0;
+    end_root = args->nProcs*args->nThreads*args->nGpus-1;
+  }
+
+  for (int t = 0; t < type_count; ++t) {
+    for (int r = begin_root; r <= end_root; ++r) {
+      TESTCHECK(TimeTest(args, run_types[t], run_typenames[t], (ncclRedOp_t)0, "none", r));
+    }
+  }
+  return testSuccess;
+}
+
+// Engine descriptor for this test binary: buffer-size callback and test runner.
+// NOTE(review): presumably consumed by the shared test driver - confirm.
+struct testEngine ncclTestEngine = {
+  BroadcastGetBuffSize,
+  BroadcastRunTest
+};
@@ -0,0 +1,62 @@
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Attempt to collect the latest git hash
+# execute_process() runs the command without a shell, so the single quotes
+# around %h are passed to git literally and come back as part of the output
+# ('<hash>'). The SUBSTRING call further down relies on this.
+execute_process(COMMAND git log --pretty=format:'%h' -n 1
+                OUTPUT_VARIABLE GIT_REV
+                ERROR_QUIET)
+
+# Check if git information was found
+if ("${GIT_REV}" STREQUAL "")
+  # No git output: fall back to a fixed placeholder string.
+  set(CURR_GIT_VERSION "const char *rcclTestsGitHash =\"Unknown \";")
+else()
+  # Check for changes (denote with a '+') after hash
+  execute_process(
+    COMMAND bash -c "git diff --quiet --exit-code || echo +"
+    OUTPUT_VARIABLE GIT_DIFF)
+  # Collect branch information
+  execute_process(
+    COMMAND git rev-parse --abbrev-ref HEAD
+    OUTPUT_VARIABLE GIT_BRANCH)
+
+  string(STRIP "${GIT_REV}" GIT_REV)
+  # Skip the leading literal quote (index 0) and keep the next 7 characters.
+  string(SUBSTRING "${GIT_REV}" 1 7 GIT_REV)
+  string(STRIP "${GIT_DIFF}" GIT_DIFF)
+  string(STRIP "${GIT_BRANCH}" GIT_BRANCH)
+
+  # Resulting text: const char *rcclTestsGitHash ="<branch>:<hash><+ if dirty>";
+  set(CURR_GIT_VERSION "const char *rcclTestsGitHash =\"${GIT_BRANCH}:${GIT_REV}${GIT_DIFF}\";")
+endif()
+
+# Compare file with older git version file (git_version.cpp)
+# The file is only rewritten when its content would change.
+# NOTE(review): presumably to keep its timestamp stable and avoid needless
+# rebuilds - confirm.
+if (EXISTS ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp)
+  #MESSAGE(STATUS "Found ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp")
+  file(READ ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp PREV_GIT_VERSION)
+  #message(STATUS "CURR GIT version: ${CURR_GIT_VERSION}")
+  #message(STATUS "PREV GIT version: ${PREV_GIT_VERSION}")
+  if (NOT "${CURR_GIT_VERSION}" STREQUAL "${PREV_GIT_VERSION}")
+    message(STATUS "Updating git_version.cpp")
+    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp "${CURR_GIT_VERSION}")
+  else()
+    message(STATUS "No changes to git_version.cpp required")
+  endif()
+else()
+  # Create git_version.cpp if it doesn't exist yet
+  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp "${CURR_GIT_VERSION}")
+endif()
@@ -0,0 +1,400 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef __COMMON_H__
+#define __COMMON_H__
+#include "rccl/rccl.h"
+#include <stdio.h>
+#include <cstdint>
+#include <cstring>
+#include <algorithm>
+#ifdef MPI_SUPPORT
+#include "mpi.h"
+#endif
+#include <pthread.h>
+#include "nccl1_compat.h"
+#include "timer.h"
+#include <string>
+#include <fstream>
+#include <iostream>
+#include <utility>
+#include <vector>
+
+// Ensures backward compatibility for FP8 datatypes
+#if NCCL_VERSION_CODE < NCCL_VERSION(2,24,3)
+  #define ncclFloat8e4m3 ncclFp8E4M3
+  #define ncclFloat8e5m2 ncclFp8E5M2
+#endif
+
+// For nccl.h < 2.13 since we define a weak fallback
+extern "C" char const* ncclGetLastError(ncclComm_t comm);
+
+// Evaluates a CUDA/HIP runtime call once; on any result other than cudaSuccess
+// it prints the short host name, source location and error string, then
+// returns testCudaError from the *enclosing* function. It can therefore only be
+// used inside functions whose return type accepts a testResult_t.
+#define CUDACHECK(cmd) do {                         \
+  cudaError_t err = cmd;                            \
+  if( err != cudaSuccess ) {                        \
+    char hostname[1024];                            \
+    getHostName(hostname, 1024);                    \
+    printf("%s: Test CUDA failure %s:%d '%s'\n",    \
+         hostname,                                  \
+        __FILE__,__LINE__,cudaGetErrorString(err)); \
+    return testCudaError;                           \
+  }                                                 \
+} while(0)
+
+// Evaluates an NCCL/RCCL call once; on any result other than ncclSuccess it
+// prints the short host name, source location and error string, then returns
+// testNcclError from the *enclosing* function.
+// From version 2.13.0 on, the message additionally includes the text returned
+// by ncclGetLastError(NULL).
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,13,0)
+#define NCCLCHECK(cmd) do {                         \
+  ncclResult_t res = cmd;                           \
+  if (res != ncclSuccess) {                         \
+    char hostname[1024];                            \
+    getHostName(hostname, 1024);                    \
+    printf("%s: Test NCCL failure %s:%d "           \
+           "'%s / %s'\n",                           \
+           hostname,__FILE__,__LINE__,              \
+           ncclGetErrorString(res),                 \
+           ncclGetLastError(NULL));                 \
+    return testNcclError;                           \
+  }                                                 \
+} while(0)
+#else
+#define NCCLCHECK(cmd) do {                         \
+  ncclResult_t res = cmd;                           \
+  if (res != ncclSuccess) {                         \
+    char hostname[1024];                            \
+    getHostName(hostname, 1024);                    \
+    printf("%s: Test NCCL failure %s:%d '%s'\n",    \
+         hostname,                                  \
+        __FILE__,__LINE__,ncclGetErrorString(res)); \
+    return testNcclError;                           \
+  }                                                 \
+} while(0)
+#endif
+
+// Status code returned by the test harness functions. CUDACHECK, NCCLCHECK and
+// TESTCHECK return testCudaError, testNcclError and the propagated code
+// respectively; testNumResults is the number of distinct codes, not a status.
+typedef enum {
+  testSuccess = 0,
+  testInternalError = 1,
+  testCudaError = 2,
+  testNcclError = 3,
+  testTimeout = 4,
+  testNumResults = 5
+} testResult_t;
+
+// Relay errors up and trace
+// Evaluates a call returning testResult_t once; on failure it prints the short
+// host name, process id and source location, then returns the same error code
+// from the enclosing function, producing one trace line per call level.
+#define TESTCHECK(cmd) do {                         \
+  testResult_t r = cmd;                             \
+  if (r!= testSuccess) {                            \
+    char hostname[1024];                            \
+    getHostName(hostname, 1024);                    \
+    printf(" .. %s pid %d: Test failure %s:%d\n",   \
+         hostname, getpid(),                        \
+        __FILE__,__LINE__);                         \
+    return r;                                       \
+  }                                                 \
+} while(0)
+
+// Table of callbacks describing one collective under test. Each test source
+// file defines an instance and fills the members positionally, in the order
+// they are declared here.
+struct testColl {
+  // Display name of the collective (at most 19 characters plus terminator)
+  const char name[20];
+  // Derives element counts and in-place offsets from a total count, the
+  // element size and the number of ranks
+  void (*getCollByteCount)(
+      size_t *sendcount, size_t *recvcount, size_t *paramcount,
+      size_t *sendInplaceOffset, size_t *recvInplaceOffset,
+      size_t count, size_t eltSize, int nranks);
+  // Fills the send/receive/expected buffers referenced by args before a run
+  testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type,
+      ncclRedOp_t op, int root, int rep, int in_place);
+  // Converts an element count and elapsed seconds into algorithm and bus bandwidth
+  void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
+  // Enqueues the collective on the given communicator and stream
+  testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias);
+  // Queries algorithm/protocol/channel selection; left NULL by tests that do
+  // not provide it
+  testResult_t (*getAlgoProtoChannels)(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels);
+
+};
+extern struct testColl allReduceTest;
+extern struct testColl allGatherTest;
+extern struct testColl reduceScatterTest;
+extern struct testColl broadcastTest;
+extern struct testColl reduceTest;
+extern struct testColl alltoAllTest;
+
+// Collects per-measurement results and writes them to an output file.
+// The constructor, setParameters, addResult, writeFile and isMainThread are
+// defined outside this header.
+class Reporter {
+  public:
+    Reporter(std::string fileName, std::string outputFormat);
+    // Closes the output stream, but only if it was marked valid
+    ~Reporter() { if (_outputValid) { _out.close(); } };
+    void setParameters(const size_t numCycle, const char* name, const char* typeName, const char* opName);
+    // wrongElts defaults to -1 when the caller passes no value for it
+    void addResult(int gpusPerRank, int ranksPerNode, int totalRanks, size_t numBytes, int inPlace, double timeUsec, double algBw, double busBw, int64_t wrongElts = -1);
+    void writeFile();
+
+  private:
+    bool isMainThread();
+    // Builds a (value-as-text, key) pair; the value is converted with std::to_string
+    template<typename T> std::pair<std::string, std::string> makeValueKeyPair(T v, std::string k) { return std::make_pair(std::to_string(v), k); };
+    // String values are wrapped in double quotes instead of being converted.
+    // NOTE(review): this is an explicit specialization at class scope, which
+    // not every compiler accepts - confirm against the supported toolchains.
+    template <> std::pair<std::string, std::string> makeValueKeyPair<std::string>(std::string v, std::string k) { return std::make_pair("\"" + v + "\"", k); };
+
+    bool _outputValid = false;
+    std::ofstream _out;
+    std::string _outputFormat;
+    size_t _numCycle = 0;
+    std::string _collectiveName;
+    std::string _typeName;
+    std::string _opName;
+    // One inner vector per recorded result, holding (value, key) pairs
+    std::vector<std::vector<std::pair<std::string, std::string>>> _outputData;
+};
+
+// Entry points a test source file exposes to the shared harness. Each test
+// file defines the single global instance ncclTestEngine.
+struct testEngine {
+  // Computes send/receive sizes for a given total count and number of ranks
+  void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks);
+  // Runs the test for the requested root, data type and reduction op
+  testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type,
+      const char* typeName, ncclRedOp_t op, const char* opName);
+};
+
+extern struct testEngine ncclTestEngine;
+
+// Per-thread test state handed to every callback. The test files derive the
+// global rank of local GPU i as (proc*nThreads + thread)*nGpus + i and the
+// total rank count as nProcs*nThreads*nGpus. Per-GPU arrays (gpus, sendbuffs,
+// recvbuffs, expected) are indexed by the local GPU index i in [0, nGpus).
+struct threadArgs {
+  size_t nbytes;
+  size_t minbytes;
+  size_t maxbytes;
+  size_t stepbytes;
+  size_t stepfactor;
+
+  int totalProcs;
+  int nProcs;
+  int proc;
+  int nThreads;
+  int thread;
+  int nGpus;
+  int* gpus;
+  int localRank;
+  int enable_out_of_place;
+  int enable_in_place;
+  int enable_cache_flush;
+  int enable_rotating_tensor;
+  void** sendbuffs;
+  size_t sendBytes;
+  size_t sendInplaceOffset;
+  void** recvbuffs;
+  size_t recvInplaceOffset;
+  ncclUniqueId ncclId;
+  ncclComm_t* comms;
+  cudaStream_t* streams;
+  void** bias;
+
+  void** expected;
+  size_t expectedBytes;
+  int* errors;
+  double* bw;
+  int* bw_count;
+
+  int reportErrors;
+
+  // Callback table of the collective being run; set by each test's RunTest
+  struct testColl* collTest;
+
+  Reporter* reporter;
+};
+
+// Signature of a function run by a test thread
+typedef testResult_t (*threadFunc_t)(struct threadArgs* args);
+// Bundles a pthread handle with the function it runs, that function's
+// arguments (stored by value) and a slot for its result code
+struct testThread {
+  pthread_t thread;
+  threadFunc_t func;
+  struct threadArgs args;
+  testResult_t ret;
+};
+
+// Provided by common.cu
+extern void Barrier(struct threadArgs* args);
+extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op,  const char* opName, int root);
+extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks);
+extern testResult_t InitDataApplyBias(void* expected, void* bias, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op);
+extern testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks, const int rank);
+extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks, void **bias);
+
+#include <unistd.h>
+
+// Writes the short host name (everything before the first '.') into hostname.
+//   hostname: destination buffer of at least maxlen bytes
+//   maxlen:   size of the destination buffer in bytes
+// The result is always NUL-terminated; it is the empty string when the host
+// name cannot be obtained. Nothing is written when hostname is NULL or
+// maxlen is not positive.
+static void getHostName(char* hostname, int maxlen) {
+  if (hostname == NULL || maxlen <= 0) return;
+
+  // Start from an empty string so a failed lookup cannot expose stale bytes
+  hostname[0] = '\0';
+  (void) gethostname(hostname, maxlen);
+  // gethostname() may leave a truncated name without a terminator
+  hostname[maxlen-1] = '\0';
+
+  // Cut at the first '.', never scanning past the end of the string
+  for (int i=0; i< maxlen && hostname[i] != '\0'; i++) {
+    if (hostname[i] == '.') {
+      hostname[i] = '\0';
+      return;
+    }
+  }
+}
+
+#include <stdint.h>
+
+// Hashes the first n bytes of string with the DJB2a scheme:
+// start at 5381, then for every byte: hash = (hash * 33) XOR byte.
+static uint64_t getHash(const char* string, size_t n) {
+  uint64_t hash = 5381;
+  const char* const end = string + n;
+  for (const char* cursor = string; cursor != end; ++cursor) {
+    hash = (hash * 33) ^ *cursor;
+  }
+  return hash;
+}
+
+/* Generate a hash of the unique identifying string for this host
+ * that will be unique for both bare-metal and container instances
+ * Equivalent to hashing the following string:
+ *
+ * $(hostname)$(cat /proc/sys/kernel/random/boot_id)
+ *
+ */
+#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
+// Returns a hash identifying this host instance: the given host name with the
+// first whitespace-delimited token of HOSTID_FILE appended, hashed by getHash.
+// If HOSTID_FILE cannot be opened or read, only the host name is hashed.
+// Input longer than the 1023-character working buffer is truncated.
+static uint64_t getHostHash(const char* hostname) {
+  char hostHash[1024];
+
+  // The fallback is the host name alone if anything below fails.
+  // Copy at most size-1 bytes and terminate explicitly: strncpy() does not
+  // add a terminator when the source fills the whole destination.
+  (void) strncpy(hostHash, hostname, sizeof(hostHash)-1);
+  hostHash[sizeof(hostHash)-1] = '\0';
+  size_t offset = strlen(hostHash);
+
+  FILE *file = fopen(HOSTID_FILE, "r");
+  if (file != NULL) {
+    char *p = NULL;
+    // "%ms" makes fscanf allocate the token buffer; it must be freed here
+    if (fscanf(file, "%ms", &p) == 1) {
+        strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1);
+        free(p);
+    }
+    // Close only a stream that was actually opened; fclose(NULL) is invalid
+    fclose(file);
+  }
+
+  // Make sure the string is terminated
+  hostHash[sizeof(hostHash)-1]='\0';
+
+  return getHash(hostHash, strlen(hostHash));
+}
+
+// Feature switches for the optional data types. HAVE_BF16 / HAVE_FP8 are
+// always defined (to 0 or 1) after this section.
+// First path: enabled when the library headers set RCCL_BFLOAT16 / RCCL_FLOAT8 to 1.
+#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
+#define HAVE_BF16 1
+#else
+#define HAVE_BF16 0
+#endif
+#if NCCL_MAJOR >= 2 && RCCL_FLOAT8 == 1
+#define HAVE_FP8 1
+#else
+#define HAVE_FP8 0
+#endif
+
+// Second path: the CUDA type macros together with a new enough NCCL version
+// also enable the types. The FP8 check is nested, so it only applies when the
+// BF16 condition holds as well.
+#if NCCL_MAJOR >= 2
+  #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
+    #undef HAVE_BF16
+    #define HAVE_BF16 1
+    #if defined(__CUDA_FP8_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,24,0)
+      #undef HAVE_FP8
+      #define HAVE_FP8 1
+    #endif
+  #endif
+#endif
+
+// Returns the size in bytes of one element of the given data type, or 0 for a
+// type that is not listed. Callers divide byte counts by this value, so an
+// unlisted type leads to a division by zero on their side.
+// The commented-out labels are presumably aliases of labels already listed
+// (duplicate case values would not compile) - confirm against the library header.
+static size_t wordSize(ncclDataType_t type) {
+  switch(type) {
+    case ncclChar:
+#if NCCL_MAJOR >= 2
+    //case ncclInt8:
+    case ncclUint8:
+#if HAVE_FP8
+    case ncclFloat8e4m3:
+    case ncclFloat8e5m2:
+#endif
+#endif
+      return 1;
+    case ncclHalf:
+#if HAVE_BF16
+    case ncclBfloat16:
+#endif
+    //case ncclFloat16:
+      return 2;
+    case ncclInt:
+    case ncclFloat:
+#if NCCL_MAJOR >= 2
+    //case ncclInt32:
+    case ncclUint32:
+    //case ncclFloat32:
+#endif
+      return 4;
+    case ncclInt64:
+    case ncclUint64:
+    case ncclDouble:
+    //case ncclFloat64:
+      return 8;
+    default: return 0;
+  }
+}
+
+// Shared lookup tables and counters; all of them are defined outside this header.
+extern int test_ncclVersion; // init'd with ncclGetVersion()
+// Memory types selectable by name; values index test_memorytypes
+typedef enum { ncclCoarse        = 0,
+               ncclFine          = 1,
+               ncclHost          = 2,
+               ncclManaged       = 3,
+               nccl_NUM_MTYPES   = 4 } ncclMemoryType_t;
+extern const char *test_memorytypes[nccl_NUM_MTYPES];
+// Upper bound on the number of ops: ncclNumOps, plus one extra slot from version 2.11.0 on
+constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 1 : 0);
+// Number of valid entries in test_ops / test_opnames and test_types / test_typenames
+extern int test_opnum;
+extern int test_typenum;
+extern ncclDataType_t test_types[ncclNumTypes];
+extern const char *test_typenames[ncclNumTypes];
+extern ncclRedOp_t test_ops[];
+extern const char *test_opnames[];
+
+// Maps a type name to its index in test_typenames.
+// Returns -1 for "all"; for an unknown name it prints a notice and returns ncclFloat.
+static int ncclstringtotype(char *str) {
+    int idx = 0;
+    while (idx < test_typenum) {
+      if (!strcmp(str, test_typenames[idx])) return idx;
+      ++idx;
+    }
+    // Known names take precedence over the "all" keyword
+    if (!strcmp(str, "all")) return -1;
+
+    printf("invalid type %s, defaulting to %s .. \n", str, test_typenames[ncclFloat]);
+    return ncclFloat;
+}
+
+// Maps a reduction-op name to its index in test_opnames.
+// Returns -1 for "all"; for an unknown name it prints a notice and returns ncclSum.
+static int ncclstringtoop (char *str) {
+    int idx = 0;
+    while (idx < test_opnum) {
+      if (!strcmp(str, test_opnames[idx])) return idx;
+      ++idx;
+    }
+    // Known names take precedence over the "all" keyword
+    if (!strcmp(str, "all")) return -1;
+
+    printf("invalid op %s, defaulting to %s .. \n", str, test_opnames[ncclSum]);
+    return ncclSum;
+}
+
+// Parses a root argument: "all" yields -1, anything else is converted with
+// strtol using base auto-detection (decimal, 0x hex, leading-0 octal).
+static int ncclstringtoroot (char *str) {
+    if (strcmp(str, "all") != 0) {
+      return strtol(str, NULL, 0);
+    }
+    return -1;
+}
+
+// Maps a memory-type name to its index in test_memorytypes.
+// For an unknown name it prints a notice and returns ncclCoarse.
+static int ncclstringtomtype (char *str) {
+    int idx = 0;
+    while (idx < nccl_NUM_MTYPES) {
+      if (!strcmp(str, test_memorytypes[idx])) return idx;
+      ++idx;
+    }
+    printf("invalid memorytype %s, defaulting to %s .. \n", str, test_memorytypes[ncclCoarse]);
+    return ncclCoarse;
+}
+
+extern int is_main_proc;
+extern thread_local int is_main_thread;
+// printf that only produces output when is_main_thread is non-zero.
+// NOTE(review): expands to a bare `if`, so a PRINT(...) placed directly before
+// an `else` would capture that `else` - keep it inside braces in such places.
+#define PRINT if (is_main_thread) printf
+
+// Collective identifiers passed as the `coll` argument of rcclTestsGetAlgoInfo_t.
+// NOTE(review): presumably has to stay in sync with the numbering used inside
+// the library - confirm before adding or reordering entries.
+typedef enum {
+  ncclFuncBroadcast = 0,
+  ncclFuncReduce = 1,
+  ncclFuncAllGather = 2,
+  ncclFuncReduceScatter = 3,
+  ncclFuncAllReduce = 4,
+  ncclFuncAllReduceWithBias = 5,
+  ncclFuncSendRecv = 6,
+  ncclFuncSend = 7,
+  ncclFuncRecv = 8,
+  ncclFuncAllToAllPivot = 9,
+  ncclNumFuncs = 10
+} ncclFunc_t;
+
+// Function-pointer types for the hooks declared in rccl_compat.h.
+// rcclTestsGetAlgoInfo_t reports algorithm, protocol and channel count for a
+// collective of the given size and type through its three output pointers.
+typedef ncclResult_t (*rcclTestsGetAlgoInfo_t)(struct ncclComm* comm, ncclFunc_t coll, uint64_t count, ncclDataType_t dataType,
+                                          int collNetSupport, int nvlsSupport, int numPipeOps,
+                                          int* algo, int* protocol, int* maxChannels);
+// Translate an algorithm / protocol id into a name returned through the output pointer
+typedef ncclResult_t (*rcclTestsGetAlgoName_t)(int algo, const char** algoName);
+typedef ncclResult_t (*rcclTestsGetProtocolName_t)(int protocol, const char** protocolName);
+
+#endif
@@ -0,0 +1,80 @@
+#
+# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+# Toolchain locations and build switches. Every ?= default can be overridden
+# from the environment or on the make command line.
+CUDA_HOME ?= /usr/local/cuda
+PREFIX ?= /usr/local
+VERBOSE ?= 0
+DEBUG ?= 0
+
+CUDA_LIB ?= $(CUDA_HOME)/lib64
+CUDA_INC ?= $(CUDA_HOME)/include
+NVCC ?= $(CUDA_HOME)/bin/nvcc
+CUDARTLIB ?= cudart
+
+# Take the version from the "release X.Y," line of `nvcc --version`.
+# When nvcc cannot be found, CUDA_VERSION, CUDA_MAJOR and CUDA_MINOR are empty.
+CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
+CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
+CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
+
+# Better define NVCC_GENCODE in your environment to the minimal set
+# of archs to reduce compile time.
+# The version tests prefix the number with "0" so that an empty CUDA_MAJOR or
+# CUDA_MINOR (nvcc not found) still compares as a number and falls through to
+# the last branch.
+ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
+# Add Blackwell but drop Pascal & Volta support if we're using CUDA13.0 or above
+NVCC_GENCODE ?= -gencode=arch=compute_75,code=sm_75 \
+		-gencode=arch=compute_80,code=sm_80 \
+		-gencode=arch=compute_90,code=sm_90 \
+		-gencode=arch=compute_100,code=sm_100 \
+		-gencode=arch=compute_120,code=sm_120 \
+		-gencode=arch=compute_120,code=compute_120
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8; echo $$?),0)
+# Include Blackwell support if we're using CUDA12.8 or above
+NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
+		-gencode=arch=compute_61,code=sm_61 \
+		-gencode=arch=compute_70,code=sm_70 \
+		-gencode=arch=compute_80,code=sm_80 \
+		-gencode=arch=compute_90,code=sm_90 \
+		-gencode=arch=compute_100,code=sm_100 \
+		-gencode=arch=compute_120,code=sm_120 \
+		-gencode=arch=compute_120,code=compute_120
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 12; echo $$?),0)
+# Remaining CUDA 12.x releases: up to sm_90, with compute_90 PTX
+NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
+                -gencode=arch=compute_61,code=sm_61 \
+                -gencode=arch=compute_70,code=sm_70 \
+		-gencode=arch=compute_80,code=sm_80 \
+		-gencode=arch=compute_90,code=sm_90 \
+		-gencode=arch=compute_90,code=compute_90
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
+# CUDA 11.x: up to sm_80, with compute_80 PTX
+NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
+                -gencode=arch=compute_61,code=sm_61 \
+                -gencode=arch=compute_70,code=sm_70 \
+		-gencode=arch=compute_80,code=sm_80 \
+		-gencode=arch=compute_80,code=compute_80
+else
+# Anything older, or an undetected version: sm_35 through sm_70
+NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
+                -gencode=arch=compute_50,code=sm_50 \
+                -gencode=arch=compute_60,code=sm_60 \
+                -gencode=arch=compute_61,code=sm_61 \
+                -gencode=arch=compute_70,code=sm_70 \
+                -gencode=arch=compute_70,code=compute_70
+endif
+
+# Compiler flags for device (NVCUFLAGS) and host (CXXFLAGS) code
+NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11
+CXXFLAGS   := -std=c++11
+
+# NOTE(review): LDFLAGS always links "cudart" while NVLDFLAGS honours
+# CUDARTLIB - confirm whether that difference is intended.
+LDFLAGS    := -L${CUDA_LIB} -lcudart -lrt
+NVLDFLAGS  := -L${CUDA_LIB} -l${CUDARTLIB} -lrt
+
+# DEBUG=0 (default) builds optimized with debug symbols; any other value
+# disables optimization and enables device debug information (-G)
+ifeq ($(DEBUG), 0)
+NVCUFLAGS += -O3 -g
+CXXFLAGS  += -O3 -g
+else
+NVCUFLAGS += -O0 -G -g
+CXXFLAGS  += -O0 -g -ggdb3
+endif
+
+# VERBOSE!=0 enables host compiler warnings; otherwise make does not echo
+# the commands it runs
+ifneq ($(VERBOSE), 0)
+NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
+else
+.SILENT:
+endif
@@ -0,0 +1,117 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+#include "rccl_compat.h"
+
+// Derives the gather sizes from a total element count.
+// Each rank sends count/nranks elements, masked down with -(16/eltSize) (a
+// multiple of 16 bytes' worth of elements for power-of-two element sizes);
+// the receive side holds that amount once per rank. The in-place send offset
+// equals the per-rank count and the in-place receive offset is 0.
+void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  const size_t elemsPer16B = 16/eltSize;
+  const size_t perRank = (count/nranks) & -elemsPer16B;
+
+  *sendcount = perRank;
+  *recvcount = perRank*nranks;
+  *sendInplaceOffset = perRank;
+  *recvInplaceOffset = 0;
+  *paramcount = perRank;
+}
+
+// Prepares the send, receive and expected buffers of every GPU owned by this
+// thread for one gather run rooted at `root`. `rep` is forwarded to InitData
+// as its seed argument. Returns the first error encountered, else testSuccess.
+testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);  // NOTE(review): not used below
+  int nranks = args->nProcs*args->nThreads*args->nGpus;
+
+  for (int i=0; i<args->nGpus; i++) {
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
+    // Global rank of local GPU i
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    // In place, this rank's input lives inside the receive buffer at its own slot
+    void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
+    TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
+    // Start from the current receive buffer contents as the expected result
+    CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
+    // Only the root's expected buffer is filled with all nranks*sendcount elements
+    if (rank == root) {
+      TESTCHECK(InitData(args->expected[i], nranks*sendcount, 0, type, ncclSum, rep, 1, 0));
+    }
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+  return testSuccess;
+}
+
+// Computes gather bandwidth in GB/s.
+// algBw is the volume of count*nranks*typesize bytes over the elapsed time;
+// busBw scales it by (nranks-1)/nranks.
+void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  const double gigabytes = (double)(count * nranks * typesize) / 1.0E9;
+  const double throughput = gigabytes / sec;
+
+  *algBw = throughput;
+  *busBw = throughput * (((double)(nranks-1))/((double)(nranks)));
+}
+
+// Enqueues a gather built from point-to-point calls inside one NCCL group:
+// every rank sends `count` elements to `root`, and the root posts one receive
+// per rank into consecutive count-element slots of recvbuff.
+// Does nothing (after querying the communicator) when count is 0.
+// `op` and `bias` are not used by this collective.
+testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
+  int commSize;
+  NCCLCHECK(ncclCommCount(comm, &commSize));
+  int myRank;
+  NCCLCHECK(ncclCommUserRank(comm, &myRank));
+  if (count == 0) return testSuccess;
+
+  const size_t bytesPerRank = count * wordSize(type);
+  char* const recvBase = (char*)recvbuff;
+
+  NCCLCHECK(ncclGroupStart());
+  NCCLCHECK(ncclSend(sendbuff, count, type, root, comm, stream));
+  // Only the root receives, one slot per peer (including itself)
+  for (int peer=0; myRank == root && peer<commSize; peer++) {
+    NCCLCHECK(ncclRecv(recvBase+peer*bytesPerRank, count, type, peer, comm, stream));
+  }
+  NCCLCHECK(ncclGroupEnd());
+
+  return testSuccess;
+}
+
+// Callback table for the gather test; members are given in testColl
+// declaration order. Gather provides no algorithm/protocol query.
+struct testColl gatherTest = {
+  "Gather",
+  GatherGetCollByteCount,
+  GatherInitData,
+  GatherGetBw,
+  GatherRunColl,
+  NULL
+};
+
+// Reports the send/receive sizes for `count` bytes by evaluating
+// GatherGetCollByteCount with an element size of 1; the remaining outputs
+// of that call are discarded.
+void GatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t unusedParamCount;
+  size_t unusedSendOffset;
+  size_t unusedRecvOffset;
+  GatherGetCollByteCount(sendcount, recvcount, &unusedParamCount,
+                         &unusedSendOffset, &unusedRecvOffset,
+                         count, /*eltSize=*/1, nranks);
+}
+
+// Runs the gather timing test for the requested data type(s) and root(s).
+// A type of -1 sweeps all entries of test_types; a root of -1 sweeps every
+// rank. `op` and `opName` are ignored: TimeTest is always called with op 0
+// and the name "none".
+testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &gatherTest;
+
+  const bool sweepTypes = ((int)type == -1);
+  const bool sweepRoots = (root == -1);
+
+  const int type_count = sweepTypes ? test_typenum : 1;
+  ncclDataType_t *run_types = sweepTypes ? test_types : &type;
+  const char **run_typenames = sweepTypes ? test_typenames : &typeName;
+
+  const int begin_root = sweepRoots ? 0 : root;
+  const int end_root = sweepRoots ? args->nProcs*args->nThreads*args->nGpus-1 : root;
+
+  for (int t=0; t<type_count; t++) {
+    for (int r=begin_root; r<=end_root; r++) {
+      TESTCHECK(TimeTest(args, run_types[t], run_typenames[t], (ncclRedOp_t)0, "none", r));
+    }
+  }
+  return testSuccess;
+}
+
+// Entry points the shared harness uses for the gather test
+struct testEngine ncclTestEngine = {
+  GatherGetBuffSize,
+  GatherRunTest
+};
@@ -0,0 +1,6 @@
+#ifndef RCCL_TESTS_GIT_VERSION_H_
+#define RCCL_TESTS_GIT_VERSION_H_
+
+// Revision string of this build. The definition is generated into
+// git_version.cpp by the CMake git-version script.
+extern const char *rcclTestsGitHash;
+
+#endif
@@ -0,0 +1,116 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+
+#define ALIGN 4
+
+// Derives the hypercube all-gather sizes from a total element count.
+// Each rank contributes count/nranks elements, masked down with -(16/eltSize);
+// the receive side holds that amount once per rank. The in-place send offset
+// equals the per-rank count and the in-place receive offset is 0.
+void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  const size_t elemsPer16B = 16/eltSize;
+  const size_t perRank = (count/nranks) & -elemsPer16B;
+
+  *sendcount = perRank;
+  *recvcount = perRank*nranks;
+  *sendInplaceOffset = perRank;
+  *recvInplaceOffset = 0;
+  *paramcount = perRank;
+}
+
+// Prepares the send, receive and expected buffers of every GPU owned by this
+// thread for one hypercube all-gather run. Each rank's data is generated with
+// seed 33*rep + rank, and the expected buffer is the concatenation of all
+// ranks' data in rank order. `op` and `root` are not used.
+testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);  // NOTE(review): not used below
+  int nranks = args->nProcs*args->nThreads*args->nGpus;
+
+  for (int i=0; i<args->nGpus; i++) {
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
+    // Global rank of local GPU i
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    // In place, this rank's input lives inside the receive buffer at its own slot
+    void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
+    TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
+    // Slot j of the expected buffer gets the data rank j generates for itself
+    for (int j=0; j<nranks; j++) {
+      TESTCHECK(InitData((char*)args->expected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
+    }
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+  return testSuccess;
+}
+
+// Computes hypercube all-gather bandwidth in GB/s.
+// The volume is count*typesize*(nranks-1) bytes over the elapsed time; bus
+// bandwidth uses a factor of 1, so it equals the algorithm bandwidth.
+void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  const double gigabytes = (double)(count * typesize * (nranks - 1)) / 1.0E9;
+  const double throughput = gigabytes / sec;
+  const double busFactor = 1;
+
+  *algBw = throughput;
+  *busBw = throughput * busFactor;
+}
+
+// Enqueues an all-gather implemented as a hypercube exchange of send/receive
+// pairs. recvbuff is treated as nRanks consecutive slots of `count` elements.
+// HyperCubeRunTest only runs this for power-of-two rank counts.
+// `op`, `root` and `bias` are not used.
+testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
+  char* sbuff = (char*)sendbuff;
+  char* rbuff = (char*)recvbuff;
+  int nRanks;
+  NCCLCHECK(ncclCommCount(comm, &nRanks));
+  int rank;
+  NCCLCHECK(ncclCommUserRank(comm, &rank));
+  size_t rankSize = count * wordSize(type);
+  // Out of place: first copy this rank's contribution into its own slot
+  if (rbuff+rank*rankSize != sbuff) CUDACHECK(cudaMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, cudaMemcpyDeviceToDevice, stream));
+
+  // Hypercube AllGather
+  // At each step the rank holds `mask` consecutive slots starting at slot s,
+  // sends them to partner rank^mask and receives the partner's `mask` slots
+  // at slot r, doubling the amount of data held.
+  for (int mask=1; mask<nRanks; mask<<=1) {
+    NCCLCHECK(ncclGroupStart());
+    int s = rank & ~(mask-1);
+    int r = s ^ mask;
+    NCCLCHECK(ncclSend(rbuff+s*rankSize, count*mask, type, rank^mask, comm, stream));
+    NCCLCHECK(ncclRecv(rbuff+r*rankSize, count*mask, type, rank^mask, comm, stream));
+    NCCLCHECK(ncclGroupEnd());
+  }
+  return testSuccess;
+}
+
+// Callback table for the hypercube test; members are given in testColl
+// declaration order. No algorithm/protocol query is provided, so the last
+// member is spelled out as NULL.
+struct testColl hyperCubeTest = {
+  "HyperCube",
+  HyperCubeGetCollByteCount,
+  HyperCubeInitData,
+  HyperCubeGetBw,
+  HyperCubeRunColl,
+  NULL
+};
+
+// Reports the send/receive sizes for `count` bytes by evaluating
+// HyperCubeGetCollByteCount with an element size of 1; the remaining outputs
+// of that call are discarded.
+void HyperCubeGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t unusedParamCount;
+  size_t unusedSendOffset;
+  size_t unusedRecvOffset;
+  HyperCubeGetCollByteCount(sendcount, recvcount, &unusedParamCount,
+                            &unusedSendOffset, &unusedRecvOffset,
+                            count, /*eltSize=*/1, nranks);
+}
+
+// Runs the hypercube timing test for the requested data type(s).
+// A type of -1 sweeps all entries of test_types. The test is skipped (with a
+// message, still returning testSuccess) unless the total rank count is a
+// power of two. `root`, `op` and `opName` are ignored: TimeTest is always
+// called with op 0, the name "none" and root -1.
+testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &hyperCubeTest;
+
+  const bool sweepTypes = ((int)type == -1);
+  const int type_count = sweepTypes ? test_typenum : 1;
+  ncclDataType_t *run_types = sweepTypes ? test_types : &type;
+  const char **run_typenames = sweepTypes ? test_typenames : &typeName;
+
+  // A non-zero value with a single bit set is a power of 2
+  const int nRanks = args->nProcs*args->nThreads*args->nGpus;
+  const bool isPowerOfTwo = nRanks && !(nRanks & (nRanks - 1));
+  if (!isPowerOfTwo) {
+    printf("nRanks %d is not a power of 2, skipping\n", nRanks);
+    return testSuccess;
+  }
+
+  for (int t=0; t<type_count; t++) {
+    TESTCHECK(TimeTest(args, run_types[t], run_typenames[t], (ncclRedOp_t)0, "none", -1));
+  }
+  return testSuccess;
+}
+
+// Entry points the shared harness uses for the hypercube test
+struct testEngine ncclTestEngine = {
+  HyperCubeGetBuffSize,
+  HyperCubeRunTest
+};
@@ -0,0 +1,50 @@
+/*************************************************************************
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL1_COMPAT_H
+#define NCCL1_COMPAT_H
+
+#ifndef NCCL_MAJOR // NCCL 1.x
+#define NCCL_MAJOR 1
+#define NCCL_MINOR 0
+
+#define ncclNumOps nccl_NUM_OPS
+#define ncclNumTypes nccl_NUM_TYPES
+
+static ncclResult_t ncclGroupStart() { return ncclSuccess; }
+static ncclResult_t ncclGroupEnd() { return ncclSuccess; }
+
+#define CHECKCOUNT(count) if (count > INT_MAX) return ncclInvalidArgument;
+
+// size_t-count overloads that forward to the NCCL 1.x entry points, which take
+// the count as an int. Each one first rejects counts above INT_MAX via
+// CHECKCOUNT and then casts the count down.
+static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
+    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+  CHECKCOUNT(count);
+  return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream);
+}
+static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
+  CHECKCOUNT(count);
+  return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream);
+}
+static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream) {
+  CHECKCOUNT(count);
+  return ncclBcast(buff, (int)count, datatype, root, comm, stream);
+}
+static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
+    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+    cudaStream_t stream) {
+  CHECKCOUNT(recvcount);
+  return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream);
+}
+// Note the different argument order of the forwarded call: count and datatype
+// come before recvbuff.
+static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
+  CHECKCOUNT(sendcount);
+  return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream);
+}
+#endif
+
+#endif
@@ -0,0 +1,30 @@
+/* ************************************************************************
+ * Copyright (C) 2016-2025 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+ * ies of the Software, and to permit persons to whom the Software is furnished
+ * to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+ * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+ * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ************************************************************************ */
+
+#ifndef RCCL_COMPAT_H
+#define RCCL_COMPAT_H
+
+// Hook pointers used to query algorithm/protocol information. Callers check
+// rcclTestsGetAlgoInfo against NULL before using it.
+// The pointer types are declared in common.h, which has to be included before
+// this header because nothing is included here.
+extern rcclTestsGetAlgoInfo_t rcclTestsGetAlgoInfo;
+extern rcclTestsGetProtocolName_t rcclTestsGetProtocolName;
+extern rcclTestsGetAlgoName_t rcclTestsGetAlgoName;
+
+#endif
@@ -0,0 +1,122 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+#include "rccl_compat.h"
+
+// Derives the reduce sizes from a total element count: send, receive and
+// parameter counts all equal `count`, and both in-place offsets are 0.
+// `eltSize` and `nranks` do not influence the result.
+void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  const size_t noOffset = 0;
+
+  *sendcount = count;
+  *recvcount = count;
+  *sendInplaceOffset = noOffset;
+  *recvInplaceOffset = noOffset;
+  *paramcount = count;
+}
+
+// Prepares the send, receive and expected buffers of every GPU owned by this
+// thread for one reduce run rooted at `root`. `rep` is forwarded to the data
+// generators as their seed argument. Returns the first error, else testSuccess.
+testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  size_t sendcount = args->sendBytes / wordSize(type);
+  size_t recvcount = args->expectedBytes / wordSize(type);
+  int nranks = args->nProcs*args->nThreads*args->nGpus;
+
+  for (int i=0; i<args->nGpus; i++) {
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
+    // Global rank of local GPU i
+    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    // In place, the input is generated directly in the receive buffer
+    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
+    TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
+    // Start from the current receive buffer contents as the expected result
+    CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
+    // Only the root's expected buffer is overwritten with the reduced values
+    if (rank == root) TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+  return testSuccess;
+}
+
+// Queries which algorithm, protocol and channel count would be used for a
+// reduce of `count` elements of `type`, through the rcclTestsGetAlgoInfo hook.
+// Returns testInternalError when the hook pointer is NULL.
+testResult_t ReduceGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) {
+  if (rcclTestsGetAlgoInfo != NULL) {
+    // Fixed arguments: collNetSupport=0, nvlsSupport=0, numPipeOps=1
+    NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncReduce, count, type, 0, 0, 1, algo, proto, nchannels));
+    return testSuccess;
+  }
+  return testInternalError;
+}
+
+
+// Computes reduce bandwidth in GB/s: count*typesize bytes over the elapsed
+// time. Algorithm and bus bandwidth are identical; `nranks` is not used.
+void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  const double gigabytes = (double)(count * typesize) / 1.0E9;
+  const double throughput = gigabytes / sec;
+
+  *algBw = throughput;
+  *busBw = throughput;
+}
+
+// Enqueues one ncclReduce of `count` elements towards `root` on the given
+// communicator and stream. `bias` is not used by this collective.
+testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
+  NCCLCHECK(ncclReduce(sendbuff, recvbuff, count,
+                       type, op, root,
+                       comm, stream));
+  return testSuccess;
+}
+
+// Callback table for the reduce test; members are given in testColl
+// declaration order.
+struct testColl reduceTest = {
+  "Reduce",
+  ReduceGetCollByteCount,
+  ReduceInitData,
+  ReduceGetBw,
+  ReduceRunColl,
+  ReduceGetAlgoProtoChannels
+};
+
+// Reports the send/receive sizes for `count` bytes by evaluating
+// ReduceGetCollByteCount with an element size of 1; the remaining outputs
+// of that call are discarded.
+void ReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t unusedParamCount;
+  size_t unusedSendOffset;
+  size_t unusedRecvOffset;
+  ReduceGetCollByteCount(sendcount, recvcount, &unusedParamCount,
+                         &unusedSendOffset, &unusedRecvOffset,
+                         count, /*eltSize=*/1, nranks);
+}
+
+// Runs the reduce timing test for the requested data type(s), reduction
+// op(s) and root(s). A value of -1 for type, op or root sweeps all entries of
+// test_types, all entries of test_ops, or every rank respectively.
+testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &reduceTest;
+
+  const bool sweepTypes = ((int)type == -1);
+  const bool sweepOps = ((int)op == -1);
+  const bool sweepRoots = (root == -1);
+
+  const int type_count = sweepTypes ? test_typenum : 1;
+  ncclDataType_t *run_types = sweepTypes ? test_types : &type;
+  const char **run_typenames = sweepTypes ? test_typenames : &typeName;
+
+  const int op_count = sweepOps ? test_opnum : 1;
+  ncclRedOp_t *run_ops = sweepOps ? test_ops : &op;
+  const char **run_opnames = sweepOps ? test_opnames : &opName;
+
+  const int begin_root = sweepRoots ? 0 : root;
+  const int end_root = sweepRoots ? args->nProcs*args->nThreads*args->nGpus-1 : root;
+
+  for (int t=0; t<type_count; t++) {
+    for (int o=0; o<op_count; o++) {
+#if defined(RCCL_FLOAT8)
+      // FP8 types are not run with the prod, avg or "mulsum" reductions
+      const bool isFp8 = (run_types[t] == ncclFloat8e4m3 || run_types[t] == ncclFloat8e5m2);
+      if (isFp8 && (run_ops[o] == ncclProd || run_ops[o] == ncclAvg || strcmp(run_opnames[o],"mulsum") == 0)) {
+        continue;
+      }
+#endif
+      for (int r=begin_root; r<=end_root; r++) {
+        TESTCHECK(TimeTest(args, run_types[t], run_typenames[t], run_ops[o], run_opnames[o], r));
+      }
+    }
+  }
+  return testSuccess;
+}
+
+// Entry points the shared harness uses for the reduce test
+struct testEngine ncclTestEngine = {
+  ReduceGetBuffSize,
+  ReduceRunTest
+};
@@ -0,0 +1,115 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+#include "rccl_compat.h"
+
+// Derives the ReduceScatter counts from a total element count.
+// The per-rank count is count/nranks rounded down to a multiple of
+// 16/eltSize elements; the send count is that value times nranks, and the
+// in-place receive offset equals the per-rank count.
+void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  const size_t alignElems = 16/eltSize;
+  // ~(x - 1) is the same unsigned value as -x, i.e. the round-down mask.
+  const size_t perRank = (count/nranks) & ~(alignElems - 1);
+
+  *sendcount = perRank*nranks;
+  *recvcount = perRank;
+  *sendInplaceOffset = 0;
+  *recvInplaceOffset = perRank;
+  *paramcount = perRank;
+}
+
+// Prepares input and expected buffers for one ReduceScatter run on every GPU
+// handled by this thread. For each GPU: select the device, zero the receive
+// buffer, fill the input (receive buffer when in_place, send buffer otherwise)
+// via InitData, copy the receive buffer into the expected buffer, overwrite the
+// expected result via InitDataReduce at offset rank*recvcount, then synchronize.
+testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  const size_t sendElems = args->sendBytes / wordSize(type);
+  const size_t recvElems = args->expectedBytes / wordSize(type);
+  const int totalRanks = args->nProcs*args->nThreads*args->nGpus;
+  // Global rank of this thread's first GPU.
+  const int firstRank = (args->proc*args->nThreads + args->thread)*args->nGpus;
+
+  for (int gpu = 0; gpu < args->nGpus; gpu++) {
+    const int rank = firstRank + gpu;
+    CUDACHECK(cudaSetDevice(args->gpus[gpu]));
+    CUDACHECK(cudaMemset(args->recvbuffs[gpu], 0, args->expectedBytes));
+
+    void* input;
+    if (in_place) {
+      input = args->recvbuffs[gpu];
+    } else {
+      input = args->sendbuffs[gpu];
+    }
+    TESTCHECK(InitData(input, sendElems, 0, type, op, rep, totalRanks, rank));
+
+    CUDACHECK(cudaMemcpy(args->expected[gpu], args->recvbuffs[gpu], args->expectedBytes, cudaMemcpyDefault));
+    TESTCHECK(InitDataReduce(args->expected[gpu], recvElems, rank*recvElems, type, op, rep, totalRanks));
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+  return testSuccess;
+}
+
+// Queries algorithm, protocol and channel count for a ReduceScatter of the
+// given count and type through rcclTestsGetAlgoInfo. Returns testInternalError
+// when rcclTestsGetAlgoInfo is NULL.
+testResult_t ReduceScatterGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) {
+  if (!rcclTestsGetAlgoInfo) {
+    return testInternalError;
+  }
+  NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncReduceScatter, count, type, 0, 0, 1, algo, proto, nchannels));
+  return testSuccess;
+}
+
+
+// Computes ReduceScatter bandwidths from the per-rank count and elapsed time.
+// algBw = count*typesize*nranks / 1e9 / sec; busBw = algBw * (nranks-1)/nranks.
+void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  // The product is formed in integer arithmetic before conversion to double.
+  const size_t totalBytes = count * typesize * nranks;
+  const double algorithmBw = (double)totalBytes / 1.0E9 / sec;
+  const double busFactor = ((double)(nranks - 1))/((double)nranks);
+
+  *algBw = algorithmBw;
+  *busBw = algorithmBw * busFactor;
+}
+
+// Issues one ncclReduceScatter on the given stream. The root and bias
+// arguments are not used by this collective.
+testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
+  (void)root;
+  (void)bias;
+  NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream));
+  return testSuccess;
+}
+
+// Descriptor for the ReduceScatter collective: its display name followed by
+// the callbacks for byte counts, data initialisation, bandwidth computation,
+// running the collective and querying algo/proto/channels.
+struct testColl reduceScatterTest = {
+  "ReduceScatter",
+  ReduceScatterGetCollByteCount,
+  ReduceScatterInitData,
+  ReduceScatterGetBw,
+  ReduceScatterRunColl,
+  ReduceScatterGetAlgoProtoChannels
+};
+
+// Fills *sendcount and *recvcount for the given count and rank number by
+// delegating to ReduceScatterGetCollByteCount with an element size of 1; the
+// remaining outputs of that call are discarded.
+void ReduceScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t ignoredParamCount;
+  size_t ignoredSendOffset;
+  size_t ignoredRecvOffset;
+  ReduceScatterGetCollByteCount(sendcount, recvcount,
+                                &ignoredParamCount, &ignoredSendOffset, &ignoredRecvOffset,
+                                count, /*eltSize=*/1, nranks);
+}
+
+// Runs the ReduceScatter test over the selected data types and reduction ops.
+// A type or op equal to -1 (after conversion to int) selects every entry of
+// test_types / test_ops. TimeTest is called once per (type, op) pair with a
+// root of -1; the root argument of this function is not used.
+testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &reduceScatterTest;
+
+  // Default to the full type table, narrowed to the caller's type if given.
+  ncclDataType_t* typeList = test_types;
+  const char** typeNameList = test_typenames;
+  int numTypes = test_typenum;
+  if ((int)type != -1) {
+    typeList = &type;
+    typeNameList = &typeName;
+    numTypes = 1;
+  }
+
+  // Same selection for the reduction operations.
+  ncclRedOp_t* opList = test_ops;
+  const char** opNameList = test_opnames;
+  int numOps = test_opnum;
+  if ((int)op != -1) {
+    opList = &op;
+    opNameList = &opName;
+    numOps = 1;
+  }
+
+  for (int t = 0; t < numTypes; t++) {
+    for (int o = 0; o < numOps; o++) {
+#if defined(RCCL_FLOAT8)
+      // FP8 types are not run with prod, avg or the op named "mulsum".
+      const bool isFp8 = (typeList[t] == ncclFloat8e4m3 || typeList[t] == ncclFloat8e5m2);
+      if (isFp8 && (opList[o] == ncclProd || opList[o] == ncclAvg || strcmp(opNameList[o], "mulsum") == 0)) {
+        continue;
+      }
+#endif
+      TESTCHECK(TimeTest(args, typeList[t], typeNameList[t], opList[o], opNameList[o], -1));
+    }
+  }
+  return testSuccess;
+}
+
+// Test engine definition for this translation unit: the buffer-size callback
+// and the test-driver callback of the ReduceScatter test.
+struct testEngine ncclTestEngine = {
+  ReduceScatterGetBuffSize,
+  ReduceScatterRunTest
+};
@@ -0,0 +1,113 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+#include "rccl_compat.h"
+
+// Derives the Scatter counts from a total element count.
+// The per-rank receive count is count/nranks rounded down to a multiple of
+// 16/eltSize elements; the send count is that value times nranks, and the
+// in-place receive offset equals the per-rank count.
+void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  const size_t alignElems = 16/eltSize;
+  // ~(x - 1) is the same unsigned value as -x, i.e. the round-down mask.
+  const size_t perRank = (count/nranks) & ~(alignElems - 1);
+
+  *recvcount = perRank;
+  *sendcount = perRank*nranks;
+  *sendInplaceOffset = 0;
+  *recvInplaceOffset = perRank;
+  *paramcount = perRank;
+}
+
+// Prepares input and expected buffers for one Scatter run on every GPU handled
+// by this thread. For each GPU: select the device, zero the receive buffer,
+// fill the input buffer only on the root rank (receive buffer when in_place,
+// send buffer otherwise), fill the expected buffer via InitData at offset
+// rank*recvcount, then synchronize. The op argument is not used; ncclSum is
+// passed to InitData.
+testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  const size_t sendElems = args->sendBytes / wordSize(type);
+  const size_t recvElems = args->expectedBytes / wordSize(type);
+  // Global rank of this thread's first GPU.
+  const int firstRank = (args->proc*args->nThreads + args->thread)*args->nGpus;
+
+  for (int gpu = 0; gpu < args->nGpus; gpu++) {
+    const int rank = firstRank + gpu;
+    CUDACHECK(cudaSetDevice(args->gpus[gpu]));
+    CUDACHECK(cudaMemset(args->recvbuffs[gpu], 0, args->expectedBytes));
+
+    void* input;
+    if (in_place) {
+      input = args->recvbuffs[gpu];
+    } else {
+      input = args->sendbuffs[gpu];
+    }
+    if (rank == root) {
+      TESTCHECK(InitData(input, sendElems, 0, type, ncclSum, rep, 1, 0));
+    }
+
+    TESTCHECK(InitData(args->expected[gpu], recvElems, rank*recvElems, type, ncclSum, rep, 1, 0));
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+  return testSuccess;
+}
+
+// Computes Scatter bandwidths from the per-rank count and elapsed time.
+// algBw = count*nranks*typesize / 1e9 / sec; busBw = algBw * (nranks-1)/nranks.
+void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  // The product is formed in integer arithmetic before conversion to double.
+  const size_t totalBytes = count * nranks * typesize;
+  const double algorithmBw = (double)totalBytes / 1.0E9 / sec;
+  const double busFactor = ((double)(nranks-1))/((double)(nranks));
+
+  *algBw = algorithmBw;
+  *busBw = algorithmBw * busFactor;
+}
+
+// Implements Scatter with grouped point-to-point calls: inside one
+// ncclGroupStart/ncclGroupEnd pair the root sends the r-th chunk of count
+// elements from sendbuff to every rank r, and every rank receives count
+// elements from root into recvbuff. Returns immediately (after querying the
+// communicator) when count is 0. The op and bias arguments are not used.
+testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
+  int commSize;
+  int myRank;
+  NCCLCHECK(ncclCommCount(comm, &commSize));
+  NCCLCHECK(ncclCommUserRank(comm, &myRank));
+
+  // Distance in bytes between the chunks destined for consecutive ranks.
+  const size_t chunkBytes = count * wordSize(type);
+  if (count == 0) {
+    return testSuccess;
+  }
+
+  NCCLCHECK(ncclGroupStart());
+  if (myRank == root) {
+    char* sendBase = (char*)sendbuff;
+    for (int peer = 0; peer < commSize; peer++) {
+      NCCLCHECK(ncclSend(sendBase + peer*chunkBytes, count, type, peer, comm, stream));
+    }
+  }
+  NCCLCHECK(ncclRecv(recvbuff, count, type, root, comm, stream));
+  NCCLCHECK(ncclGroupEnd());
+
+  return testSuccess;
+}
+
+// Descriptor for the Scatter collective: its display name followed by the
+// callbacks for byte counts, data initialisation, bandwidth computation and
+// running the collective. The last slot (filled by a *GetAlgoProtoChannels
+// callback in reduceScatterTest) is left NULL here.
+struct testColl scatterTest = {
+  "Scatter",
+  ScatterGetCollByteCount,
+  ScatterInitData,
+  ScatterGetBw,
+  ScatterRunColl,
+  NULL
+};
+
+// Fills *sendcount and *recvcount for the given count and rank number by
+// delegating to ScatterGetCollByteCount with an element size of 1; the
+// remaining outputs of that call are discarded.
+void ScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t ignoredParamCount;
+  size_t ignoredSendOffset;
+  size_t ignoredRecvOffset;
+  ScatterGetCollByteCount(sendcount, recvcount,
+                          &ignoredParamCount, &ignoredSendOffset, &ignoredRecvOffset,
+                          count, /*eltSize=*/1, nranks);
+}
+
+// Runs the Scatter test over the selected data types and roots.
+// A type equal to -1 (after conversion to int) selects every entry of
+// test_types; root == -1 sweeps roots 0 .. nProcs*nThreads*nGpus-1.
+// The op and opName arguments are not used: TimeTest always receives
+// (ncclRedOp_t)0 and the name "none".
+testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &scatterTest;
+
+  // Default to the full type table, narrowed to the caller's type if given.
+  ncclDataType_t* typeList = test_types;
+  const char** typeNameList = test_typenames;
+  int numTypes = test_typenum;
+  if ((int)type != -1) {
+    typeList = &type;
+    typeNameList = &typeName;
+    numTypes = 1;
+  }
+
+  // Inclusive range of roots to test.
+  int firstRoot = 0;
+  int lastRoot = args->nProcs*args->nThreads*args->nGpus - 1;
+  if (root != -1) {
+    firstRoot = root;
+    lastRoot = root;
+  }
+
+  for (int t = 0; t < numTypes; t++) {
+    for (int r = firstRoot; r <= lastRoot; r++) {
+      TESTCHECK(TimeTest(args, typeList[t], typeNameList[t], (ncclRedOp_t)0, "none", r));
+    }
+  }
+  return testSuccess;
+}
+
+// Test engine definition for this translation unit: the buffer-size callback
+// and the test-driver callback of the Scatter test.
+struct testEngine ncclTestEngine = {
+  ScatterGetBuffSize,
+  ScatterRunTest
+};
@@ -0,0 +1,115 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "common.h"
+#include "rccl_compat.h"
+
+// SendRecv moves count elements in each direction: send, receive and
+// parameter counts all equal count, and both in-place offsets are 0.
+// eltSize and nranks do not affect the result.
+void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
+  (void)eltSize;
+  (void)nranks;
+
+  *sendcount = count;
+  *recvcount = count;
+  *sendInplaceOffset = 0;
+  *recvInplaceOffset = 0;
+  *paramcount = count;
+}
+
+// Prepares input and expected buffers for one SendRecv run on every GPU
+// handled by this thread. Each rank's input is generated at offset
+// rank*sendcount; its expected output is the data of the previous rank
+// (rank-1, wrapping around), generated at offset peer*recvcount.
+// Error reporting is switched off for in-place runs.
+testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
+  const size_t sendElems = args->sendBytes / wordSize(type);
+  const size_t recvElems = args->expectedBytes / wordSize(type);
+  const int totalRanks = args->nProcs*args->nThreads*args->nGpus;
+  // Global rank of this thread's first GPU.
+  const int firstRank = (args->proc*args->nThreads + args->thread)*args->nGpus;
+
+  for (int gpu = 0; gpu < args->nGpus; gpu++) {
+    const int rank = firstRank + gpu;
+    CUDACHECK(cudaSetDevice(args->gpus[gpu]));
+    CUDACHECK(cudaMemset(args->recvbuffs[gpu], 0, args->expectedBytes));
+
+    void* input;
+    if (in_place) {
+      input = args->recvbuffs[gpu];
+    } else {
+      input = args->sendbuffs[gpu];
+    }
+    TESTCHECK(InitData(input, sendElems, rank*sendElems, type, ncclSum, rep, 1, 0));
+
+    // The rank this one receives from.
+    const int sourceRank = (rank-1+totalRanks)%totalRanks;
+    TESTCHECK(InitData(args->expected[gpu], recvElems, sourceRank*recvElems, type, ncclSum, rep, 1, 0));
+    CUDACHECK(cudaDeviceSynchronize());
+  }
+
+  // We don't support in-place sendrecv
+  if (in_place) {
+    args->reportErrors = 0;
+  } else {
+    args->reportErrors = 1;
+  }
+  return testSuccess;
+}
+
+// Computes SendRecv bandwidths: algBw = count*typesize / 1e9 / sec, and busBw
+// is the same value (factor 1). nranks does not affect the result.
+void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
+  (void)nranks;
+  // The product is formed in integer arithmetic before conversion to double.
+  const size_t totalBytes = count * typesize;
+  const double algorithmBw = (double)totalBytes / 1.0E9 / sec;
+  const double busFactor = 1;
+
+  *algBw = algorithmBw;
+  *busBw = algorithmBw * busFactor;
+}
+
+// Ring exchange built from grouped point-to-point calls: each rank sends count
+// elements to rank+1 and receives count elements from rank-1 (both modulo the
+// communicator size). The op, root and bias arguments are not used.
+testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
+  int commSize;
+  int myRank;
+  NCCLCHECK(ncclCommCount(comm, &commSize));
+  NCCLCHECK(ncclCommUserRank(comm, &myRank));
+
+  const int nextRank = (myRank+1) % commSize;
+  const int prevRank = (myRank-1+commSize) % commSize;
+
+  NCCLCHECK(ncclGroupStart());
+  NCCLCHECK(ncclSend(sendbuff, count, type, nextRank, comm, stream));
+  NCCLCHECK(ncclRecv(recvbuff, count, type, prevRank, comm, stream));
+  NCCLCHECK(ncclGroupEnd());
+  return testSuccess;
+}
+
+// Descriptor for the SendRecv test: its display name followed by the callbacks
+// for byte counts, data initialisation, bandwidth computation and running the
+// exchange. The last slot is left NULL.
+struct testColl sendRecvTest = {
+  "SendRecv",
+  SendRecvGetCollByteCount,
+  SendRecvInitData,
+  SendRecvGetBw,
+  SendRecvRunColl,
+  NULL
+};
+
+// Fills *sendcount and *recvcount for the given count and rank number by
+// delegating to SendRecvGetCollByteCount with an element size of 1; the
+// remaining outputs of that call are discarded.
+void SendRecvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+  size_t ignoredParamCount;
+  size_t ignoredSendOffset;
+  size_t ignoredRecvOffset;
+  SendRecvGetCollByteCount(sendcount, recvcount,
+                           &ignoredParamCount, &ignoredSendOffset, &ignoredRecvOffset,
+                           count, /*eltSize=*/1, nranks);
+}
+
+// Runs the SendRecv test over the selected data types and reduction ops.
+// A type or op equal to -1 (after conversion to int) selects every entry of
+// test_types / test_ops. TimeTest is called once per (type, op) pair with a
+// root of -1; the root argument of this function is not used.
+testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
+  args->collTest = &sendRecvTest;
+
+  // Default to the full type table, narrowed to the caller's type if given.
+  ncclDataType_t* typeList = test_types;
+  const char** typeNameList = test_typenames;
+  int numTypes = test_typenum;
+  if ((int)type != -1) {
+    typeList = &type;
+    typeNameList = &typeName;
+    numTypes = 1;
+  }
+
+  // Same selection for the reduction operations.
+  ncclRedOp_t* opList = test_ops;
+  const char** opNameList = test_opnames;
+  int numOps = test_opnum;
+  if ((int)op != -1) {
+    opList = &op;
+    opNameList = &opName;
+    numOps = 1;
+  }
+
+  for (int t = 0; t < numTypes; t++) {
+    for (int o = 0; o < numOps; o++) {
+      TESTCHECK(TimeTest(args, typeList[t], typeNameList[t], opList[o], opNameList[o], -1));
+    }
+  }
+  return testSuccess;
+}
+
+// Test engine definition for this translation unit: the buffer-size callback
+// and the test-driver callback of the SendRecv test.
+struct testEngine ncclTestEngine = {
+  SendRecvGetBuffSize,
+  SendRecvRunTest
+};
@@ -0,0 +1,28 @@
+#include "timer.h"
+
+// Make sure to compile this translation unit with the host compiler and not
+// nvcc, lest you hit an internal compiler error (ICE) with GCC 10.3.0
+#include <chrono>
+
+namespace {
+  // Current std::chrono::steady_clock reading, expressed as a count of
+  // nanoseconds since that clock's epoch.
+  std::uint64_t now() {
+    const auto sinceEpoch = std::chrono::steady_clock::now().time_since_epoch();
+    const auto asNanos = std::chrono::duration_cast<std::chrono::nanoseconds>(sinceEpoch);
+    return asNanos.count();
+  }
+}
+
+// Starts the timer by recording the current time as the reference point.
+timer::timer() : t0(now()) {
+}
+
+// Returns the seconds elapsed since the reference point (construction or the
+// last reset) without modifying the timer.
+double timer::elapsed() const {
+  const std::uint64_t current = now();
+  const std::uint64_t elapsedNanos = current - t0;
+  return 1.e-9*elapsedNanos;
+}
+
+// Returns the seconds elapsed since the reference point and moves the
+// reference point to the current time.
+double timer::reset() {
+  const std::uint64_t previous = t0;
+  t0 = now();
+  return 1.e-9*(t0 - previous);
+}
@@ -0,0 +1,15 @@
+#ifndef _408319ecdd5b47b28bf8f511c4fdf816
+#define _408319ecdd5b47b28bf8f511c4fdf816
+
+#include <cstdint>
+
+// Can't include <chrono> because of bug with gcc 10.3.0
+// Minimal stopwatch. Only the interface is declared here; the stored
+// reference time is a plain 64-bit integer so this header needs only <cstdint>.
+class timer {
+public:
+  timer();
+  // Seconds since the reference time; does not modify the timer.
+  double elapsed() const;
+  // Seconds since the reference time; also updates the reference time.
+  double reset();
+
+private:
+  std::uint64_t t0;  // reference time set by the constructor and by reset()
+};
+
+#endif
@@ -0,0 +1,20 @@
+#################################################################################
+# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+# ies of the Software, and to permit persons to whom the Software is furnished
+# to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+################################################################################
@@ -0,0 +1,23 @@
+#################################################################################
+# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+# ies of the Software, and to permit persons to whom the Software is furnished
+# to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+################################################################################
+
+def pytest_addoption(parser):
+    parser.addoption("--hostfile", action="store", default="", help="specify MPI hostfile")
@@ -0,0 +1,112 @@
+#################################################################################
+# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+# ies of the Software, and to permit persons to whom the Software is furnished
+# to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+################################################################################
+
+import os
+import subprocess
+import itertools
+import math
+
+import pytest
+
+ngpus = 0
+if os.environ.get('ROCR_VISIBLE_DEVICES') is not None:
+    ngpus = len(os.environ['ROCR_VISIBLE_DEVICES'].split(","))
+elif os.environ.get('HIP_VISIBLE_DEVICES') is not None:
+    ngpus = len(os.environ['HIP_VISIBLE_DEVICES'].split(","))
+else:
+    ngpus = int(subprocess.check_output("rocminfo | grep \"Device Type:.\s*.GPU\" | wc -l",shell=True))
+log_ngpus = int(math.log2(ngpus))
+
+nthreads = ["1"]
+nprocs = ["2"]
+ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
+ngpus_mpi = ["1","2"]
+byte_range = [("4", "128M")]
+op = ["sum", "prod", "min", "max"]
+step_factor = ["2"]
+datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
+memory_type = ["coarse","fine", "host"]
+
+path = os.path.dirname(os.path.abspath(__file__))
+executable = path + "/../build/all_gather_perf"
+
+@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
+    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
+def test_AllGatherSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
+    try:
+        args = [executable,
+                "-t", nthreads,
+                "-g", ngpus_single,
+                "-b", byte_range[0],
+                "-e", byte_range[1],
+                "-o", op,
+                "-f", step_factor,
+                "-d", datatype,
+                "-y", memory_type]
+        if memory_type == "fine":
+            args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
+        args_str = " ".join(args)
+        rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
+    except subprocess.CalledProcessError as err:
+        print(rccl_test.stdout)
+        pytest.fail("AllGather test error(s) detected.")
+
+    assert rccl_test.returncode == 0
+
+@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
+    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
+def test_AllGatherMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
+    try:
+        mpi_hostfile = request.config.getoption('--hostfile')
+        if not mpi_hostfile:
+            args = ["mpirun -np", nprocs,
+                    executable,
+                    "-p 1",
+                    "-t", nthreads,
+                    "-g", ngpus_mpi,
+                    "-b", byte_range[0],
+                    "-e", byte_range[1],
+                    "-o", op,
+                    "-f", step_factor,
+                    "-d", datatype]
+        else:
+            args = ["mpirun -np", nprocs,
+                    "-host", mpi_hostfile,
+                    executable,
+                    "-p 1",
+                    "-t", nthreads,
+                    "-g", ngpus_mpi,
+                    "-b", byte_range[0],
+                    "-e", byte_range[1],
+                    "-o", op,
+                    "-f", step_factor,
+                    "-d", datatype,
+                    "-y", memory_type]
+        if memory_type == "fine":
+            args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
+        args_str = " ".join(args)
+        print(args_str)
+        rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
+    except subprocess.CalledProcessError as err:
+        print(rccl_test.stdout)
+        pytest.fail("AllGather test error(s) detected.")
+
+    assert rccl_test.returncode == 0
@@ -0,0 +1,112 @@
+#################################################################################
+# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+# ies of the Software, and to permit persons to whom the Software is furnished
+# to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+################################################################################
+
+import os
+import subprocess
+import itertools
+import math
+
+import pytest
+
+ngpus = 0
+if os.environ.get('ROCR_VISIBLE_DEVICES') is not None:
+    ngpus = len(os.environ['ROCR_VISIBLE_DEVICES'].split(","))
+elif os.environ.get('HIP_VISIBLE_DEVICES') is not None:
+    ngpus = len(os.environ['HIP_VISIBLE_DEVICES'].split(","))
+else:
+    ngpus = int(subprocess.check_output("rocminfo | grep \"Device Type:.\s*.GPU\" | wc -l",shell=True))
+log_ngpus = int(math.log2(ngpus))
+
+nthreads = ["1"]
+nprocs = ["2"]
+ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
+ngpus_mpi = ["1","2"]
+byte_range = [("4", "128M")]
+op = ["sum", "prod", "min", "max"]
+step_factor = ["2"]
+datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
+memory_type = ["coarse","fine", "host"]
+
+path = os.path.dirname(os.path.abspath(__file__))
+executable = path + "/../build/all_reduce_perf"
+
+@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
+    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
+def test_AllReduceSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
+    try:
+        args = [executable,
+                "-t", nthreads,
+                "-g", ngpus_single,
+                "-b", byte_range[0],
+                "-e", byte_range[1],
+                "-o", op,
+                "-f", step_factor,
+                "-d", datatype,
+                "-y", memory_type]
+        if memory_type == "fine":
+            args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
+        args_str = " ".join(args)
+        rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
+    except subprocess.CalledProcessError as err:
+        print(rccl_test.stdout)
+        pytest.fail("AllReduce test error(s) detected.")
+
+    assert rccl_test.returncode == 0
+
+@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
+    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
+def test_AllReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
+    try:
+        mpi_hostfile = request.config.getoption('--hostfile')
+        if not mpi_hostfile:
+            args = ["mpirun -np", nprocs,
+                    executable,
+                    "-p 1",
+                    "-t", nthreads,
+                    "-g", ngpus_mpi,
+                    "-b", byte_range[0],
+                    "-e", byte_range[1],
+                    "-o", op,
+                    "-f", step_factor,
+                    "-d", datatype]
+        else:
+            args = ["mpirun -np", nprocs,
+                    "-host", mpi_hostfile,
+                    executable,
+                    "-p 1",
+                    "-t", nthreads,
+                    "-g", ngpus_mpi,
+                    "-b", byte_range[0],
+                    "-e", byte_range[1],
+                    "-o", op,
+                    "-f", step_factor,
+                    "-d", datatype,
+                    "-y", memory_type]
+        if memory_type == "fine":
+            args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
+        args_str = " ".join(args)
+        print(args_str)
+        rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
+    except subprocess.CalledProcessError as err:
+        print(rccl_test.stdout)
+        pytest.fail("AllReduce test error(s) detected.")
+
+    assert rccl_test.returncode == 0
@@ -0,0 +1,112 @@
+#################################################################################
+# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+# ies of the Software, and to permit persons to whom the Software is furnished
+# to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+################################################################################
+
+import os
+import subprocess
+import itertools
+import math
+
+import pytest
+
+ngpus = 0
+if os.environ.get('ROCR_VISIBLE_DEVICES') is not None:
+    ngpus = len(os.environ['ROCR_VISIBLE_DEVICES'].split(","))
+elif os.environ.get('HIP_VISIBLE_DEVICES') is not None:
+    ngpus = len(os.environ['HIP_VISIBLE_DEVICES'].split(","))
+else:
+    ngpus = int(subprocess.check_output("rocminfo | grep \"Device Type:.\s*.GPU\" | wc -l",shell=True))
+log_ngpus = int(math.log2(ngpus))
+
+nthreads = ["1"]
+nprocs = ["2"]
+ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
+ngpus_mpi = ["1","2"]
+byte_range = [("4", "128M")]
+op = ["sum", "prod", "min", "max"]
+step_factor = ["2"]
+datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
+memory_type = ["coarse","fine", "host"]
+
+path = os.path.dirname(os.path.abspath(__file__))
+executable = path + "/../build/broadcast_perf"
+
+@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
+    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
+def test_BroadcastSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
+    try:
+        args = [executable,
+                "-t", nthreads,
+                "-g", ngpus_single,
+                "-b", byte_range[0],
+                "-e", byte_range[1],
+                "-o", op,
+                "-f", step_factor,
+                "-d", datatype,
+                "-y", memory_type]
+        if memory_type == "fine":
+            args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
+        args_str = " ".join(args)
+        rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
+    except subprocess.CalledProcessError as err:
+        print(rccl_test.stdout)
+        pytest.fail("Broadcast test error(s) detected.")
+
+    assert rccl_test.returncode == 0
+
+@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
+    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
+def test_BroadcastMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
+    try:
+        mpi_hostfile = request.config.getoption('--hostfile')
+        if not mpi_hostfile:
+            args = ["mpirun -np", nprocs,
+                    executable,
+                    "-p 1",
+                    "-t", nthreads,
+                    "-g", ngpus_mpi,
+                    "-b", byte_range[0],
+                    "-e", byte_range[1],
+                    "-o", op,
+                    "-f", step_factor,
+                    "-d", datatype]
+        else:
+            args = ["mpirun -np", nprocs,
+                    "-host", mpi_hostfile,
+                    executable,
+                    "-p 1",
+                    "-t", nthreads,
+                    "-g", ngpus_mpi,
+                    "-b", byte_range[0],
+                    "-e", byte_range[1],
+                    "-o", op,
+                    "-f", step_factor,
+                    "-d", datatype,
+                    "-y", memory_type]
+        if memory_type == "fine":
+            args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
+        args_str = " ".join(args)
+        print(args_str)
+        rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
+    except subprocess.CalledProcessError as err:
+        print(rccl_test.stdout)
+        pytest.fail("Broadcast test error(s) detected.")
+
+    assert rccl_test.returncode == 0
@@ -0,0 +1,112 @@
+#################################################################################
+# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+# ies of the Software, and to permit persons to whom the Software is furnished
+# to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+################################################################################
+
+import os
+import subprocess
+import itertools
+import math
+
+import pytest
+
+ngpus = 0
+if os.environ.get('ROCR_VISIBLE_DEVICES') is not None:
+    ngpus = len(os.environ['ROCR_VISIBLE_DEVICES'].split(","))
+elif os.environ.get('HIP_VISIBLE_DEVICES') is not None:
+    ngpus = len(os.environ['HIP_VISIBLE_DEVICES'].split(","))
+else:
+    ngpus = int(subprocess.check_output("rocminfo | grep \"Device Type:.\s*.GPU\" | wc -l",shell=True))
+log_ngpus = int(math.log2(ngpus))
+
+nthreads = ["1"]
+nprocs = ["2"]
+ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
+ngpus_mpi = ["1","2"]
+byte_range = [("4", "128M")]
+op = ["sum", "prod", "min", "max"]
+step_factor = ["2"]
+datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
+memory_type = ["coarse","fine", "host"]
+
+path = os.path.dirname(os.path.abspath(__file__))
+executable = path + "/../build/reduce_perf"
+
+@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
+    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
+def test_ReduceSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
+    try:
+        args = [executable,
+                "-t", nthreads,
+                "-g", ngpus_single,
+                "-b", byte_range[0],
+                "-e", byte_range[1],
+                "-o", op,
+                "-f", step_factor,
+                "-d", datatype,
+                "-y", memory_type]
+        if memory_type == "fine":
+            args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
+        args_str = " ".join(args)
+        rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
+    except subprocess.CalledProcessError as err:
+        print(rccl_test.stdout)
+        pytest.fail("Reduce test error(s) detected.")
+
+    assert rccl_test.returncode == 0
+
+@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
+    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
+def test_ReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
+    try:
+        mpi_hostfile = request.config.getoption('--hostfile')
+        if not mpi_hostfile:
+            args = ["mpirun -np", nprocs,
+                    executable,
+                    "-p 1",
+                    "-t", nthreads,
+                    "-g", ngpus_mpi,
+                    "-b", byte_range[0],
+                    "-e", byte_range[1],
+                    "-o", op,
+                    "-f", step_factor,
+                    "-d", datatype]
+        else:
+            args = ["mpirun -np", nprocs,
+                    "-host", mpi_hostfile,
+                    executable,
+                    "-p 1",
+                    "-t", nthreads,
+                    "-g", ngpus_mpi,
+                    "-b", byte_range[0],
+                    "-e", byte_range[1],
+                    "-o", op,
+                    "-f", step_factor,
+                    "-d", datatype,
+                    "-y", memory_type]
+        if memory_type == "fine":
+            args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
+        args_str = " ".join(args)
+        print(args_str)
+        rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
+    except subprocess.CalledProcessError as err:
+        print(rccl_test.stdout)
+        pytest.fail("Reduce test error(s) detected.")
+
+    assert rccl_test.returncode == 0
@@ -0,0 +1,112 @@
+#################################################################################
+# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+# ies of the Software, and to permit persons to whom the Software is furnished
+# to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+################################################################################
+
+import os
+import subprocess
+import itertools
+import math
+
+import pytest
+
+ngpus = 0
+if os.environ.get('ROCR_VISIBLE_DEVICES') is not None:
+    ngpus = len(os.environ['ROCR_VISIBLE_DEVICES'].split(","))
+elif os.environ.get('HIP_VISIBLE_DEVICES') is not None:
+    ngpus = len(os.environ['HIP_VISIBLE_DEVICES'].split(","))
+else:
+    ngpus = int(subprocess.check_output("rocminfo | grep \"Device Type:.\s*.GPU\" | wc -l",shell=True))
+log_ngpus = int(math.log2(ngpus))
+
+nthreads = ["1"]
+nprocs = ["2"]
+ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
+ngpus_mpi = ["1","2"]
+byte_range = [("4", "128M")]
+op = ["sum", "prod", "min", "max"]
+step_factor = ["2"]
+datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
+memory_type = ["coarse","fine", "host"]
+
+path = os.path.dirname(os.path.abspath(__file__))
+executable = path + "/../build/reduce_scatter_perf"
+
+@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
+    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
+def test_ReduceScatterSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
+    try:
+        args = [executable,
+                "-t", nthreads,
+                "-g", ngpus_single,
+                "-b", byte_range[0],
+                "-e", byte_range[1],
+                "-o", op,
+                "-f", step_factor,
+                "-d", datatype,
+                "-y", memory_type]
+        if memory_type == "fine":
+            args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
+        args_str = " ".join(args)
+        rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
+    except subprocess.CalledProcessError as err:
+        print(rccl_test.stdout)
+        pytest.fail("ReduceScatter test error(s) detected.")
+
+    assert rccl_test.returncode == 0
+
+@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
+    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
+def test_ReduceScatterMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
+    try:
+        mpi_hostfile = request.config.getoption('--hostfile')
+        if not mpi_hostfile:
+            args = ["mpirun -np", nprocs,
+                    executable,
+                    "-p 1",
+                    "-t", nthreads,
+                    "-g", ngpus_mpi,
+                    "-b", byte_range[0],
+                    "-e", byte_range[1],
+                    "-o", op,
+                    "-f", step_factor,
+                    "-d", datatype]
+        else:
+            args = ["mpirun -np", nprocs,
+                    "-host", mpi_hostfile,
+                    executable,
+                    "-p 1",
+                    "-t", nthreads,
+                    "-g", ngpus_mpi,
+                    "-b", byte_range[0],
+                    "-e", byte_range[1],
+                    "-o", op,
+                    "-f", step_factor,
+                    "-d", datatype,
+                    "-y", memory_type]
+        if memory_type == "fine":
+            args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
+        args_str = " ".join(args)
+        print(args_str)
+        rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
+    except subprocess.CalledProcessError as err:
+        print(rccl_test.stdout)
+        pytest.fail("ReduceScatter test error(s) detected.")
+
+    assert rccl_test.returncode == 0
@@ -0,0 +1,29 @@
+
+# Locate the ROCm bin directory. The ROCM_PATH environment variable takes
+# precedence; otherwise the ROCM_PATH cache variable is used, defaulting to
+# /opt/rocm. set(... CACHE ...) does not overwrite an existing cache entry, so
+# a value given with -DROCM_PATH=<dir> is honoured here instead of being
+# silently replaced by the hard-coded default.
+if (DEFINED ENV{ROCM_PATH})
+  set(rocm_bin "$ENV{ROCM_PATH}/bin")
+else()
+  set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to the ROCm installation.")
+  set(rocm_bin "${ROCM_PATH}/bin")
+endif()
+
+# C++ compiler: $CXX if set, otherwise amdclang++ from the ROCm bin directory.
+# The entry names a file, hence the FILEPATH cache type.
+if (NOT DEFINED ENV{CXX})
+  set(CMAKE_CXX_COMPILER "${rocm_bin}/amdclang++" CACHE FILEPATH "Path to the C++ compiler")
+else()
+  set(CMAKE_CXX_COMPILER "$ENV{CXX}" CACHE FILEPATH "Path to the C++ compiler")
+endif()
+
+# Only provide per-configuration C++ flags when the user has not set CXXFLAGS.
+if (NOT DEFINED ENV{CXXFLAGS})
+  set(CMAKE_CXX_FLAGS_DEBUG "-g -O1")
+  set(CMAKE_CXX_FLAGS_RELEASE "-O3")
+endif()
+
+# C compiler: $CC if set, otherwise amdclang from the ROCm bin directory.
+if (NOT DEFINED ENV{CC})
+  set(CMAKE_C_COMPILER "${rocm_bin}/amdclang" CACHE FILEPATH "Path to the C compiler")
+else()
+  set(CMAKE_C_COMPILER "$ENV{CC}" CACHE FILEPATH "Path to the C compiler")
+endif()
+
+# Only provide per-configuration C flags when the user has not set CFLAGS.
+if (NOT DEFINED ENV{CFLAGS})
+  set(CMAKE_C_FLAGS_DEBUG "-g -O1")
+  set(CMAKE_C_FLAGS_RELEASE "-O3")
+endif()
@@ -0,0 +1,74 @@
+#
+# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
+# Modifications are Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+#include ../../makefiles/common.mk
+
+.PHONY: all clean
+
+# All build products go to <repo>/build/test/verifiable.
+BUILDDIR := $(abspath ../../build)
+DST_DIR := $(BUILDDIR)/test/verifiable
+
+# User-overridable settings (command line or environment).
+ROCM_PATH ?= /opt/rocm
+MPI_HOME ?= /usr/lib/x86_64-linux-gnu
+PREFIX ?= /usr/local
+VERBOSE ?= 0
+DEBUG ?= 0
+# The default is the literal two-character string "" so that it can be
+# compared against "" in the ifneq below.
+NCCL_HOME ?= ""
+
+# amdclang++ from the ROCm installation is used for both compiling and linking.
+HIPCC = $(ROCM_PATH)/bin/amdclang++
+CXX = $(HIPCC)
+
+HIPCUFLAGS := -std=c++14
+LDFLAGS    :=
+HIPLDFLAGS :=
+
+# When NCCL_HOME is given, add its include and library directories and embed
+# it as an rpath.
+ifneq ($(NCCL_HOME), "")
+HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
+HIPLDFLAGS   += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME) -L$(NCCL_HOME)/lib
+endif
+
+# ROCm headers and runtime libraries. CUSTOM_RCCL_LIB is not set in this
+# file; it expands to nothing unless supplied from outside.
+HIPCUFLAGS += -I$(ROCM_PATH)/include
+HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
+LDFLAGS    += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
+HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lamdhip64 -lstdc++ -lrt
+
+# DEBUG=0 builds optimized; any other value builds unoptimized with debug info.
+ifeq ($(DEBUG), 0)
+HIPCUFLAGS += -O3
+else
+HIPCUFLAGS += -O0 -g -ggdb3
+endif
+
+# VERBOSE=0 suppresses echoing of recipe lines.
+ifeq ($(VERBOSE), 0)
+.SILENT:
+endif
+
+# Optional MPI support: MPI=1 selects the Open MPI paths and -lmpi, MPICH=1
+# selects the MPICH paths and -lmpich. Both define MPI_SUPPORT.
+ifeq ($(MPI), 1)
+HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/openmpi -I${MPI_HOME}/openmpi/include -I${MPI_HOME}/openmpi/include/openmpi
+HIPLDFLAGS += -L${MPI_HOME}/lib -L${MPI_HOME}/openmpi/lib -lmpi
+else ifeq ($(MPICH), 1)
+HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/mpich/include -I/usr/include/x86_64-linux-gnu/mpich
+HIPLDFLAGS += -L${MPI_HOME}/lib -L${MPI_HOME}/mpich/lib -lmpich
+endif
+
+# Every entry of LIBRARIES becomes a -l<name> linker flag; rccl is always linked.
+LIBRARIES += rccl
+HIPLDFLAGS += $(LIBRARIES:%=-l%)
+
+# Default target: the verifiable object file and the self-test executable.
+all: $(DST_DIR)/verifiable.o $(DST_DIR)/self_test
+
+# Remove the whole output directory of this component.
+clean:
+	rm -rf $(DST_DIR)
+
+# verifiable.mk defines the rules for verifiable.o and libverifiable.so and
+# needs these two paths set before it is included.
+# NOTE(review): verifiable.mk also references HIPIFY_DIR, HIPIFY_PL_EXE and
+# HIPIFY_PL_FLAGS, none of which are defined in this Makefile (the include of
+# common.mk at the top is commented out) -- confirm they are supplied by the
+# caller, otherwise the hipify targets expand to paths under "/".
+TEST_VERIFIABLE_SRCDIR := .
+TEST_VERIFIABLE_BUILDDIR := $(DST_DIR)
+include verifiable.mk
+
+# Convenience alias for building only the self-test executable.
+self_test: $(DST_DIR)/self_test
+
+# Build the self-test from main.cu with SELF_TEST=1 defined, linked against
+# libverifiable.so. The $ORIGIN rpath makes the loader search the directory
+# containing the executable for shared libraries.
+$(DST_DIR)/self_test: main.cu $(TEST_VERIFIABLE_LIBS)
+	@printf "Linking  %s\n" $@
+	@mkdir -p $(DST_DIR)
+	$(HIPCC) -o $@ $(HIPCUFLAGS) -DSELF_TEST=1 $< -L$(TEST_VERIFIABLE_BUILDDIR) -lverifiable $(HIPLDFLAGS) -Xlinker "-rpath=\$$ORIGIN"
@@ -0,0 +1,195 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+/* Generate parameters for our error bound model of floating point average
+ * (sum of scaled values) by sampling sums of random sequences for each
+ * floating point type.
+ *
+ * The model has parameters "coef" and "power", where for two floats a & b,
+ * they are close enough if and only if:
+ *   abs(intBits(a) - intBits(b)) <= 1 + coef*pow(rank_n, power);
+ *
+ * Where intBits(x) is the reinterpretation of the float bitpattern as an integer.
+ *
+ * Compile with:
+ *   nvcc -gencode=arch=compute_80,code=sm_80
+ */
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdint>
+#include <hip/hip_bfloat16.h>
+#include <cuda_fp16.h>
+
+using std::uint64_t;
+using std::uint32_t;
+using bfloat16 = hip_bfloat16;
+
+// Per-type description of a floating point format plus the device-side
+// operations the sampling kernel needs. Each specialization provides:
+//   mantissa_bits / exponent_bits : field widths of the format
+//   uint_t                        : unsigned integer type used to view the bit pattern
+//   make(double), make(uint64_t)  : conversion into the type
+//   todouble(x)                   : conversion to double
+//   add(a, b), mul(a, b)          : arithmetic carried out in the type itself
+template<typename T>
+struct float_traits;
+
+// 32-bit float: plain casts and built-in operators.
+template<>
+struct float_traits<float> {
+  static constexpr int mantissa_bits = 23;
+  static constexpr int exponent_bits = 8;
+  using uint_t = uint32_t;
+  __device__ static float make(double x) { return (float)x; }
+  __device__ static float make(uint64_t x) { return (float)x; }
+  __device__ static double todouble(float x) { return x; }
+  __device__ static float add(float a, float b) { return a+b; }
+  __device__ static float mul(float a, float b) { return a*b; }
+};
+// 64-bit double: identity conversions and built-in operators.
+template<>
+struct float_traits<double> {
+  static constexpr int mantissa_bits = 52;
+  static constexpr int exponent_bits = 11;
+  using uint_t = uint64_t;
+  __device__ static double make(double x) { return x; }
+  __device__ static double make(uint64_t x) { return (double)x; }
+  __device__ static double todouble(double x) { return x; }
+  __device__ static double add(double a, double b) { return a+b; }
+  __device__ static double mul(double a, double b) { return a*b; }
+};
+// 16-bit half: conversions and arithmetic go through the half intrinsics.
+template<>
+struct float_traits<__half> {
+  static constexpr int mantissa_bits = 10;
+  static constexpr int exponent_bits = 5;
+  using uint_t = uint16_t;
+  // The double is narrowed to float before being converted to half.
+  __device__ static __half make(double x) { return __float2half((float)x); }
+  // NOTE(review): x is passed to an int-based intrinsic, so values that do
+  // not fit in int are presumably narrowed first -- confirm this is intended.
+  __device__ static __half make(uint64_t x) { return __int2half_rn(x); }
+  __device__ static double todouble(__half x) { return __half2float(x); }
+  __device__ static __half add(__half a, __half b) { return __hadd(a, b); }
+  __device__ static __half mul(__half a, __half b) { return __hmul(a, b); }
+};
+// bfloat16 (hip_bfloat16): 7 mantissa bits, 8 exponent bits, viewed as a
+// 16-bit pattern.
+//
+// add/mul compute in float and round the result to bfloat16. They previously
+// passed float operands to __hadd/__hmul. Those are not bfloat16 operations:
+// depending on overload resolution they either round-trip through __half
+// (5 exponent bits, so a different range and precision) or select a
+// non-floating-point overload. Either way the sampled error would not be
+// that of a bfloat16 sum.
+template<>
+struct float_traits<bfloat16> {
+  static constexpr int mantissa_bits = 7;
+  static constexpr int exponent_bits = 8;
+  using uint_t = uint16_t;
+  __device__ static bfloat16 make(double x) { return bfloat16(x); }
+  __device__ static bfloat16 make(uint64_t x) { return bfloat16(x); }
+  __device__ static double todouble(bfloat16 x) { return double(x); }
+  __device__ static bfloat16 add(bfloat16 a, bfloat16 b) { return bfloat16((float)a + (float)b); }
+  __device__ static bfloat16 mul(bfloat16 a, bfloat16 b) { return bfloat16((float)a * (float)b); }
+};
+
+// Distance between two values of type F measured on their bit patterns:
+// both are reinterpreted as float_traits<F>::uint_t and the absolute
+// difference of those integers is returned.
+template<typename F>
+__device__ int compare(F a, F b) {
+  using bits_t = typename float_traits<F>::uint_t;
+  union { bits_t lhsBits; F lhsVal; };
+  union { bits_t rhsBits; F rhsVal; };
+  // Clear the integer view first, then overlay the value on top of it.
+  lhsBits = 0;
+  rhsBits = 0;
+  lhsVal = a;
+  rhsVal = b;
+  //std::printf("bits(%1.10f)=%x bits(%1.10f)=%x\n", lhsVal, lhsBits, rhsVal, rhsBits);
+  if (lhsBits < rhsBits) {
+    return rhsBits - lhsBits;
+  }
+  return lhsBits - rhsBits;
+}
+
+// Small device-side pseudo random generator with 256 bits of state.
+// Construction mixes an integer seed into four fixed constants; each call of
+// operator() returns one 64-bit value and advances the state.
+struct xoshiro256ss {
+  uint64_t s[4];
+
+  __device__ xoshiro256ss(int seed) {
+    constexpr uint64_t kBase[4] = {0xbb99e851d1f545cc, 0xbfc4022389ca40cb, 0xe84aff5cb1914af5, 0x845999858284de77};
+    int word = 0;
+    while (word < 4) {
+      // (seed + word) is summed as int before being widened by the multiply.
+      s[word] = kBase[word] + (seed + word)*0xb45de8a52fdb65d3;
+      ++word;
+    }
+  }
+
+  __device__ uint64_t operator()() {
+    // Output: rotate-left(s[1]*5, 7) * 9, taken from the state before it is updated.
+    uint64_t const scrambled = s[1] * 5;
+    uint64_t const result = ((scrambled << 7) | (scrambled >> (64 - 7))) * 9;
+    uint64_t const shifted = s[1] << 17;
+
+    // State update; the order of the xors matters.
+    s[2] ^= s[0];
+    s[3] ^= s[1];
+    s[1] ^= s[2];
+    s[0] ^= s[3];
+    s[2] ^= shifted;
+    s[3] = (s[3] << 45) | (s[3] >> (64 - 45));
+    return result;
+  }
+};
+
+// Warp-wide maximum built from __shfl_down.
+//
+// mask is ignored: the only caller passes a mask with all bits set.
+//
+// The reduction halves the shuffle distance each step, starting at
+// warpSize/2. The loop used to start at warpSize, a distance equal to the
+// shuffle width; that first step could not reach another lane of the warp
+// and only compared each value with itself.
+//
+// NOTE(review): with a shift-down reduction the complete maximum ends up in
+// lane 0; other lanes presumably hold partial maxima. The caller only prints
+// from thread 0 -- confirm no other lane relies on the result.
+static __device__ int __reduce_max_sync(unsigned int mask, int value)
+{
+  (void)mask;
+  int width = warpSize;
+  for (unsigned int offset = warpSize / 2; offset; offset >>= 1) {
+    value = max(__shfl_down(value, offset, width), value);
+  }
+  return value;
+}
+
+// Samples how far a sum accumulated in type F drifts from a reference kept in
+// double, and prints the "coef" and "expo" parameters of the error bound
+// model described at the top of this file.
+//
+// The work is done in two passes over the same procedure:
+//   pass 0 estimates the exponent (expo_avg) as the mean of
+//          log2(1 + err_max) / log2(r) over all sampled rank counts r >= 2;
+//   pass 1 reuses that exponent to find the largest coefficient
+//          (err - 1) / r^expo_avg and prints the result from thread 0.
+// coef, expo_sum and expo_avg are per-thread locals; only thread 0's values
+// are printed.
+template<typename F>
+__global__ void kernel() {
+  using traits = float_traits<F>;
+  constexpr int samps = 4<<10;
+  // Running sums per sample: accf in the type under test, accd in double.
+  __shared__ F accf[samps];
+  __shared__ double accd[samps];
+
+  // Each thread has its own generator, seeded with its thread index.
+  xoshiro256ss rng(threadIdx.x);
+  float expo_avg = 1;
+  for(int pass=0; pass < 2; pass++) {
+    // Per-thread scale factor applied to every contribution.
+    F scalar = traits::make(1.0/(3.14159 + .5*threadIdx.x));
+    int err_max = 0;
+    float coef = 0;
+    double expo_sum = 0;
+    int expo_n = 0;
+    // Number of contributions summed per round: 16K for float, otherwise
+    // 2^mantissa_bits of the type.
+    int max_ranks = std::is_same<F,float>::value ? 16<<10 : 1<<traits::mantissa_bits;
+    for(int round=0; round < 1 + (16<<10)/max_ranks; round++) {
+    //for(int round=0; round < 2; round++) {
+      // Reset the accumulators; each thread handles indices
+      // threadIdx.x, threadIdx.x + blockDim.x, ...
+      for(int i=threadIdx.x; i < samps; i += blockDim.x) {
+        accf[i] = (F)0;
+        accd[i] = 0;
+      }
+      __syncthreads();
+      for(int r=0; r < max_ranks; r++) {
+        int err = 0;
+        for(int i=threadIdx.x; i < samps; i+=blockDim.x) {
+          // For float the contribution is a random integer of mantissa
+          // width; for every other type it is the constant 1.0.
+          constexpr uint64_t m = (1ll<<traits::mantissa_bits)-1;
+          double d = std::is_same<F,float>::value ? double(rng() & m) : 1.0;
+          F f = traits::make(d);
+          // accf sums the scaled contribution in type F, accd sums the
+          // unscaled contribution in double.
+          accf[i] = traits::add(accf[i], traits::mul(scalar, f));
+          accd[i] += traits::todouble(f);
+          //if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d f=%f\n", r, traits::todouble(accf[i]));
+          // Error in bit-pattern units between the F-accumulated sum and
+          // the double sum scaled once at the end.
+          int e = compare(accf[i], traits::mul(scalar, traits::make(accd[i])));
+          err = err > e ? err : e;
+        }
+        // Combine the per-thread maxima across the warp.
+        err = __reduce_max_sync(-1u, err);
+        err_max = err_max > err ? err_max : err;
+        if (r >= 2) {
+          // err = 1 + coef*pow(r,expo)
+          float c = float(err-1)/powf(float(r), expo_avg);
+          coef = coef > c ? coef : c;
+        }
+        if (r >= 2) {
+          // Exponent that would explain the worst error seen so far at this r.
+          double expo = log2f(1+err_max)/log2f(r);
+          expo_sum += expo;
+          expo_n++;
+          //if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d err=%d errmax=%d expo=%f sum=%f n=%d\n", r, err, err_max, expo, expo_sum, expo_n);
+        }
+      }
+    }
+    // Pass 0 fixes the exponent; pass 1 reports the fitted parameters.
+    if(pass==0)
+      expo_avg = expo_sum/expo_n;
+    else if(threadIdx.x == 0)
+      printf("  coef=%1.10f expo=%1.10f\n", coef, expo_avg);
+  }
+}
+
+// Prints the given banner, launches the sampling kernel for element type F
+// on a single block of 32 threads and waits for the device to finish.
+template<typename F>
+static void sampleType(const char *banner) {
+  std::printf("%s", banner);
+  kernel<F><<<1,32>>>();
+  cudaDeviceSynchronize();
+}
+
+// Runs the sampler for float, half and bfloat16, in that order.
+int main() {
+  sampleType<float>("type=float:\n");
+  sampleType<half>("\ntype=half:\n");
+  sampleType<bfloat16>("\ntype=bfloat16:\n");
+  return 0;
+}
@@ -0,0 +1,18 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include <cuda_runtime.h>
+#include <iostream>
+
+#define NCCL_VERIFIABLE_SELF_TEST 1
+#include "verifiable.h"
+
+// Runs the verifiable self test on device 0.
+//
+// Success is signalled by the absence of further output, so runtime errors
+// must not be swallowed: a failure to select the device or to complete the
+// launched work would otherwise look exactly like a passing run. Any such
+// error is reported on stderr and turned into a non-zero exit status.
+int main(int arg_n, char **args) {
+  (void)arg_n;
+  (void)args;
+  std::cerr<<"You are hoping to see no output beyond this line."<<std::endl;
+
+  cudaError_t err = cudaSetDevice(0);
+  if (err != cudaSuccess) {
+    std::cerr<<"cudaSetDevice(0) failed: "<<cudaGetErrorString(err)<<std::endl;
+    return 1;
+  }
+
+  ncclVerifiableLaunchSelfTest();
+
+  // Waits for the self test and surfaces errors from its execution.
+  err = cudaDeviceSynchronize();
+  if (err != cudaSuccess) {
+    std::cerr<<"cudaDeviceSynchronize failed: "<<cudaGetErrorString(err)<<std::endl;
+    return 1;
+  }
+  return 0;
+}
@@ -0,0 +1,77 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _d41d8cd98f00b204e9800998ecf8427e
+#define _d41d8cd98f00b204e9800998ecf8427e
+
+#include <cuda_runtime.h>
+
+#include <stdint.h>
+
+/* Routines for launching kernels that verify reduction results. A significant
+ * feature of these routines is they carefully craft floating point input
+ * to produce exactly predictable output.
+ *
+ * int elt_ty: actually just a ncclDataType_t
+ *
+ * int red_op: mostly just a  ncclRedOp_t. Since PreMulSum ops are dynamically
+ * created, these are encoded as the value ncclNumOps and their scalar is
+ * assumed to be `ncclVerifiablePremulScalar(rank_me)`
+ *
+ * uint64_t seed: arbitrary 64-bits to use in seeding the random values
+ *
+ * intptr_t elt_ix0: index of first element pointed to by elts when generating
+ * random values. This makes it possible to generate subsequences independently
+ * as well as in aggregate.
+ *
+ * int rank_n: Number of contributions into the reduction. Non-reduction
+ * collectives like broadcast, gather, etc will always set this to one.
+ *
+ * int rank_me: Index of this contribution
+ */
+
+// Local scalar for PreMulSum ops: ranks with an even index use 1, all other
+// ranks use 2. The value is produced as a float and converted to T.
+template<typename T>
+__host__ __device__ T ncclVerifiablePremulScalar(int rank_me) {
+  const bool evenRank = (rank_me % 2 == 0);
+  if (evenRank) {
+    return T(1.0f);
+  }
+  return T(2.0f);
+}
+
+// Enqueue kernel to generate data which is to be reduced.
+// Fills `elts` (elt_n elements of type elt_ty) with this rank's contribution;
+// see the parameter glossary at the top of this header.
+hipError_t ncclVerifiablePrepareInput(
+  void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
+  uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+);
+
+// Enqueue kernel to generate expected results of reduction.
+// Takes no rank_me: the expected values describe the combined result of all
+// rank_n contributions.
+hipError_t ncclVerifiablePrepareExpected(
+  void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
+  uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+);
+
+// Enqueue kernel to verify reduced data matches expectation. The number of
+// failed elements is written to bad_elt_n which must be in cudaHost memory.
+// If `expected == nullptr` then the expected results are generated on-the-fly
+// which can be costly. Thus if you plan to run the same reduction multiple
+// times it is advantageous to precompute the expected values with
+// ncclVerifiablePrepareExpected and pass them as `expected` here.
+hipError_t ncclVerifiableVerify(
+  void const *results, void const *expected, intptr_t elt_n, int elt_ty,
+  int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
+  int64_t *bad_elt_n, cudaStream_t stream
+);
+
+// Enqueue kernel that applies bias to expected results
+// Unlike the routines above this one returns void, so it reports no status.
+// NOTE(review): the layout and element type of `bias` are not described in
+// this header -- presumably it matches `elts`; confirm against the definition.
+void ncclVerifiableApplyBias(
+    void *elts, void* bias, intptr_t elt_n, int elt_ty, int red_op, intptr_t elt_ix0,
+    cudaStream_t stream
+);
+
+// Only declared when the including file defines NCCL_VERIFIABLE_SELF_TEST
+// before including this header.
+#ifdef NCCL_VERIFIABLE_SELF_TEST
+void ncclVerifiableLaunchSelfTest();
+#endif
+
+#endif
@@ -0,0 +1,40 @@
+# Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved.
+# Modifications Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE.txt for license information
+
+
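+# We require both of the following paths to be set upon including this makefile
+# TEST_VERIFIABLE_SRCDIR = <points to this directory>
+# TEST_VERIFIABLE_BUILDDIR = <points to destination of .so file>
+# The rules below additionally use HIPIFY_DIR, HIPIFY_PL_EXE, HIPIFY_PL_FLAGS,
+# HIPCC, HIPCUFLAGS and CC, which must also be defined by the including makefile.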
+
+# Files this component consists of: the public header, the compiled object
+# and the shared library built from it.
+TEST_VERIFIABLE_HDRS      = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
+TEST_VERIFIABLE_OBJS      = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
+TEST_VERIFIABLE_LIBS      = $(TEST_VERIFIABLE_BUILDDIR)/libverifiable.so
+
+# The next three rules run the hipify script over one source file each and
+# write the converted copy into HIPIFY_DIR.
+${HIPIFY_DIR}/verifiable.cu.cpp: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
+	@printf "Hipifying  %-35s > %s\n" $< $@
+	@mkdir -p ${HIPIFY_DIR}
+	${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@
+
+${HIPIFY_DIR}/verifiable.h: $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
+	@printf "Hipifying  %-35s > %s\n" $< $@
+	@mkdir -p ${HIPIFY_DIR}
+	${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@
+
+# rccl_float8.h lives in ../src relative to this directory.
+${HIPIFY_DIR}/rccl_float8.h: $(TEST_VERIFIABLE_SRCDIR)/../src/rccl_float8.h
+	@printf "Hipifying  %-35s > %s\n" $< $@
+	@mkdir -p ${HIPIFY_DIR}
+	${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@
+
+# Compile the hipified source. Only the first prerequisite ($<) is compiled;
+# the two hipified headers are listed so they exist before compilation.
+$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(HIPIFY_DIR)/verifiable.cu.cpp $(HIPIFY_DIR)/verifiable.h $(HIPIFY_DIR)/rccl_float8.h
+	@printf "Compiling %s\n" $@
+	@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
+	echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $<"
+	$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<
+
+# Link the object into libverifiable.so.0 (which is also its soname) and
+# point the unversioned libverifiable.so name at it with a relative symlink.
+# NOTE(review): the link uses $(CC) rather than $(HIPCC) -- confirm the
+# default C compiler driver is intended here.
+$(TEST_VERIFIABLE_BUILDDIR)/libverifiable.so: $(TEST_VERIFIABLE_OBJS)
+	@printf "Creating DSO %s\n" $@
+	@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
+	$(CC) -shared -o $@.0 $^ -Wl,-soname,$(notdir $@).0
+	ln -sf $(notdir $@).0 $@