diff --git a/projects/rccl-tests/.github/CODEOWNERS b/projects/rccl-tests/.github/CODEOWNERS new file mode 100755 index 0000000000..23ddf51b30 --- /dev/null +++ b/projects/rccl-tests/.github/CODEOWNERS @@ -0,0 +1,6 @@ +* @wenkaidu @gilbertlee-amd @PedramAlizadeh @nusislam @nileshnegi @KawtharShafie @AtlantaPepsi @mberenjk @corey-derochie-amd @mustafabar @thananon @JhaShweta1 @BertanDogancay @rahulvaidya20 @isaki001 @PJAvinash @AbandiGa @Nikhil-Nunna @haripriya-amd @atulkulk @alex-breslow-amd @ddebonis-amd @amd-mengshwu @Kapil-Shyam-Pawar @weilewei @nawrinsu @speriaswamy-amd + +# Documentation files +doc/ @ROCm/rocm-documentation +*.md @ROCm/rocm-documentation +*.rst @ROCm/rocm-documentation diff --git a/projects/rccl-tests/.github/PULL_REQUEST_TEMPLATE.md b/projects/rccl-tests/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000000..426bf3d7db --- /dev/null +++ b/projects/rccl-tests/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,16 @@ +## Details +___Do not mention proprietary info or link to internal work items in this PR.___ + +**Work item:** _"Internal", or link to GitHub issue (if applicable)._ + +**What were the changes?** +_One sentence describing the work done._ + +**Why were the changes made?** +_Explain the motivation behind the work. Provide any publicly-available historical context._ + +**How was the outcome achieved?** +_Technical details behind the work. Explain any publicly-available hardware peculiarities._ + +**Additional Documentation:** +_What else should the reviewer know?_ diff --git a/projects/rccl-tests/.gitignore b/projects/rccl-tests/.gitignore new file mode 100644 index 0000000000..9edaab3568 --- /dev/null +++ b/projects/rccl-tests/.gitignore @@ -0,0 +1,7 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+# +# See LICENCE.txt for license information +build/ +*.gcov +/coverage/ +__pycache__/ diff --git a/projects/rccl-tests/.jenkins/common.groovy b/projects/rccl-tests/.jenkins/common.groovy new file mode 100644 index 0000000000..c709f06566 --- /dev/null +++ b/projects/rccl-tests/.jenkins/common.groovy @@ -0,0 +1,43 @@ +// This file is for internal AMD use. +// If you are interested in running your own Jenkins, please raise a github issue for assistance. + +def runCompileCommand(platform, project, jobName) +{ + project.paths.construct_build_prefix() + + String hipclangArgs = jobName.contains('hipclang') ? '--hip-clang' : '' + + def command = """#!/usr/bin/env bash + set -x + cd ${project.paths.build_prefix} + git clone --recursive https://github.com/ROCm/rccl.git + cd rccl + ./install.sh -l + cd ../.. + ${auxiliary.exitIfNotSuccess()} + + cd ${project.paths.project_build_prefix} + export RCCL_DIR=\$(pwd)/../rccl/build/release + ./install.sh --rccl_home \$RCCL_DIR + ${auxiliary.exitIfNotSuccess()} + """ + + platform.runCommand(this,command) +} + +def runTestCommand (platform, project) +{ + String sudo = auxiliary.sudo(platform.jenkinsLabel) + + def command = """#!/usr/bin/env bash + set -x + cd ${project.paths.project_build_prefix} + python3 -m pip install --upgrade pytest + python3 -m pytest --version + python3 -m pytest -k "not MPI and not host and not fine" --verbose --junitxml=./testreport.xml + """ + + platform.runCommand(this, command) +} + +return this diff --git a/projects/rccl-tests/.jenkins/staticanalysis.groovy b/projects/rccl-tests/.jenkins/staticanalysis.groovy new file mode 100644 index 0000000000..52702f9c3b --- /dev/null +++ b/projects/rccl-tests/.jenkins/staticanalysis.groovy @@ -0,0 +1,64 @@ +#!/usr/bin/env groovy +// This shared library is available at https://github.com/ROCm/rocJENKINS/ +@Library('rocJenkins@pong') _ + +// This is file for internal AMD use. 
+// If you are interested in running your own Jenkins, please raise a github issue for assistance. + +import com.amd.project.* +import com.amd.docker.* +import java.nio.file.Path + +def runCI = +{ + nodeDetails, jobName-> + + def prj = new rocProject('rccl-tests', 'StaticAnalysis') + + // Define test architectures, optional rocm version argument is available + def nodes = new dockerNodes(nodeDetails, jobName, prj) + + boolean formatCheck = false + boolean staticAnalysis = true + + buildProject(prj, formatCheck, nodes.dockerArray, null, null, null, staticAnalysis) +} + +ci: { + String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) + + def propertyList = [ + "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])] + ] + propertyList = auxiliary.appendPropertyList(propertyList) + + def jobNameList = [ + "compute-rocm-dkms-no-npi-hipclang":([ubuntu22:['cpu']]) + ] + jobNameList = auxiliary.appendJobNameList(jobNameList) + + propertyList.each + { + jobName, property-> + if (urlJobName == jobName) + properties(auxiliary.addCommonProperties(property)) + } + + jobNameList.each + { + jobName, nodeDetails-> + if (urlJobName == jobName) + stage(jobName) { + runCI(nodeDetails, jobName) + } + } + + // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 + if(!jobNameList.keySet().contains(urlJobName)) + { + properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * 0')])])) + stage(urlJobName) { + runCI([ubuntu22:['cpu']], urlJobName) + } + } +} diff --git a/projects/rccl-tests/CMakeLists.txt b/projects/rccl-tests/CMakeLists.txt new file mode 100644 index 0000000000..cb08498032 --- /dev/null +++ b/projects/rccl-tests/CMakeLists.txt @@ -0,0 +1,210 @@ +# Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved. 
+ +# CMake version minimum requirements +#================================================================================================== +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) + +# CMake Toolchain file to define compilers and path to ROCm +#================================================================================================== +if (NOT CMAKE_TOOLCHAIN_FILE) + set(CMAKE_TOOLCHAIN_FILE "${CMAKE_CURRENT_SOURCE_DIR}/toolchain-linux.cmake") + message(STATUS "CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}") +endif() + +# RCCL Tests project +#================================================================================================== +project(rccl-tests LANGUAGES CXX) + +# Build options +#================================================================================================== +option(USE_MPI "Build RCCL-tests with MPI support." OFF) +option(BUILD_LOCAL_GPU_TARGET_ONLY "Build only for GPUs detected on this machine" OFF) + +if (NOT CMAKE_BUILD_TYPE) + message(WARNING "CMAKE_BUILD_TYPE is not defined. Setting to Release") + set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Default build type") +endif() + +# Default GPU architectures to build +#================================================================================================== +set(DEFAULT_GPUS + gfx906 + gfx908 + gfx90a + gfx942 + gfx950 + gfx1030 + gfx1100 + gfx1101 + gfx1102 + gfx1200 + gfx1201) + +# Get additional packages required +include(CheckIncludeFiles) +include(CheckSymbolExists) +include(cmake/Dependencies.cmake) # rocm-cmake, rocm_local_targets +include(cmake/CheckSymbolExistsNoWarn.cmake) + +# Build only for local GPU architecture +if (BUILD_LOCAL_GPU_TARGET_ONLY) + message(STATUS "Building only for local GPU target") + if (COMMAND rocm_local_targets) + rocm_local_targets(DEFAULT_GPUS) + else() + message(WARNING "Unable to determine local GPU targets. 
Falling back to default GPUs.") + endif() +endif() + +# Determine which GPU architectures to build for +set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if GPU_TARGETS is not defined.") + +# Check if clang compiler can offload to GPU_TARGETS +if (COMMAND rocm_check_target_ids) + message(STATUS "Checking for ROCm support for GPU targets: " "${GPU_TARGETS}") + rocm_check_target_ids(SUPPORTED_GPUS TARGETS ${GPU_TARGETS}) +else() + message(WARNING "Unable to check for supported GPU targets. Falling back to default GPUs.") + set(SUPPORTED_GPUS ${DEFAULT_GPUS}) +endif() + +set(GPU_TARGETS "${SUPPORTED_GPUS}") +message(STATUS "Compiling for ${GPU_TARGETS}") + +## NOTE: Reload rocm-cmake in order to update GPU_TARGETS +include(cmake/Dependencies.cmake) # Reloading to use desired GPU_TARGETS instead of defaults + +# Try to establish ROCM_PATH (for find_package) +#================================================================================================== +if(NOT DEFINED ROCM_PATH) + # Guess default location + set(ROCM_PATH "/opt/rocm") + message(WARNING "Unable to find ROCM_PATH: Falling back to ${ROCM_PATH}") +else() + message(STATUS "ROCM_PATH found: ${ROCM_PATH}") +endif() +set(ENV{ROCM_PATH} ${ROCM_PATH}) + +if("${CMAKE_CXX_COMPILER}" MATCHES ".*amdclang\\+\\+") + message(STATUS "Compiling with amdclang++") + set(COMPILER_EXE_NAME amdclang++) + set(COMPILER_GREP_STRING "AMD clang version") + set(COMPILER_AWK_CMD "awk -F\" \" '{ printf $4}'") +elseif("${CMAKE_CXX_COMPILER}" MATCHES ".*clang\\+\\+") + message(STATUS "Compiling with clang++") + set(COMPILER_EXE_NAME clang++) + set(COMPILER_GREP_STRING "AMD clang version") + set(COMPILER_AWK_CMD "awk -F\" \" '{ printf $4}'") +elseif("${CMAKE_CXX_COMPILER}" MATCHES ".*hipcc$") + message(STATUS "Compiling with hipcc") + set(COMPILER_EXE_NAME hipcc) + set(COMPILER_GREP_STRING "HIP version") + set(COMPILER_AWK_CMD "awk -F\" \" '{ printf $3}' | awk -F\"-\" '{ printf $1}'") +else() + message(FATAL_ERROR 
"RCCL-Tests can be built only with hipcc or amdclang++") +endif() + +# Set CMAKE flags +#================================================================================================== +set(CMAKE_INSTALL_PREFIX "${ROCM_PATH}" CACHE PATH "") +set(CMAKE_CXX_STANDARD 14) # We use C++14 features, this will add compile option: -std=c++14 +set(CMAKE_CXX_EXTENSIONS OFF) # Without this line, it will add -std=gnu++14 instead, which has some issues. +set(CPACK_PACKAGING_INSTALL_PREFIX "${ROCM_PATH}" CACHE PATH "Path to install to when packaged.") +if(ROCM_PATH) + #list(APPEND CMAKE_PREFIX_PATH # Temporary workaround + list(PREPEND CMAKE_PREFIX_PATH # Add ROCM_PATH to CMake search paths (for finding HIP / HSA + ${ROCM_PATH} + ${ROCM_PATH}/hip + ${ROCM_PATH}/llvm) +endif() + +# Check for required dependencies +#================================================================================================== +## Check for Threads +set(THREADS_PREFER_PTHREAD_FLAG ON) +find_package(Threads REQUIRED) + +## Check for HIP +find_package(hip REQUIRED) +message(STATUS "HIP compiler: ${HIP_COMPILER}") +message(STATUS "HIP runtime: ${HIP_RUNTIME}") +if(NOT "${HIP_COMPILER}" MATCHES "clang") + message(FATAL_ERROR "RCCL requires clang-based compiler (amdclang++ or hipcc)") +endif() + +## Check for compiler version +find_program(compiler_executable ${COMPILER_EXE_NAME}) +message(STATUS "${COMPILER_EXE_NAME} executable: ${compiler_executable}") +execute_process( + COMMAND bash "-c" "${compiler_executable} --version | grep \"${COMPILER_GREP_STRING}\" | ${COMPILER_AWK_CMD}" + OUTPUT_VARIABLE compiler_version_string) +message(STATUS "${COMPILER_EXE_NAME} version: ${compiler_version_string}") + +## Check for HIP version +find_program(hipconfig_executable hipconfig) +message(STATUS "hipconfig executable: ${hipconfig_executable}") +execute_process( + COMMAND bash "-c" "${hipconfig_executable} -v | awk -F\"-\" '{ printf $1 }'" + OUTPUT_VARIABLE hip_version_string) +message(STATUS 
"${COMPILER_EXE_NAME} HIP version: ${hip_version_string}") + +##Check for ROCm version +set(EXPLICIT_ROCM_VERSION "" CACHE STRING "Explicit ROCM version to compile to (auto detect if empty)") +if(EXPLICIT_ROCM_VERSION) + set(rocm_version_string "${EXPLICIT_ROCM_VERSION}") +elseif(ROCM_PATH) + message(STATUS "Reading ROCM version from ${ROCM_PATH}/.info/version") + file(READ "${ROCM_PATH}/.info/version" rocm_version_string) +else() + message(FATAL_ERROR "Could not determine ROCM version (set EXPLICIT_ROCM_VERSION or set ROCM_PATH to a valid installation)") +endif() +string(REGEX MATCH "([0-9]+)\\.([0-9]+)\\.([0-9]+)" rocm_version_matches ${rocm_version_string}) +if (rocm_version_matches) + set(ROCM_MAJOR_VERSION ${CMAKE_MATCH_1}) + set(ROCM_MINOR_VERSION ${CMAKE_MATCH_2}) + set(ROCM_PATCH_VERSION ${CMAKE_MATCH_3}) + + message(STATUS "ROCm version: ${ROCM_MAJOR_VERSION}.${ROCM_MINOR_VERSION}.${ROCM_PATCH_VERSION}") + + # Convert the version components to int for comparison + math(EXPR ROCM_VERSION "(10000 * ${ROCM_MAJOR_VERSION}) + (100 * ${ROCM_MINOR_VERSION}) + ${ROCM_PATCH_VERSION}") + add_definitions("-DROCM_VERSION=${ROCM_VERSION}") +else() + message(WARNING "Failed to extract ROCm version.") +endif() + +## Check for RCCL +find_package(RCCL CONFIG REQUIRED HINTS "${CMAKE_PREFIX_PATH}" PATHS "${ROCM_PATH}") +if (RCCL_FOUND) + message(STATUS "RCCL version : ${RCCL_VERSION}") + message(STATUS "RCCL include path : ${RCCL_INCLUDE_DIRS}") + message(STATUS "RCCL libraries : ${RCCL_LIBRARIES}") +endif() + +## Check for MPI (if enabled) +if (USE_MPI) + find_package(MPI REQUIRED) + if (MPI_FOUND) + message(STATUS "MPI include path : ${MPI_CXX_INCLUDE_PATH}") + message(STATUS "MPI libraries : ${MPI_CXX_LIBRARIES}") + add_definitions(-DMPI_SUPPORT) + else() + message ("-- no MPI library found") + endif() +else() + message ("-- MPI support disabled") +endif() + +set(ROCM_USE_DEV_COMPONENT OFF) # This repo doesn't have a dev component + +# Add all of the tests 
+add_subdirectory(src) + +rocm_setup_version(VERSION "2.14.1") + +# Create ROCm standard packages +rocm_create_package( + NAME rccl-tests + DESCRIPTION "Tests for the ROCm Communication Collectives Library" + MAINTAINER "RCCL Maintainer " +) diff --git a/projects/rccl-tests/LICENSE.txt b/projects/rccl-tests/LICENSE.txt new file mode 100644 index 0000000000..d2e566e3e2 --- /dev/null +++ b/projects/rccl-tests/LICENSE.txt @@ -0,0 +1,28 @@ + + Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved. + Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of NVIDIA CORPORATION, nor the names of their + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/projects/rccl-tests/Makefile b/projects/rccl-tests/Makefile new file mode 100644 index 0000000000..f652b78a99 --- /dev/null +++ b/projects/rccl-tests/Makefile @@ -0,0 +1,23 @@ +# +# Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +# +# See LICENCE.txt for license information +# + +BUILDDIR ?= build +override BUILDDIR := $(abspath $(BUILDDIR)) + +.PHONY: all clean + +default: src.build + +TARGETS=src + +all: ${TARGETS:%=%.build} +clean: ${TARGETS:%=%.clean} + +%.build: + ${MAKE} -C $* build BUILDDIR=${BUILDDIR} + +%.clean: + ${MAKE} -C $* clean BUILDDIR=${BUILDDIR} diff --git a/projects/rccl-tests/NOTICES.txt b/projects/rccl-tests/NOTICES.txt new file mode 100644 index 0000000000..6f49d61763 --- /dev/null +++ b/projects/rccl-tests/NOTICES.txt @@ -0,0 +1,66 @@ +Notices and Licenses file +_______________________________________________________________ + +Dependencies on nvidia-nccl-tests v2.0.0 (BSD3) +Copyright (c) 2016-2017, NVIDIA CORPORATION. +Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National + Laboratory, the U.S. Department of Energy, nor the names of their + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The U.S. Department of Energy funded the development of this software +under subcontract 7078610 with Lawrence Berkeley National Laboratory. + + +nvidia-nccl-tests v2.0.0 (BSD2) +Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National + Laboratory, the U.S. Department of Energy, nor the names of their + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The U.S. Department of Energy funded the development of this software +under subcontract 7078610 with Lawrence Berkeley National Laboratory. diff --git a/projects/rccl-tests/README.md b/projects/rccl-tests/README.md new file mode 100644 index 0000000000..6ae8c81db9 --- /dev/null +++ b/projects/rccl-tests/README.md @@ -0,0 +1,189 @@ +# RCCL Tests + +These tests check both the performance and the correctness of RCCL operations. They can be compiled against [RCCL](https://github.com/ROCm/rccl). + +## Build + +To build the tests, just type `make` or `make -j` + +If HIP is not installed in `/opt/rocm`, you may specify `HIP_HOME`. Similarly, if RCCL (`librccl.so`) is not installed in `/opt/rocm/lib/`, you may specify `NCCL_HOME` and `CUSTOM_RCCL_LIB`. + +```shell +$ make HIP_HOME=/path/to/hip NCCL_HOME=/path/to/rccl +``` + +RCCL Tests rely on MPI to work on multiple processes, hence multiple nodes. 
+ +> [!TIP] +> To compile RCCL tests with MPI support, you need to set `MPI=1` and set `MPI_HOME` to the path where MPI is installed. + +```shell +$ make MPI=1 MPI_HOME=/path/to/mpi HIP_HOME=/path/to/hip NCCL_HOME=/path/to/rccl +``` + +RCCL Tests can also be built using cmake. A typical sequence will be: + +```shell +$ mkdir build +$ cd build +$ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=/path/to/rocm .. +$ make +``` + +When using the cmake build procedure for building RCCL-Tests with custom/user-built `librccl.so`, please make sure that RCCL has been installed (i.e. using `make install`) and that you are not pointing to the RCCL `build` directory, since cmake will check for cmake target and config files. This is not necessary as one can modify `LD_LIBRARY_PATH` to point to the custom/user-built `librccl.so` when running RCCL Tests. + +Using the cmake method also has the advantage that it automatically checks for MPI installation during the build. The tests can be compiled with MPI support by adding the `-DUSE_MPI=ON` flag to the cmake command line. + +> [!TIP] +> Users can choose to link against a particular MPI library by using one of these options: +> * setting the environment variable `MPI_HOME`. +> * adding the path to the MPI library to the cmake prefix path with `-DCMAKE_PREFIX_PATH`. +> * including the paths to MPI `bin` and `lib` in the `PATH` and `LD_LIBRARY_PATH` environment variables, respectively. + +e.g., +```shell +$ mkdir build +$ cd build +$ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="/path/to/mpi;/path/to/rocm" -DUSE_MPI=ON .. +$ make +``` + +By default, for both Makefile and `cmake` based builds, RCCL Tests will link against all supported GPU targets (defined in `src/Makefile` and as `DEFAULT_GPUS` in `CMakeLists.txt`). + +To target specific GPU(s), and potentially reduce build time, use: +* `GPU_TARGETS` as a `,` separated string listing GPU(s) to target for Makefile based build. 
build RCCL-Tests using Makefile only for `gfx942` and `gfx950`. e.g., + ```shell + $ GPU_TARGETS="gfx942,gfx950" make MPI=1 MPI_HOME=/path/to/mpi NCCL_HOME=/opt/rocm + ``` +* `-DGPU_TARGETS` as a `;` separated string listing GPU(s) to target for `cmake` based build. +e.g. build RCCL-Tests using CMake for `gfx90a`, `gfx942` and `gfx1200`. e.g., + ```shell + $ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="/path/to/mpi;/path/to/rocm" -DUSE_MPI=ON -DGPU_TARGETS="gfx90a;gfx942;gfx1200;" .. + ``` +* For CMake builds, we also have another flag `DBUILD_LOCAL_GPU_TARGET_ONLY` that queries and builds for the local GPU target only (similar to RCCL). + ```shell + $ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="/path/to/mpi;/path/to/rocm" -DUSE_MPI=ON -DBUILD_LOCAL_GPU_TARGET_ONLY=ON .. + ``` + +`-DBUILD_LOCAL_GPU_TARGET_ONLY` will not work with `docker build`-based setups, as the docker build engine is unable to query the local GPU architecture. Please use `-DGPU_TARGETS` for CMake-based builds or `GPU_TARGETS` for Makefile-based builds when building RCCL-Tests using a Dockerfile and `docker build`. + +## Usage + +RCCL Tests can run on multiple processes, multiple threads, and multiple HIP devices per thread. The number of process is managed by MPI and is therefore not passed to the tests as argument. The total number of ranks (=HIP devices) will be equal to (number of processes)\*(number of threads)\*(number of GPUs per thread). 
+ +### Quick examples + +Run on a single node with 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes : + +```shell +$ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8 +``` + +Run 64 MPI processes on nodes with 8 GPUs each, for a total of 64 GPUs spread across 8 nodes : +(NB: The rccl-tests binaries must be compiled with `MPI=1` for this case) + +```shell +$ mpirun -np 64 -N 8 ./build/all_reduce_perf -b 8 -e 8G -f 2 -g 1 +``` + +> [!TIP] +> For performance-oriented runs, on both single-node and multi-node, we suggest using 1 MPI process per GPU and `-g 1`. So, a run on 8 GPUs looks like : +> ```shell +> $ mpirun -np 8 --bind-to numa ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 1 +> ``` +> Running with 1 MPI process per GPU ensures a 1:1 mapping for CPUs and GPUs, which can be beneficial for smaller message sizes and better represents the real-world use of RCCL in Deep Learning frameworks like PyTorch and TensorFlow. + +### Performance + +See the [Performance](doc/PERFORMANCE.md) page for an explanation of the numbers, and in particular the "busbw" column. + +#### Environment variables +On some earlier versions of ROCm (before ROCm 6.4.0), setting `HSA_NO_SCRATCH_RECLAIM=1` as part of the environment is necessary to achieve better performance on MI300 GPUs. When running without MPI, a command similar to the following one should be sufficient: +```shell +HSA_NO_SCRATCH_RECLAIM=1 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8 +``` + +For MPI (using MPICH), you need to use a command similar to the following: +```shell +mpirun.mpich -np 8 -env NCCL_DEBUG=VERSION -env HSA_NO_SCRATCH_RECLAIM=1 ./build/all_reduce_perf -b 8M -e 128M -i 8388608 -g 1 -d bfloat16 +``` + +### Arguments + +All tests support the same set of arguments : + +* Number of GPUs + * `-t,--nthreads ` number of threads per process. Default : 1. + * `-g,--ngpus ` number of GPUs per thread. Default : 1. +* Sizes to scan + * `-b,--minbytes ` minimum size to start with. Default : 32M. 
+ * `-e,--maxbytes ` maximum size to end at. Default : 32M. + * Increments can be either fixed or a multiplication factor. Only one of those should be used + * `-i,--stepbytes ` fixed increment between sizes. Default : 1M. + * `-f,--stepfactor ` multiplication factor between sizes. Default : disabled. +* RCCL operations arguments + * `-o,--op ` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum. + * `-d,--datatype ` Specify which datatype to use. Default : Float. + * `-r,--root ` Specify which root to use. Only for operations with a root like broadcast or reduce. Default : 0. + * `-y,--memory_type ` Default: Coarse + * `-u,--cumask ` Default: None +* Performance + * `-n,--iters ` number of iterations. Default : 20. + * `-w,--warmup_iters ` number of warmup iterations (not timed). Default : 5. + * `-m,--agg_iters ` number of operations to aggregate together in each iteration. Default : 1. + * `-N,--run_cycles ` run & print each cycle. Default : 1; 0=infinite. + * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1. +* Test operation + * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0. + * `-c,--check ` perform count iterations, checking correctness of results on each iteration. This can be quite slow on large numbers of GPUs. Default : 1. + * `-z,--blocking <0/1>` Make RCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0. + * `-G,--hipgraph ` Capture iterations as a HIP graph and then replay specified number of times. Default : 0. + * `-C,--report_cputime <0/1>]` Report CPU time instead of latency. Default : 0. + * `-R,--local_register <0/1/2>` enable local (1) or symmetric (2) buffer registration on send/recv buffers. Default : 0. + * `-T,--timeout