diff --git a/projects/rccl-tests/.gitignore b/projects/rccl-tests/.gitignore index a0a013e438..9edaab3568 100644 --- a/projects/rccl-tests/.gitignore +++ b/projects/rccl-tests/.gitignore @@ -1,4 +1,7 @@ # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # See LICENCE.txt for license information -/build +build/ +*.gcov +/coverage/ +__pycache__/ diff --git a/projects/rccl-tests/CMakeLists.txt b/projects/rccl-tests/CMakeLists.txt index a772522ca4..f4ae1c76ac 100644 --- a/projects/rccl-tests/CMakeLists.txt +++ b/projects/rccl-tests/CMakeLists.txt @@ -1,73 +1,190 @@ -# ######################################################################## -# Copyright 2022 Advanced Micro Devices, Inc. -# ######################################################################## -#Adding pthread flag for linking -set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") +# Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved. -cmake_minimum_required(VERSION 3.16.3 FATAL_ERROR) +# CMake version minimum requirements +#================================================================================================== +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) -project(RCCL-tests VERSION 2.12.10 LANGUAGES CXX) - -# Get ROCm path from environment if available -if (DEFINED ENV{ROCM_PATH}) - set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to ROCm installation") -else() - set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to ROCm installation") +# CMake Toolchain file to define compilers and path to ROCm +#================================================================================================== +if (NOT CMAKE_TOOLCHAIN_FILE) + set(CMAKE_TOOLCHAIN_FILE "${CMAKE_CURRENT_SOURCE_DIR}/toolchain-linux.cmake") + message(STATUS "CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}") endif() -# Set CMake/CPack variables -list( APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/llvm) -set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "Prefix install path") 
-set(CPACK_PACKAGING_INSTALL_PREFIX "${ROCM_PATH}" CACHE PATH "Path to install to when packaged.") -set(CMAKE_CXX_STANDARD 14) +# RCCL Tests project +#================================================================================================== +project(rccl-tests LANGUAGES CXX) -# Get additional packages required -find_package(ROCM 0.7.3 CONFIG REQUIRED PATHS "${ROCM_PATH}") -include(ROCMSetupVersion) -include(ROCMCreatePackage) -include(ROCMInstallTargets) -include(ROCMCheckTargetIds) -include(ROCMClients) +# Build options +#================================================================================================== +option(USE_MPI "Build RCCL-tests with MPI support." OFF) +option(BUILD_LOCAL_GPU_TARGET_ONLY "Build only for GPUs detected on this machine" OFF) -# Build variables -option(USE_MPI "Build RCCL-tests with MPI support.") +if (NOT CMAKE_BUILD_TYPE) + message(WARNING "CMAKE_BUILD_TYPE is not defined. Setting to Release") + set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Default build type") +endif() # Default GPU architectures to build #================================================================================================== set(DEFAULT_GPUS - gfx803 - gfx900:xnack- - gfx906:xnack- - gfx908:xnack- - gfx90a:xnack- - gfx90a:xnack+ - gfx940 - gfx941 + gfx906 + gfx908 + gfx90a gfx942 + gfx950 gfx1030 gfx1100 gfx1101 - gfx1102) + gfx1102 + gfx1200 + gfx1201) -set(AMDGPU_TARGETS ${DEFAULT_GPUS} CACHE STRING "Target default GPUs if AMDGPU_TARGETS is not defined.") -## Determine which GPU architectures to build for -if (COMMAND rocm_check_target_ids) - message(STATUS "Checking for ROCm support for GPU targets:") - rocm_check_target_ids(SUPPORTED_GPUS TARGETS "${AMDGPU_TARGETS}") -else() - message(WARNING "Unable to check for supported GPU targets. 
Falling back to default GPUs") - set(SUPPORTED_GPUS ${DEFAULT_GPUS}) +# Get additional packages required +include(CheckIncludeFiles) +include(CheckSymbolExists) +include(cmake/Dependencies.cmake) # rocm-cmake, rocm_local_targets +include(cmake/CheckSymbolExistsNoWarn.cmake) + +# Build only for local GPU architecture +if (BUILD_LOCAL_GPU_TARGET_ONLY) + message(STATUS "Building only for local GPU target") + if (COMMAND rocm_local_targets) + rocm_local_targets(DEFAULT_GPUS) + else() + message(WARNING "Unable to determine local GPU targets. Falling back to default GPUs.") + endif() endif() -set(GPU_TARGETS "${SUPPORTED_GPUS}" CACHE STRING "List of specific GPU architectures to build for.") + +# Determine which GPU architectures to build for +set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if GPU_TARGETS is not defined.") + +# Check if clang compiler can offload to GPU_TARGETS +if (COMMAND rocm_check_target_ids) + message(STATUS "Checking for ROCm support for GPU targets: " "${GPU_TARGETS}") + rocm_check_target_ids(SUPPORTED_GPUS TARGETS ${GPU_TARGETS}) +else() + message(WARNING "Unable to check for supported GPU targets. 
Falling back to default GPUs.") + set(SUPPORTED_GPUS ${DEFAULT_GPUS}) +endif() + +set(GPU_TARGETS "${SUPPORTED_GPUS}") message(STATUS "Compiling for ${GPU_TARGETS}") -find_package(RCCL HINTS CONFIG REQUIRED PATHS "${ROCM_PATH}") +## NOTE: Reload rocm-cmake in order to update GPU_TARGETS +include(cmake/Dependencies.cmake) # Reloading to use desired GPU_TARGETS instead of defaults + +# Try to establish ROCM_PATH (for find_package) +#================================================================================================== +if(NOT DEFINED ROCM_PATH) + # Guess default location + set(ROCM_PATH "/opt/rocm") + message(WARNING "Unable to find ROCM_PATH: Falling back to ${ROCM_PATH}") +else() + message(STATUS "ROCM_PATH found: ${ROCM_PATH}") +endif() +set(ENV{ROCM_PATH} ${ROCM_PATH}) + +if("${CMAKE_CXX_COMPILER}" MATCHES ".*amdclang\\+\\+") + message(STATUS "Compiling with amdclang++") + set(COMPILER_EXE_NAME amdclang++) + set(COMPILER_GREP_STRING "AMD clang version") + set(COMPILER_AWK_CMD "awk -F\" \" '{ printf $4}'") +elseif("${CMAKE_CXX_COMPILER}" MATCHES ".*clang\\+\\+") + message(STATUS "Compiling with clang++") + set(COMPILER_EXE_NAME clang++) + set(COMPILER_GREP_STRING "AMD clang version") + set(COMPILER_AWK_CMD "awk -F\" \" '{ printf $4}'") +elseif("${CMAKE_CXX_COMPILER}" MATCHES ".*hipcc$") + message(STATUS "Compiling with hipcc") + set(COMPILER_EXE_NAME hipcc) + set(COMPILER_GREP_STRING "HIP version") + set(COMPILER_AWK_CMD "awk -F\" \" '{ printf $3}' | awk -F\"-\" '{ printf $1}'") +else() + message(FATAL_ERROR "RCCL-Tests can be built only with hipcc or amdclang++") +endif() + +# Set CMAKE flags +#================================================================================================== +set(CMAKE_INSTALL_PREFIX "${ROCM_PATH}" CACHE PATH "") +set(CMAKE_CXX_STANDARD 14) # We use C++14 features, this will add compile option: -std=c++14 +set(CMAKE_CXX_EXTENSIONS OFF) # Without this line, it will add -std=gnu++14 instead, which has some issues. 
+set(CPACK_PACKAGING_INSTALL_PREFIX "${ROCM_PATH}" CACHE PATH "Path to install to when packaged.") +if(ROCM_PATH) + #list(APPEND CMAKE_PREFIX_PATH # Temporary workaround + list(PREPEND CMAKE_PREFIX_PATH # Add ROCM_PATH to CMake search paths (for finding HIP / HSA + ${ROCM_PATH} + ${ROCM_PATH}/hip + ${ROCM_PATH}/llvm) +endif() + +# Check for required dependencies +#================================================================================================== +## Check for Threads +set(THREADS_PREFER_PTHREAD_FLAG ON) +find_package(Threads REQUIRED) + +##Adding pthread flag for linking +#set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") + +## Check for HIP +find_package(hip REQUIRED) +message(STATUS "HIP compiler: ${HIP_COMPILER}") +message(STATUS "HIP runtime: ${HIP_RUNTIME}") +if(NOT "${HIP_COMPILER}" MATCHES "clang") + message(FATAL_ERROR "RCCL requires clang-based compiler (amdclang++ or hipcc)") +endif() + +## Check for compiler version +find_program(compiler_executable ${COMPILER_EXE_NAME}) +message(STATUS "${COMPILER_EXE_NAME} executable: ${compiler_executable}") +execute_process( + COMMAND bash "-c" "${compiler_executable} --version | grep \"${COMPILER_GREP_STRING}\" | ${COMPILER_AWK_CMD}" + OUTPUT_VARIABLE compiler_version_string) +message(STATUS "${COMPILER_EXE_NAME} version: ${compiler_version_string}") + +## Check for HIP version +find_program(hipconfig_executable hipconfig) +message(STATUS "hipconfig executable: ${hipconfig_executable}") +execute_process( + COMMAND bash "-c" "${hipconfig_executable} -v | awk -F\"-\" '{ printf $1 }'" + OUTPUT_VARIABLE hip_version_string) +message(STATUS "${COMPILER_EXE_NAME} HIP version: ${hip_version_string}") + +##Check for ROCm version +set(EXPLICIT_ROCM_VERSION "" CACHE STRING "Explicit ROCM version to compile to (auto detect if empty)") +if(EXPLICIT_ROCM_VERSION) + set(rocm_version_string "${EXPLICIT_ROCM_VERSION}") +elseif(ROCM_PATH) + message(STATUS "Reading ROCM version from ${ROCM_PATH}/.info/version") + 
file(READ "${ROCM_PATH}/.info/version" rocm_version_string) +else() + message(FATAL_ERROR "Could not determine ROCM version (set EXPLICIT_ROCM_VERSION or set ROCM_PATH to a valid installation)") +endif() +string(REGEX MATCH "([0-9]+)\\.([0-9]+)\\.([0-9]+)" rocm_version_matches ${rocm_version_string}) +if (rocm_version_matches) + set(ROCM_MAJOR_VERSION ${CMAKE_MATCH_1}) + set(ROCM_MINOR_VERSION ${CMAKE_MATCH_2}) + set(ROCM_PATCH_VERSION ${CMAKE_MATCH_3}) + + message(STATUS "ROCm version: ${ROCM_MAJOR_VERSION}.${ROCM_MINOR_VERSION}.${ROCM_PATCH_VERSION}") + + # Convert the version components to int for comparison + math(EXPR ROCM_VERSION "(10000 * ${ROCM_MAJOR_VERSION}) + (100 * ${ROCM_MINOR_VERSION}) + ${ROCM_PATCH_VERSION}") + add_definitions("-DROCM_VERSION=${ROCM_VERSION}") +else() + message(WARNING "Failed to extract ROCm version.") +endif() + +## Check for RCCL +find_package(RCCL CONFIG REQUIRED HINTS "${CMAKE_PREFIX_PATH}" PATHS "${ROCM_PATH}") if (RCCL_FOUND) message(STATUS "RCCL version : ${RCCL_VERSION}") message(STATUS "RCCL include path : ${RCCL_INCLUDE_DIRS}") message(STATUS "RCCL libraries : ${RCCL_LIBRARIES}") endif() +## Check for MPI (if enabled) if (USE_MPI) find_package(MPI REQUIRED) if (MPI_FOUND) diff --git a/projects/rccl-tests/Makefile b/projects/rccl-tests/Makefile index cf64f3db22..f652b78a99 100644 --- a/projects/rccl-tests/Makefile +++ b/projects/rccl-tests/Makefile @@ -11,7 +11,7 @@ override BUILDDIR := $(abspath $(BUILDDIR)) default: src.build -TARGETS=$(filter-out src/hypercube.cu, $(wildcard src/*)) +TARGETS=src all: ${TARGETS:%=%.build} clean: ${TARGETS:%=%.clean} diff --git a/projects/rccl-tests/README.md b/projects/rccl-tests/README.md index c89c15bb28..4fdb74ae0d 100644 --- a/projects/rccl-tests/README.md +++ b/projects/rccl-tests/README.md @@ -1,72 +1,108 @@ # RCCL Tests -These tests check both the performance and the correctness of RCCL operations. They can be compiled against [RCCL](https://github.com/ROCmSoftwarePlatform/rccl). 
+These tests check both the performance and the correctness of RCCL operations. They can be compiled against [RCCL](https://github.com/ROCm/rccl). ## Build To build the tests, just type `make`. -If HIP is not installed in /opt/rocm, you may specify HIP\_HOME. Similarly, if RCCL is not installed in /usr, you may specify NCCL\_HOME and CUSTOM\_RCCL\_LIB. +If HIP is not installed in `/opt/rocm`, you may specify `HIP_HOME`. Similarly, if RCCL (`librccl.so`) is not installed in `/opt/rocm/lib/`, you may specify `NCCL_HOME` and `CUSTOM_RCCL_LIB`. ```shell -$ make HIP_HOME=/path/to/hip NCCL_HOME=/path/to/rccl CUSTOM_RCCL_LIB=/path/to/rccl/lib/librccl.so +$ make HIP_HOME=/path/to/hip NCCL_HOME=/path/to/rccl ``` -RCCL tests rely on MPI to work on multiple processes, hence multiple nodes. If you want to compile the tests with MPI support, you need to set MPI=1 and set MPI\_HOME to the path where MPI is installed. +RCCL Tests rely on MPI to work on multiple processes, hence multiple nodes. + +> [!TIP] +> To compile RCCL tests with MPI support, you need to set `MPI=1` and set `MPI_HOME` to the path where MPI is installed. ```shell $ make MPI=1 MPI_HOME=/path/to/mpi HIP_HOME=/path/to/hip NCCL_HOME=/path/to/rccl ``` -RCCL tests can also be built using cmake. A typical sequence will be: +RCCL Tests can also be built using cmake. A typical sequence will be: ```shell $ mkdir build $ cd build -$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/path/to/rccl .. +$ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=/path/to/rocm .. $ make ``` -When using the cmake build procedure, please make sure that RCCL has also been built using cmake (i.e. not using the install.sh script), since cmake will check -for cmake target and config files that are created during the RCCL build. +When using the cmake build procedure for building RCCL-Tests with custom/user-built `librccl.so`, please make sure that RCCL has been installed (i.e. 
using `make install`) and that cmake is not pointed at the RCCL `build` directory, since cmake will check for cmake target and config files. This step is not strictly necessary, however, as one can instead modify `LD_LIBRARY_PATH` to point to the custom/user-built `librccl.so` when running RCCL Tests. -Using the cmake method also has the advantage that the build is automatically checking for MPI installations. The tests can be compiled with MPI support by adding the `-DUSE_MPI=ON` flag to the cmake command line. A user can request to use a particular MPI library by setting the environment variable `MPI_HOME` or add the path of the MPI library to the cmake prefix path with `-DCMAKE_PREFIX_PATH`. +Using the cmake method also has the advantage that it automatically checks for an MPI installation during the build. The tests can be compiled with MPI support by adding the `-DUSE_MPI=ON` flag to the cmake command line. +> [!TIP] +> Users can choose to link against a particular MPI library by using one of these options: +> * setting the environment variable `MPI_HOME`. +> * adding the path to the MPI library to the cmake prefix path with `-DCMAKE_PREFIX_PATH`. +> * including the paths to MPI `bin` and `lib` in the `PATH` and `LD_LIBRARY_PATH` environment variables, respectively. + +e.g., +```shell +$ mkdir build +$ cd build +$ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="/path/to/mpi;/path/to/rocm" -DUSE_MPI=ON .. +$ make +``` + +By default, for both Makefile and `cmake` based builds, RCCL Tests will be built for all supported GPU targets (defined in `src/Makefile` and as `DEFAULT_GPUS` in `CMakeLists.txt`). + +To target specific GPU(s), and potentially reduce build time, use: +* `GPU_TARGETS` as a `,` separated string listing GPU(s) to target for Makefile based build. +e.g. build RCCL-Tests using Makefile only for `gfx942` and `gfx950`. 
+ ```shell + $ GPU_TARGETS="gfx942,gfx950" make MPI=1 MPI_HOME=/path/to/mpi NCCL_HOME=/opt/rocm + ``` +* `-DGPU_TARGETS` as a `;` separated string listing GPU(s) to target for `cmake` based build. +e.g. build RCCL-Tests using CMake for `gfx90a`, `gfx942` and `gfx1200`: + ```shell + $ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="/path/to/mpi;/path/to/rocm" -DUSE_MPI=ON -DGPU_TARGETS="gfx90a;gfx942;gfx1200;" .. + ``` +* For CMake builds, we also have another flag `-DBUILD_LOCAL_GPU_TARGET_ONLY` that queries and builds for the local GPU target only (similar to RCCL). + ```shell + $ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="/path/to/mpi;/path/to/rocm" -DUSE_MPI=ON -DBUILD_LOCAL_GPU_TARGET_ONLY=ON .. + ``` + +`-DBUILD_LOCAL_GPU_TARGET_ONLY` will not work with `docker build`-based setups, as the docker build engine is unable to query the local GPU architecture. Please use `-DGPU_TARGETS` for CMake-based builds or `GPU_TARGETS` for Makefile-based builds when building RCCL-Tests using a Dockerfile and `docker build`. ## Usage -RCCL tests can run on multiple processes, multiple threads, and multiple HIP devices per thread. The number of process is managed by MPI and is therefore not passed to the tests as argument. The total number of ranks (=HIP devices) will be equal to (number of processes)\*(number of threads)\*(number of GPUs per thread). +RCCL Tests can run on multiple processes, multiple threads, and multiple HIP devices per thread. The number of processes is managed by MPI and is therefore not passed to the tests as an argument. The total number of ranks (=HIP devices) will be equal to (number of processes)\*(number of threads)\*(number of GPUs per thread). 
### Quick examples -Run on 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes : +Run on a single node with 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes : ```shell $ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8 ``` -Run with MPI on 10 processes (potentially on multiple nodes) with 4 GPUs each, for a total of 40 GPUs: +Run 64 MPI processes on nodes with 8 GPUs each, for a total of 64 GPUs spread across 8 nodes : +(NB: The rccl-tests binaries must be compiled with `MPI=1` for this case) ```shell -$ mpirun -np 10 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4 +$ mpirun -np 64 -N 8 ./build/all_reduce_perf -b 8 -e 8G -f 2 -g 1 ``` -For performance-oriented runs, on both single-node and multi-node, we suggest using 1 MPI process per GPU and `-g 1`. So, a run on 8 GPUs looks like : -```shell -$ mpirun -np 8 --bind-to numa ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 1 -``` -Running with 1 MPI process per GPU ensures a 1:1 mapping for CPUs and GPUs, which can be beneficial for smaller message sizes and better represents the real-world use of RCCL in Deep Learning frameworks like Pytorch and TensorFlow. +> [!TIP] +> For performance-oriented runs, on both single-node and multi-node, we suggest using 1 MPI process per GPU and `-g 1`. So, a run on 8 GPUs looks like : +> ```shell +> $ mpirun -np 8 --bind-to numa ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 1 +> ``` +> Running with 1 MPI process per GPU ensures a 1:1 mapping for CPUs and GPUs, which can be beneficial for smaller message sizes and better represents the real-world use of RCCL in Deep Learning frameworks like PyTorch and TensorFlow. ### Performance See the [Performance](doc/PERFORMANCE.md) page for explanation about numbers, and in particular the "busbw" column. -### Environment variables -On some older versions of ROCm before 6.4.0, setting `HSA_NO_SCRATCH_RECLAIM=1` - as part of the environment might be necessary to achieve better performance. 
When running without MPI, a command similar to the following one should be sufficient: +#### Environment variables +On some earlier versions of ROCm (before ROCm 6.4.0), setting `HSA_NO_SCRATCH_RECLAIM=1` as part of the environment is necessary to achieve better performance on MI300 GPUs. When running without MPI, a command similar to the following one should be sufficient: ```shell HSA_NO_SCRATCH_RECLAIM=1 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8 ``` -For MPI, you might need to use a command similar to the following: +For MPI (using MPICH), you need to use a command similar to the following: ```shell mpirun.mpich -np 8 -env NCCL_DEBUG=VERSION -env HSA_NO_SCRATCH_RECLAIM=1 ./build/all_reduce_perf -b 8M -e 128M -i 8388608 -g 1 -d bfloat16 ``` @@ -89,37 +125,58 @@ All tests support the same set of arguments : * `-d,--datatype ` Specify which datatype to use. Default : Float. * `-r,--root ` Specify which root to use. Only for operations with a root like broadcast or reduce. Default : 0. * `-y,--memory_type ` Default: Coarse - * `-s,--stress_cycles ` Default: 1 * `-u,--cumask ` Default: None * Performance * `-n,--iters ` number of iterations. Default : 20. * `-w,--warmup_iters ` number of warmup iterations (not timed). Default : 5. * `-m,--agg_iters ` number of operations to aggregate together in each iteration. Default : 1. + * `-N,--run_cycles ` run & print each cycle. Default : 1; 0=infinite. * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1. * Test operation * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0. * `-c,--check ` perform count iterations, checking correctness of results on each iteration. This can be quite slow on large numbers of GPUs. Default : 1. - * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0. 
- * `-G,--cudagraph ` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0. + * `-z,--blocking <0/1>` Make RCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0. + * `-G,--hipgraph ` Capture iterations as a HIP graph and then replay specified number of times. Default : 0. + * `-C,--report_cputime <0/1>` Report CPU time instead of latency. Default : 0. + * `-R,--local_register <1/0>` enable local buffer registration on send/recv buffers. Default : 0. + * `-T,--timeout