diff --git a/CMakeLists.txt b/CMakeLists.txt index 05deda6be1..0e74ac100a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,8 +1,17 @@ # Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. # Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License. +# CMake version minimum requirements +#================================================================================================== cmake_minimum_required(VERSION 3.5) +# CMake Toolchain file to define compilers and path to ROCm +#================================================================================================== +if (NOT CMAKE_TOOLCHAIN_FILE) + set(CMAKE_TOOLCHAIN_FILE "${CMAKE_CURRENT_SOURCE_DIR}/toolchain-linux.cmake") + message(STATUS "CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}") +endif() + # RCCL project #================================================================================================== project(rccl CXX) diff --git a/README.md b/README.md index 9a3a81fe7d..3e0463748e 100644 --- a/README.md +++ b/README.md @@ -18,11 +18,18 @@ The collective operations are implemented using ring and tree algorithms and hav RCCL directly depends on HIP runtime plus the HIP-Clang compiler, which are part of the ROCm software stack. For ROCm installation instructions, see https://github.com/ROCm/ROCm. -The root of this repository has a helper script 'install.sh' to build and install RCCL on Ubuntu with a single command. It does not take a lot of options and hard-codes configuration that can be specified through invoking cmake directly, but it's a great way to get started quickly and can serve as an example of how to build/install. +The root of this repository has a helper script `install.sh` to build and install RCCL with a single command. It hard-codes configurations that can be specified through invoking cmake directly, but it's a great way to get started quickly and can serve as an example of how to build/install RCCL. 
+ ### To build the library using the install script: ```shell -./install.sh --help +./install.sh +``` +For more info on build options/flags when using the install script, use `./install.sh --help` +```shell +./install.sh --help +RCCL build & installation helper script Options: --address-sanitizer Build with address sanitizer enabled -d|--dependencies Install RCCL depdencencies @@ -33,37 +40,38 @@ The root of this repository has a helper script 'install.sh' to build and instal -f|--fast Quick-build RCCL (local gpu arch only, no backtrace, and collective trace support) -h|--help Prints this help message -i|--install Install RCCL library (see --prefix argument below) - -j|--jobs Specify how many parallel compilation jobs to run (nproc by default) + -j|--jobs Specify how many parallel compilation jobs to run ($nproc by default) -l|--local_gpu_only Only compile for local GPU architecture + --amdgpu_targets Only compile for specified GPU architecture(s). For multiple targets, separate them with ';' (builds for all supported GPU architectures by default) --no_clean Don't delete files if they already exist --npkit-enable Compile with npkit enabled --roctx-enable Compile with roctx enabled (example usage: rocprof --roctx-trace ./rccl-program) -p|--package_build Build RCCL package - --prefix Specify custom directory to install RCCL to (default: /opt/rocm) + --prefix Specify custom directory to install RCCL to (default: `/opt/rocm`) --rm-legacy-include-dir Remove legacy include dir Packaging added for file/folder reorg backward compatibility --run_tests_all Run all rccl unit tests (must be built already) -r|--run_tests_quick Run small subset of rccl unit tests (must be built already) --static Build RCCL as a static library instead of shared library -t|--tests_build Build rccl unit tests, but do not run - --time-trace Plot the build time of RCCL + --time-trace Plot the build time of RCCL (requires `ninja-build` package installed on the system) --verbose Show compile commands ``` ## 
Manual build -### To build the library : +### To build the library using CMake: ```shell $ git clone https://github.com/ROCm/rccl.git $ cd rccl $ mkdir build $ cd build -$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/opt/rocm/ .. +$ cmake .. $ make -j 16 # Or some other suitable number of parallel jobs ``` -You may substitute an installation path of your own choosing by passing CMAKE_INSTALL_PREFIX. For example: +You may substitute an installation path of your own choosing by passing `CMAKE_INSTALL_PREFIX`. For example: ```shell -$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/opt/rocm/ -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install .. +$ cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install .. ``` Note: ensure rocm-cmake is installed, `apt install rocm-cmake`. @@ -123,11 +131,9 @@ Please refer to the [RCCL Documentation Site](https://rocm.docs.amd.com/projects Run the steps below to build documentation locally. -``` +```shell cd docs - pip3 install -r sphinx/requirements.txt - python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . 
_build/html ``` diff --git a/install.sh b/install.sh index 771ef80f9a..f8cd414007 100755 --- a/install.sh +++ b/install.sh @@ -16,12 +16,13 @@ build_package=false build_release=true build_static=false build_tests=false -build_verbose=0 +build_verbose=false clean_build=true collective_trace=true enable_ninja="" install_dependencies=false install_library=false +install_prefix="${ROCM_PATH}" msccl_kernel_enabled=true num_parallel_jobs=$(nproc) npkit_enabled=false @@ -53,13 +54,13 @@ function display_help() echo " --npkit-enable Compile with npkit enabled" echo " --roctx-enable Compile with roctx enabled (example usage: rocprof --roctx-trace ./rccl-program)" echo " -p|--package_build Build RCCL package" - echo " --prefix Specify custom directory to install RCCL to (default: /opt/rocm)" + echo " --prefix Specify custom directory to install RCCL to (default: \`/opt/rocm\`)" echo " --rm-legacy-include-dir Remove legacy include dir Packaging added for file/folder reorg backward compatibility" echo " --run_tests_all Run all rccl unit tests (must be built already)" echo " -r|--run_tests_quick Run small subset of rccl unit tests (must be built already)" echo " --static Build RCCL as a static library instead of shared library" echo " -t|--tests_build Build rccl unit tests, but do not run" - echo " --time-trace Plot the build time of RCCL" + echo " --time-trace Plot the build time of RCCL (requires \`ninja-build\` package installed on the system)" echo " --verbose Show compile commands" } @@ -69,14 +70,14 @@ function display_help() # check if we have a modern version of getopt that can handle whitespace and long parameters getopt -T -if [[ $? -eq 4 ]]; then +if [[ "$?" 
-eq 4 ]]; then GETOPT_PARSE=$(getopt --name "${0}" --options dfhij:lprt --longoptions address-sanitizer,dependencies,debug,enable_backtrace,disable-colltrace,disable-msccl-kernel,fast,help,install,jobs:,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,verbose -- "$@") else echo "Need a new version of getopt" exit 1 fi -if [[ $? -ne 0 ]]; then +if [[ "$?" -ne 0 ]]; then echo "getopt invocation failed; could not parse the command line"; exit 1 fi @@ -101,14 +102,14 @@ while true; do --npkit-enable) npkit_enabled=true; shift ;; --roctx-enable) roctx_enabled=true; shift ;; -p | --package_build) build_package=true; shift ;; - --prefix) install_prefix=${2}; shift 2 ;; + --prefix) install_library=true; install_prefix=${2}; shift 2 ;; --rm-legacy-include-dir) build_freorg_bkwdcomp=false; shift ;; -r | --run_tests_quick) run_tests=true; shift ;; --run_tests_all) run_tests=true; run_tests_all=true; shift ;; --static) build_static=true; shift ;; -t | --tests_build) build_tests=true; shift ;; --time-trace) time_trace=true; shift ;; - --verbose) build_verbose=1; shift ;; + --verbose) build_verbose=true; shift ;; --) shift ; break ;; *) echo "Unexpected command line parameter received; aborting"; exit 1 @@ -116,8 +117,6 @@ while true; do esac done -ROCM_BIN_PATH=$ROCM_PATH/bin - # /etc/*-release files describe the system if [[ -e "/etc/os-release" ]]; then source /etc/os-release @@ -129,22 +128,36 @@ else exit 2 fi +# CMake executable +cmake_executable=cmake +time_trace_ninja_msg="apt-get install ninja-build" +case "${OS_ID}" in + centos|rhel) + cmake_executable=cmake3 + time_trace_ninja_msg="dnf install ninja-build" + ;; +esac + +# CMake build options; starts with the toolchain file (set via -DCMAKE_TOOLCHAIN_FILE because the `--toolchain` flag needs CMake >= 3.21) +cmake_common_options="-DCMAKE_TOOLCHAIN_FILE=toolchain-linux.cmake" + # throw error code after running a command in the install script check_exit_code( ) { - if (( $1 != 0 )); then - exit $1 - 
fi + if (( $1 != 0 )); then + exit "$1" + fi } -if [[ "$build_release" == true ]]; then +# set RCCL-UnitTests path +if [[ "${build_release}" == true ]]; then unit_test_path="./build/release/test/rccl-UnitTests" else unit_test_path="./build/debug/test/rccl-UnitTests" fi -if ($run_tests) && [[ -f $unit_test_path ]]; then - if [[ "$build_tests" == false ]]; then +if [[ "${run_tests}" == true ]] && [[ -f "${unit_test_path}" ]]; then + if [[ "${build_tests}" == false ]]; then clean_build=false fi fi @@ -153,7 +166,7 @@ fi # prep # ################################################# # ensure a clean build environment -if ($clean_build); then +if [[ "${clean_build}" == true ]]; then if [[ "${build_release}" == true ]]; then rm -rf build/release else @@ -164,7 +177,8 @@ fi # Create and go to the build directory. mkdir -p build; cd build -if ($build_release); then +# Create and go to build type directory +if [[ "${build_release}" == true ]]; then mkdir -p release; cd release else mkdir -p debug; cd debug @@ -190,17 +204,15 @@ fi # Backward compatibility wrappers if [[ "${build_freorg_bkwdcomp}" == true ]]; then cmake_common_options="${cmake_common_options} -DBUILD_FILE_REORG_BACKWARD_COMPATIBILITY=ON" -else - cmake_common_options="${cmake_common_options} -DBUILD_FILE_REORG_BACKWARD_COMPATIBILITY=OFF" fi # Build local GPU arch only -if [[ "$build_local_gpu_only" == true ]]; then +if [[ "${build_local_gpu_only}" == true ]]; then cmake_common_options="${cmake_common_options} -DBUILD_LOCAL_GPU_TARGET_ONLY=ON" fi # Build for specified GPU target(s) only -if [[ ! -z "$build_amdgpu_targets" ]]; then +if [[ ! 
-z "${build_amdgpu_targets}" ]]; then cmake_common_options="${cmake_common_options} -DAMDGPU_TARGETS=${build_amdgpu_targets}" fi @@ -214,29 +226,29 @@ if [[ "${collective_trace}" == false ]]; then cmake_common_options="${cmake_common_options} -DCOLLTRACE=OFF" fi +# Disable msccl kernel if [[ "${msccl_kernel_enabled}" == false ]]; then cmake_common_options="${cmake_common_options} -DENABLE_MSCCL_KERNEL=OFF" fi # Install dependencies -if ($install_dependencies); then +if [[ "${install_dependencies}" == true ]]; then cmake_common_options="${cmake_common_options} -DINSTALL_DEPENDENCIES=ON" fi +# Install RCCL library +if [[ "${install_library}" == true ]]; then + cmake_common_options="${cmake_common_options} -DCMAKE_INSTALL_PREFIX=${install_prefix}" +fi + # Enable ROCTX if [[ "${roctx_enabled}" == true ]]; then cmake_common_options="${cmake_common_options} -DROCTX=ON" fi -cmake_executable=cmake -case "${OS_ID}" in - centos|rhel) - cmake_executable=cmake3 - ;; -esac - +# Enable NPKit npkit_options="" -if ($npkit_enabled); then +if [[ "${npkit_enabled}" == true ]]; then npkit_options="-DENABLE_NPKIT \ -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU \ -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU \ @@ -347,59 +359,80 @@ fi check_exit_code "$?" -if ($time_trace); then +# Enable ninja build for time tracing +if [[ "${time_trace}" == true ]]; then + if ! hash ninja &>/dev/null ; then + echo "ninja could not be found" + echo "Use \"${time_trace_ninja_msg}\" to install ninja" + exit 1 + fi build_system="ninja" enable_ninja="-GNinja" else build_system="make" fi -if ($build_tests) || (($run_tests) && [[ ! -f ./test/rccl-UnitTests ]]); then - CXX=$ROCM_BIN_PATH/hipcc $cmake_executable $cmake_common_options -DBUILD_TESTS=ON -DNPKIT_FLAGS="${npkit_options}" -DCMAKE_INSTALL_PREFIX=$ROCM_PATH -DROCM_PATH=$ROCM_PATH -DONLY_FUNCS="$ONLY_FUNCS" $enable_ninja ../../. 
+# Add common CMake options
+cmake_common_options="${cmake_common_options} -DROCM_PATH=${ROCM_PATH} ${enable_ninja}"
+
+# Build RCCL-UnitTests, if enabled
+if [[ "${build_tests}" == true ]] || ([[ "${run_tests}" == true ]] && [[ ! -x ./test/rccl-UnitTests ]]); then
+    cmake_common_options="${cmake_common_options} -DBUILD_TESTS=ON"
+fi
+
+# Initiate RCCL CMake
+# NPKIT_FLAGS and ONLY_FUNCS are passed as individually quoted arguments (not via the
+# word-split ${cmake_common_options}) so that values containing spaces reach CMake intact
+${cmake_executable} ${cmake_common_options} -DNPKIT_FLAGS="${npkit_options}" -DONLY_FUNCS="${ONLY_FUNCS}" ../../.
+check_exit_code "$?"
+
+# Enable verbose build output (`VERBOSE=1` is a make variable; ninja expects `-v`)
+if [[ "${build_verbose}" == true ]]; then
+    if [[ "${build_system}" == "ninja" ]]; then build_system="${build_system} -v"; else build_system="${build_system} VERBOSE=1"; fi
+fi
+
+# Initiate RCCL build (and install)
+if [[ "${install_library}" == true ]]; then
+    ${build_system} -j ${num_parallel_jobs} install
 else
-    CXX=$ROCM_BIN_PATH/hipcc $cmake_executable $cmake_common_options -DBUILD_TESTS=OFF -DNPKIT_FLAGS="${npkit_options}" -DCMAKE_INSTALL_PREFIX=$ROCM_PATH -DROCM_PATH=$ROCM_PATH -DONLY_FUNCS="$ONLY_FUNCS" $enable_ninja ../../.
+    ${build_system} -j ${num_parallel_jobs}
 fi
 check_exit_code "$?"
 
-if ($install_library); then
-    VERBOSE=${build_verbose} $build_system -j $num_parallel_jobs install
-else
-    VERBOSE=${build_verbose} $build_system -j $num_parallel_jobs
-fi
-check_exit_code "$?"
-
-if ($build_package); then
+# Initiate package build with `make package`, if enabled
+if [[ "${build_package}" == true ]]; then
     make package
     check_exit_code "$?"
 fi
 
-# Optionally, run tests if they're enabled.
+# Optionally, run RCCL-UnitTests, if they're enabled.
+if [[ "${run_tests}" == true ]]; then + if [[ -x "./test/rccl-UnitTests" ]]; then + if [[ "${run_tests_all}" == true ]]; then ./test/rccl-UnitTests else ./test/rccl-UnitTests --gtest_filter="AllReduce.*" fi else - echo "rccl unit tests have not been built yet; please re-run script with -t to build rccl unit tests." + echo "RCCL-UnitTests have not been built yet; Please re-run script with \"-t\" to build RCCL-UnitTests." exit 1 fi fi -if ($time_trace); then - search_dir="../../" - time_trace_dir=$(find "$search_dir" -type d -name "time-trace" -print -quit) +# Generate time trace for RCCL build using tools/time-trace +if [[ "${time_trace}" == true ]]; then + search_dir="../../tools" + time_trace_dir=$(find "${search_dir}" -type d -name "time-trace" -print -quit) - if [ "$time_trace_dir" ]; then - time_trace_script="$time_trace_dir/rccl-TimeTrace.sh" - if [ -x "$time_trace_script" ]; then + if [[ -n "${time_trace_dir}" ]]; then + time_trace_script="${time_trace_dir}/rccl-TimeTrace.sh" + if [[ -x "${time_trace_script}" ]]; then echo "Generating RCCL-compile-timeline.html..." - (cd "$time_trace_dir" && ./rccl-TimeTrace.sh) + (cd "${time_trace_dir}" && ./rccl-TimeTrace.sh) else - echo "Error: Unable to execute $time_trace_script. Make sure the file has the correct permissions." + echo "Error: Unable to execute ${time_trace_script}. Make sure the file has the correct permissions." fi else - echo "Error: time-trace folder not found in $search_dir." + echo "Error: time-trace folder not found in ${search_dir}." 
fi
 fi
diff --git a/toolchain-linux.cmake b/toolchain-linux.cmake
new file mode 100644
index 0000000000..5a5a8b0416
--- /dev/null
+++ b/toolchain-linux.cmake
@@ -0,0 +1,27 @@
+# Toolchain file for RCCL on Linux: selects the ROCm location and the default compilers.
+#
+# ROCm location used to locate hipcc, in order of precedence:
+#   1. the ROCM_PATH environment variable, when set and non-empty
+#   2. the ROCM_PATH cache variable (e.g. -DROCM_PATH=<dir>), which defaults to /opt/rocm
+# The CXX and CC environment variables, when set and non-empty, override the hipcc default.
+
+if (NOT "$ENV{ROCM_PATH}" STREQUAL "")
+  set(rocm_bin "$ENV{ROCM_PATH}/bin")
+else()
+  set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to the ROCm installation.")
+  # Use the cached value so that a user-supplied -DROCM_PATH=<dir> is honored
+  set(rocm_bin "${ROCM_PATH}/bin")
+endif()
+
+# Compiler cache entries point at files, hence FILEPATH (PATH is meant for directories)
+if (NOT "$ENV{CXX}" STREQUAL "")
+  set(CMAKE_CXX_COMPILER "$ENV{CXX}" CACHE FILEPATH "Path to the C++ compiler")
+else()
+  set(CMAKE_CXX_COMPILER "${rocm_bin}/hipcc" CACHE FILEPATH "Path to the C++ compiler")
+endif()
+
+if (NOT "$ENV{CC}" STREQUAL "")
+  set(CMAKE_C_COMPILER "$ENV{CC}" CACHE FILEPATH "Path to the C compiler")
+else()
+  set(CMAKE_C_COMPILER "${rocm_bin}/hipcc" CACHE FILEPATH "Path to the C compiler")
+endif()