[BUILD] Update install.sh for RCCL build (#1191)

Signed-off-by: nileshnegi <Nilesh.Negi@amd.com>
This commit is contained in:
Nilesh M Negi
2024-05-31 17:58:34 -05:00
کامیت شده توسط GitHub
والد 1249a6c3fd
کامیت 5aaf7121d9
4فایلهای تغییر یافته به همراه135 افزوده شده و 68 حذف شده
+9
مشاهده پرونده
@@ -1,8 +1,17 @@
# Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
# Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
# Minimum CMake version needed to configure this project
#==================================================================================================
cmake_minimum_required(VERSION 3.5)

# Default toolchain: when the caller did not supply CMAKE_TOOLCHAIN_FILE, fall back to the
# toolchain-linux.cmake file that sits next to this CMakeLists.txt. It defines the compilers
# and the path to ROCm. This has to happen before project(), which is where the toolchain
# file is read.
#==================================================================================================
if(NOT CMAKE_TOOLCHAIN_FILE)
    set(CMAKE_TOOLCHAIN_FILE "${CMAKE_CURRENT_SOURCE_DIR}/toolchain-linux.cmake")
    message(STATUS "CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}")
endif()

# Declare the RCCL project (C++ only)
#==================================================================================================
project(rccl LANGUAGES CXX)
+18 -12
مشاهده پرونده
@@ -18,11 +18,18 @@ The collective operations are implemented using ring and tree algorithms and hav
RCCL directly depends on HIP runtime plus the HIP-Clang compiler, which are part of the ROCm software stack.
For ROCm installation instructions, see https://github.com/ROCm/ROCm.
The root of this repository has a helper script 'install.sh' to build and install RCCL on Ubuntu with a single command. It does not take a lot of options and hard-codes configuration that can be specified through invoking cmake directly, but it's a great way to get started quickly and can serve as an example of how to build/install.
The root of this repository has a helper script `install.sh` to build and install RCCL with a single command. It hard-codes configurations that can be specified through invoking cmake directly, but it's a great way to get started quickly and can serve as an example of how to build/install RCCL.
### To build the library using the install script:
```shell
./install.sh --help
./install.sh
```
For more info on build options/flags when using the install script, use `./install.sh --help`
```shell
./install.sh --help
RCCL build & installation helper script
Options:
--address-sanitizer Build with address sanitizer enabled
-d|--dependencies Install RCCL dependencies
@@ -33,37 +40,38 @@ The root of this repository has a helper script 'install.sh' to build and instal
-f|--fast Quick-build RCCL (local gpu arch only, no backtrace, and collective trace support)
-h|--help Prints this help message
-i|--install Install RCCL library (see --prefix argument below)
-j|--jobs Specify how many parallel compilation jobs to run (nproc by default)
-j|--jobs Specify how many parallel compilation jobs to run ($(nproc) by default)
-l|--local_gpu_only Only compile for local GPU architecture
--amdgpu_targets Only compile for specified GPU architecture(s). For multiple targets, separate by ';' (builds for all supported GPU architectures by default)
--no_clean Don't delete files if they already exist
--npkit-enable Compile with npkit enabled
--roctx-enable Compile with roctx enabled (example usage: rocprof --roctx-trace ./rccl-program)
-p|--package_build Build RCCL package
--prefix Specify custom directory to install RCCL to (default: /opt/rocm)
--prefix Specify custom directory to install RCCL to (default: `/opt/rocm`)
--rm-legacy-include-dir Remove legacy include dir Packaging added for file/folder reorg backward compatibility
--run_tests_all Run all rccl unit tests (must be built already)
-r|--run_tests_quick Run small subset of rccl unit tests (must be built already)
--static Build RCCL as a static library instead of shared library
-t|--tests_build Build rccl unit tests, but do not run
--time-trace Plot the build time of RCCL
--time-trace Plot the build time of RCCL (requires `ninja-build` package installed on the system)
--verbose Show compile commands
```
## Manual build
### To build the library :
### To build the library using CMake:
```shell
$ git clone https://github.com/ROCm/rccl.git
$ cd rccl
$ mkdir build
$ cd build
$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/opt/rocm/ ..
$ cmake ..
$ make -j 16 # Or some other suitable number of parallel jobs
```
You may substitute an installation path of your own choosing by passing CMAKE_INSTALL_PREFIX. For example:
You may substitute an installation path of your own choosing by passing `CMAKE_INSTALL_PREFIX`. For example:
```shell
$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/opt/rocm/ -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install ..
$ cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install ..
```
Note: ensure rocm-cmake is installed, `apt install rocm-cmake`.
@@ -123,11 +131,9 @@ Please refer to the [RCCL Documentation Site](https://rocm.docs.amd.com/projects
Run the steps below to build documentation locally.
```
```shell
cd docs
pip3 install -r sphinx/requirements.txt
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
```
+89 -56
مشاهده پرونده
@@ -16,12 +16,13 @@ build_package=false
build_release=true
build_static=false
build_tests=false
build_verbose=0
build_verbose=false
clean_build=true
collective_trace=true
enable_ninja=""
install_dependencies=false
install_library=false
install_prefix="${ROCM_PATH}"
msccl_kernel_enabled=true
num_parallel_jobs=$(nproc)
npkit_enabled=false
@@ -53,13 +54,13 @@ function display_help()
echo " --npkit-enable Compile with npkit enabled"
echo " --roctx-enable Compile with roctx enabled (example usage: rocprof --roctx-trace ./rccl-program)"
echo " -p|--package_build Build RCCL package"
echo " --prefix Specify custom directory to install RCCL to (default: /opt/rocm)"
echo " --prefix Specify custom directory to install RCCL to (default: \`/opt/rocm\`)"
echo " --rm-legacy-include-dir Remove legacy include dir Packaging added for file/folder reorg backward compatibility"
echo " --run_tests_all Run all rccl unit tests (must be built already)"
echo " -r|--run_tests_quick Run small subset of rccl unit tests (must be built already)"
echo " --static Build RCCL as a static library instead of shared library"
echo " -t|--tests_build Build rccl unit tests, but do not run"
echo " --time-trace Plot the build time of RCCL"
echo " --time-trace Plot the build time of RCCL (requires \`ninja-build\` package installed on the system)"
echo " --verbose Show compile commands"
}
@@ -69,14 +70,14 @@ function display_help()
# check if we have a modern version of getopt that can handle whitespace and long parameters
getopt -T
if [[ $? -eq 4 ]]; then
if [[ "$?" -eq 4 ]]; then
GETOPT_PARSE=$(getopt --name "${0}" --options dfhij:lprt --longoptions address-sanitizer,dependencies,debug,enable_backtrace,disable-colltrace,disable-msccl-kernel,fast,help,install,jobs:,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,verbose -- "$@")
else
echo "Need a new version of getopt"
exit 1
fi
if [[ $? -ne 0 ]]; then
if [[ "$?" -ne 0 ]]; then
echo "getopt invocation failed; could not parse the command line";
exit 1
fi
@@ -101,14 +102,14 @@ while true; do
--npkit-enable) npkit_enabled=true; shift ;;
--roctx-enable) roctx_enabled=true; shift ;;
-p | --package_build) build_package=true; shift ;;
--prefix) install_prefix=${2}; shift 2 ;;
--prefix) install_library=true; install_prefix=${2}; shift 2 ;;
--rm-legacy-include-dir) build_freorg_bkwdcomp=false; shift ;;
-r | --run_tests_quick) run_tests=true; shift ;;
--run_tests_all) run_tests=true; run_tests_all=true; shift ;;
--static) build_static=true; shift ;;
-t | --tests_build) build_tests=true; shift ;;
--time-trace) time_trace=true; shift ;;
--verbose) build_verbose=1; shift ;;
--verbose) build_verbose=true; shift ;;
--) shift ; break ;;
*) echo "Unexpected command line parameter received; aborting";
exit 1
@@ -116,8 +117,6 @@ while true; do
esac
done
ROCM_BIN_PATH=$ROCM_PATH/bin
# /etc/*-release files describe the system
if [[ -e "/etc/os-release" ]]; then
source /etc/os-release
@@ -129,22 +128,36 @@ else
exit 2
fi
# CMake executable
cmake_executable=cmake
time_trace_ninja_msg="apt-get install ninja-build"
case "${OS_ID}" in
centos|rhel)
cmake_executable=cmake3
time_trace_ninja_msg="dnf install ninja-build"
;;
esac
# CMake build options; starts with toolchain info
cmake_common_options="--toolchain=toolchain-linux.cmake"
# check_exit_code STATUS
#
# Abort the install script with exit code STATUS when STATUS is non-zero;
# return normally when it is zero. Call sites in this script pass "$?"
# immediately after the command whose result they want to enforce.
check_exit_code( )
{
# Arithmetic comparison: any non-zero status terminates the script with it.
if (( $1 != 0 )); then
exit $1
fi
# NOTE(review): same test again, differing only in quoting the exit argument.
# This looks like the pre- and post-change bodies of a diff shown back to
# back -- confirm against the actual script.
if (( $1 != 0 )); then
exit "$1"
fi
}
if [[ "$build_release" == true ]]; then
# set RCCL-UnitTests path
if [[ "${build_release}" == true ]]; then
unit_test_path="./build/release/test/rccl-UnitTests"
else
unit_test_path="./build/debug/test/rccl-UnitTests"
fi
if ($run_tests) && [[ -f $unit_test_path ]]; then
if [[ "$build_tests" == false ]]; then
if [[ "${run_tests}" == true ]] && [[ -f "${unit_test_path}" ]]; then
if [[ "${build_tests}" == false ]]; then
clean_build=false
fi
fi
@@ -153,7 +166,7 @@ fi
# prep
# #################################################
# ensure a clean build environment
if ($clean_build); then
if [[ "${clean_build}" == true ]]; then
if [[ "${build_release}" == true ]]; then
rm -rf build/release
else
@@ -164,7 +177,8 @@ fi
# Create and go to the build directory.
mkdir -p build; cd build
if ($build_release); then
# Create and go to build type directory
if [[ "${build_release}" == true ]]; then
mkdir -p release; cd release
else
mkdir -p debug; cd debug
@@ -190,17 +204,15 @@ fi
# Backward compatibility wrappers
if [[ "${build_freorg_bkwdcomp}" == true ]]; then
cmake_common_options="${cmake_common_options} -DBUILD_FILE_REORG_BACKWARD_COMPATIBILITY=ON"
else
cmake_common_options="${cmake_common_options} -DBUILD_FILE_REORG_BACKWARD_COMPATIBILITY=OFF"
fi
# Build local GPU arch only
if [[ "$build_local_gpu_only" == true ]]; then
if [[ "${build_local_gpu_only}" == true ]]; then
cmake_common_options="${cmake_common_options} -DBUILD_LOCAL_GPU_TARGET_ONLY=ON"
fi
# Build for specified GPU target(s) only
if [[ ! -z "$build_amdgpu_targets" ]]; then
if [[ ! -z "${build_amdgpu_targets}" ]]; then
cmake_common_options="${cmake_common_options} -DAMDGPU_TARGETS=${build_amdgpu_targets}"
fi
@@ -214,29 +226,29 @@ if [[ "${collective_trace}" == false ]]; then
cmake_common_options="${cmake_common_options} -DCOLLTRACE=OFF"
fi
# Disable msccl kernel
if [[ "${msccl_kernel_enabled}" == false ]]; then
cmake_common_options="${cmake_common_options} -DENABLE_MSCCL_KERNEL=OFF"
fi
# Install dependencies
if ($install_dependencies); then
if [[ "${install_dependencies}" == true ]]; then
cmake_common_options="${cmake_common_options} -DINSTALL_DEPENDENCIES=ON"
fi
# Install RCCL library
if [[ "${install_library}" == true ]]; then
cmake_common_options="${cmake_common_options} -DCMAKE_INSTALL_PREFIX=${install_prefix}"
fi
# Enable ROCTX
if [[ "${roctx_enabled}" == true ]]; then
cmake_common_options="${cmake_common_options} -DROCTX=ON"
fi
cmake_executable=cmake
case "${OS_ID}" in
centos|rhel)
cmake_executable=cmake3
;;
esac
# Enable NPKit
npkit_options=""
if ($npkit_enabled); then
if [[ "${npkit_enabled}" == true ]]; then
npkit_options="-DENABLE_NPKIT \
-DENABLE_NPKIT_EVENT_TIME_SYNC_GPU \
-DENABLE_NPKIT_EVENT_TIME_SYNC_CPU \
@@ -347,59 +359,80 @@ fi
check_exit_code "$?"
if ($time_trace); then
# Enable ninja build for time tracing
if [[ "${time_trace}" == true ]]; then
if ! hash ninja &>/dev/null ; then
echo "ninja could not be found"
echo "Use \"${time_trace_ninja_msg}\" to install ninja"
exit 1
fi
build_system="ninja"
enable_ninja="-GNinja"
else
build_system="make"
fi
if ($build_tests) || (($run_tests) && [[ ! -f ./test/rccl-UnitTests ]]); then
CXX=$ROCM_BIN_PATH/hipcc $cmake_executable $cmake_common_options -DBUILD_TESTS=ON -DNPKIT_FLAGS="${npkit_options}" -DCMAKE_INSTALL_PREFIX=$ROCM_PATH -DROCM_PATH=$ROCM_PATH -DONLY_FUNCS="$ONLY_FUNCS" $enable_ninja ../../.
# Add common CMake options.
# ONLY_FUNCS is intentionally NOT appended to this string: ${cmake_common_options}
# is expanded unquoted below, so a value containing whitespace would be
# word-split into several stray CMake arguments. It is passed as its own quoted
# argument on the CMake command line instead.
cmake_common_options="${cmake_common_options} -DROCM_PATH=${ROCM_PATH} ${enable_ninja}"

# Build RCCL-UnitTests, if enabled: either requested explicitly, or tests were
# asked to run but no executable test binary exists in this build directory yet.
if [[ "${build_tests}" == true ]] || ([[ "${run_tests}" == true ]] && [[ ! -x ./test/rccl-UnitTests ]]); then
    cmake_common_options="${cmake_common_options} -DBUILD_TESTS=ON"
fi

# Initiate RCCL CMake
# ${cmake_common_options} is deliberately left unquoted so that it splits into
# one word per option. ONLY_FUNCS and NPKIT_FLAGS are passed separately and
# quoted so that each reaches CMakeLists.txt as a single argument
# (${npkit_options} must arrive "as-is", i.e. with its `-D` prefixes).
${cmake_executable} ${cmake_common_options} -DONLY_FUNCS="${ONLY_FUNCS}" -DNPKIT_FLAGS="${npkit_options}" ../../.
check_exit_code "$?"
# Enable verbose output from the build tool.
# `VERBOSE=1` is a Makefile variable; ninja would interpret it as a target name
# and fail, so ninja gets its own `-v` flag (print full command lines) instead.
if [[ "${build_verbose}" == true ]]; then
    if [[ "${build_system}" == "ninja" ]]; then
        build_system="${build_system} -v"
    else
        build_system="${build_system} VERBOSE=1"
    fi
fi
# Initiate RCCL build (and install)
if [[ "${install_library}" == true ]]; then
${build_system} -j ${num_parallel_jobs} install
else
CXX=$ROCM_BIN_PATH/hipcc $cmake_executable $cmake_common_options -DBUILD_TESTS=OFF -DNPKIT_FLAGS="${npkit_options}" -DCMAKE_INSTALL_PREFIX=$ROCM_PATH -DROCM_PATH=$ROCM_PATH -DONLY_FUNCS="$ONLY_FUNCS" $enable_ninja ../../.
${build_system} -j ${num_parallel_jobs}
fi
check_exit_code "$?"
if ($install_library); then
VERBOSE=${build_verbose} $build_system -j $num_parallel_jobs install
else
VERBOSE=${build_verbose} $build_system -j $num_parallel_jobs
fi
check_exit_code "$?"
if ($build_package); then
# Initiate package build with `make package`, if enabled
if [[ "${build_package}" == true ]]; then
make package
check_exit_code "$?"
fi
# Optionally, run tests if they're enabled.
if ($run_tests); then
if (test -f "./test/rccl-UnitTests"); then
if ($run_tests_all); then
# Optionally, run RCCL-UnitTests, if they're enabled.
if [[ "${run_tests}" == true ]]; then
if [[ -x "./test/rccl-UnitTests" ]]; then
if [[ "${run_tests_all}" == true ]]; then
./test/rccl-UnitTests
else
./test/rccl-UnitTests --gtest_filter="AllReduce.*"
fi
else
echo "rccl unit tests have not been built yet; please re-run script with -t to build rccl unit tests."
echo "RCCL-UnitTests have not been built yet; Please re-run script with \"-t\" to build RCCL-UnitTests."
exit 1
fi
fi
if ($time_trace); then
search_dir="../../"
time_trace_dir=$(find "$search_dir" -type d -name "time-trace" -print -quit)
# Generate time trace for RCCL build using tools/time-trace
if [[ "${time_trace}" == true ]]; then
search_dir="../../tools"
time_trace_dir=$(find "${search_dir}" -type d -name "time-trace" -print -quit)
if [ "$time_trace_dir" ]; then
time_trace_script="$time_trace_dir/rccl-TimeTrace.sh"
if [ -x "$time_trace_script" ]; then
if [[ -n "${time_trace_dir}" ]]; then
time_trace_script="${time_trace_dir}/rccl-TimeTrace.sh"
if [[ -x "${time_trace_script}" ]]; then
echo "Generating RCCL-compile-timeline.html..."
(cd "$time_trace_dir" && ./rccl-TimeTrace.sh)
(cd "${time_trace_dir}" && ./rccl-TimeTrace.sh)
else
echo "Error: Unable to execute $time_trace_script. Make sure the file has the correct permissions."
echo "Error: Unable to execute ${time_trace_script}. Make sure the file has the correct permissions."
fi
else
echo "Error: time-trace folder not found in $search_dir."
echo "Error: time-trace folder not found in ${search_dir}."
fi
fi
+19
مشاهده پرونده
@@ -0,0 +1,19 @@
# Toolchain file for building with the ROCm compilers on Linux.
#
# Optional inputs:
#   ENV{ROCM_PATH}  ROCm installation root; wins when set to a non-empty value.
#   ROCM_PATH       CMake cache variable (e.g. -DROCM_PATH=...), consulted when the
#                   environment variable is absent or empty. Defaults to /opt/rocm.
#   ENV{CXX}        C++ compiler override; default is <rocm root>/bin/hipcc.
#   ENV{CC}         C compiler override;   default is <rocm root>/bin/hipcc.
#
# The environment is probed with string comparisons instead of
# `if(DEFINED ENV{...})`: that form is only supported from CMake 3.14, and the
# comparison additionally treats a set-but-empty variable as "not provided".
# All cache entries are set without FORCE, so values already supplied by the
# user on the command line are left untouched.

# Locate the ROCm bin directory.
if (NOT "$ENV{ROCM_PATH}" STREQUAL "")
    set(rocm_bin "$ENV{ROCM_PATH}/bin")
else()
    set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to the ROCm installation.")
    # Derive the bin directory from ROCM_PATH rather than hard-coding /opt/rocm,
    # so that a user-supplied -DROCM_PATH=<dir> is honoured.
    set(rocm_bin "${ROCM_PATH}/bin")
endif()

# C++ compiler: environment override, else hipcc from the ROCm bin directory.
if ("$ENV{CXX}" STREQUAL "")
    set(CMAKE_CXX_COMPILER "${rocm_bin}/hipcc" CACHE FILEPATH "Path to the C++ compiler")
else()
    set(CMAKE_CXX_COMPILER "$ENV{CXX}" CACHE FILEPATH "Path to the C++ compiler")
endif()

# C compiler: environment override, else hipcc from the ROCm bin directory.
if ("$ENV{CC}" STREQUAL "")
    set(CMAKE_C_COMPILER "${rocm_bin}/hipcc" CACHE FILEPATH "Path to the C compiler")
else()
    set(CMAKE_C_COMPILER "$ENV{CC}" CACHE FILEPATH "Path to the C compiler")
endif()