From bb558484506e0ec1098f8704698ea3c2cf36e43c Mon Sep 17 00:00:00 2001 From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com> Date: Thu, 22 Jun 2023 14:30:44 -0600 Subject: [PATCH] Limiting # parallel jobs in install script to 16 by default, and new -j/--jobs flag (#785) --- .jenkins/extended.groovy | 2 +- .jenkins/precheckin.groovy | 2 +- .jenkins/staticlibrary.groovy | 2 +- README.md | 39 +++++++++++----- install.sh | 83 ++++++++++++++++------------------- 5 files changed, 70 insertions(+), 58 deletions(-) diff --git a/.jenkins/extended.groovy b/.jenkins/extended.groovy index eb2c857aa8..f2cb5d0846 100644 --- a/.jenkins/extended.groovy +++ b/.jenkins/extended.groovy @@ -17,7 +17,7 @@ def runCI = def prj = new rocProject('rccl', 'Extended') prj.timeout.test = 600 - prj.paths.build_command = './install.sh -t --npkit-enable --limit-nprocs' + prj.paths.build_command = './install.sh -t --npkit-enable' // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) diff --git a/.jenkins/precheckin.groovy b/.jenkins/precheckin.groovy index 4791b1979f..7c8eef8690 100644 --- a/.jenkins/precheckin.groovy +++ b/.jenkins/precheckin.groovy @@ -18,7 +18,7 @@ def runCI = def prj = new rocProject('rccl', 'PreCheckin') prj.timeout.test = 300 - prj.paths.build_command = './install.sh -t --fast --limit-nprocs' + prj.paths.build_command = './install.sh -t --fast' // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) diff --git a/.jenkins/staticlibrary.groovy b/.jenkins/staticlibrary.groovy index 8be7f8480f..e75ff7ec97 100644 --- a/.jenkins/staticlibrary.groovy +++ b/.jenkins/staticlibrary.groovy @@ -12,7 +12,7 @@ def runCI = def prj = new rocProject('rccl', 'Static Library PreCheckin') prj.timeout.test = 1440 - prj.paths.build_command = './install.sh -t --static --limit-nprocs' + prj.paths.build_command = './install.sh -t 
--static' def nodes = new dockerNodes(nodeDetails, jobName, prj) diff --git a/README.md b/README.md index ce9dc6d4bc..c6bdea642b 100644 --- a/README.md +++ b/README.md @@ -20,16 +20,33 @@ For ROCm installation instructions, see https://github.com/RadeonOpenCompute/ROC The root of this repository has a helper script 'install.sh' to build and install RCCL on Ubuntu with a single command. It does not take a lot of options and hard-codes configuration that can be specified through invoking cmake directly, but it's a great way to get started quickly and can serve as an example of how to build/install. -* `./install.sh` -- builds library including rccl unit tests -* `./install.sh -i` -- builds and installs the library to /opt/rocm/rccl; installation path can be changed with --prefix argument (see below.) -* `./install.sh -d` -- installs all necessary dependencies for RCCL. Should be re-invoked if the build folder is removed. -* `./install.sh -h` -- shows help -* `./install.sh -t` -- builds library including rccl unit tests -* `./install.sh -r` -- runs rccl unit tests (must be already built) -* `./install.sh -p` -- builds RCCL package -* `./install.sh --static` -- builds RCCL as a static library (default: shared) -* `./install.sh --prefix` -- specify custom path to install RCCL to (default:/opt/rocm) -* `./install.sh --npkit-enable` -- enable compilation of npkit profiler framework with all options +```shell +./install.sh --help + + Options: + --address-sanitizer Build with address sanitizer enabled + --build_allreduce_only Build only AllReduce + sum + float kernel + -d|--dependencies Install RCCL dependencies + --debug Build debug library + --disable_backtrace Build without custom backtrace support + --disable-colltrace Build without collective trace + -f|--fast Quick-build RCCL (local gpu arch only, no backtrace, and no collective trace support) + -h|--help Prints this help message + -i|--install Install RCCL library (see --prefix argument below) + -j|--jobs Specify how 
many parallel compilation jobs to run (16 by default) + -l|--local_gpu_only Only compile for local GPU architecture + --no_clean Don't delete files if they already exist + --npkit-enable Compile with npkit enabled + -p|--package_build Build RCCL package + --prefix Specify custom directory to install RCCL to (default: /opt/rocm) + --rm-legacy-include-dir Remove legacy include dir Packaging added for file/folder reorg backward compatibility + --run_tests_all Run all rccl unit tests (must be built already) + -r|--run_tests_quick Run small subset of rccl unit tests (must be built already) + --static Build RCCL as a static library instead of shared library + -t|--tests_build Build rccl unit tests, but do not run + --time-trace Plot the build time of RCCL + --verbose Show compile commands +``` ## Manual build @@ -41,7 +58,7 @@ $ cd rccl $ mkdir build $ cd build $ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/opt/rocm/ .. -$ make -j +$ make -j 16 # Or some other suitable number of parallel jobs ``` You may substitute an installation path of your own choosing by passing CMAKE_INSTALL_PREFIX. For example: ```shell diff --git a/install.sh b/install.sh index 5964aeaafc..42859bd48b 100755 --- a/install.sh +++ b/install.sh @@ -1,24 +1,51 @@ #!/bin/bash # Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. 
+# ################################################# +# global variables +# ################################################# +ROCM_PATH=${ROCM_PATH:="/opt/rocm"} + +# Default values +build_address_sanitizer=false +build_allreduce_only=false +build_bfd=true +build_freorg_bkwdcomp=true +build_local_gpu_only=false +build_package=false +build_release=true +build_static=false +build_tests=false +build_verbose=0 +clean_build=true +collective_trace=true +enable_ninja="" +install_dependencies=false +install_library=false +num_parallel_jobs=16 +npkit_enabled=false +run_tests=false +run_tests_all=false +time_trace=false + # ################################################# # helper functions # ################################################# function display_help() { echo "RCCL build & installation helper script" - echo "./install [-h|--help] " + echo " Options:" echo " --address-sanitizer Build with address sanitizer enabled" echo " --build_allreduce_only Build only AllReduce + sum + float kernel" echo " -d|--dependencies Install RCCL depdencencies" echo " --debug Build debug library" echo " --disable_backtrace Build without custom backtrace support" echo " --disable-colltrace Build without collective trace" - echo " --fast Quick-build RCCL (local gpu arch only, no backtrace, and collective trace support)" + echo " -f|--fast Quick-build RCCL (local gpu arch only, no backtrace, and collective trace support)" echo " -h|--help Prints this help message" echo " -i|--install Install RCCL library (see --prefix argument below)" - echo " -l|--limit-nprocs Limit the number of procs to 16 while building" - echo " --local_gpu_only Only compile for local GPU architecture" + echo " -j|--jobs Specify how many parallel compilation jobs to run ($num_parallel_jobs by default)" + echo " -l|--local_gpu_only Only compile for local GPU architecture" echo " --no_clean Don't delete files if they already exist" echo " --npkit-enable Compile with npkit enabled" echo " -p|--package_build Build RCCL 
package" @@ -32,32 +59,6 @@ function display_help() echo " --verbose Show compile commands" } -# ################################################# -# global variables -# ################################################# -ROCM_PATH=${ROCM_PATH:="/opt/rocm"} - -build_address_sanitizer=false -build_allreduce_only=false -collective_trace=true -install_dependencies=false -build_release=true -build_bfd=true -install_library=false -build_local_gpu_only=false -clean_build=true -npkit_enabled=false -build_package=false -build_freorg_bkwdcomp=true -run_tests=false -run_tests_all=false -build_static=false -build_tests=false -build_verbose=0 -time_trace=false -enable_all_jobs=true -enable_ninja="" - # ################################################# # Parameter parsing # ################################################# @@ -65,7 +66,7 @@ enable_ninja="" # check if we have a modern version of getopt that can handle whitespace and long parameters getopt -T if [[ $? -eq 4 ]]; then - GETOPT_PARSE=$(getopt --name "${0}" --longoptions address-sanitizer,build_allreduce_only,dependencies,debug,disable_backtrace,disable-colltrace,fast,help,install,limit-nprocs,local_gpu_only,no_clean,npkit-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,tests_build,time-trace,verbose --options hidptrs -- "$@") + GETOPT_PARSE=$(getopt --name "${0}" --options dfhij:lprt --longoptions address-sanitizer,build_allreduce_only,dependencies,debug,disable_backtrace,disable-colltrace,fast,help,install,jobs:,local_gpu_only,no_clean,npkit-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,tests_build,time-trace,verbose -- "$@") else echo "Need a new version of getopt" exit 1 @@ -86,15 +87,15 @@ while true; do --debug) build_release=false; shift ;; --disable_backtrace) build_bfd=false; shift ;; --disable-colltrace) collective_trace=false; shift ;; - --fast) build_bfd=false; build_local_gpu_only=true; collective_trace=false; shift ;; + -f | --fast) 
build_bfd=false; build_local_gpu_only=true; collective_trace=false; shift ;; -h | --help) display_help; exit 0 ;; -i | --install) install_library=true; shift ;; - -l | --limit-nprocs) enable_all_jobs=false; shift ;; - --local_gpu_only) build_local_gpu_only=true; shift ;; + -j | --jobs) num_parallel_jobs=${2}; shift 2 ;; + -l | --local_gpu_only) build_local_gpu_only=true; shift ;; --no_clean) clean_build=false; shift ;; --npkit-enable) npkit_enabled=true; shift ;; -p | --package_build) build_package=true; shift ;; - --prefix) install_prefix=${2} shift 2 ;; + --prefix) install_prefix=${2}; shift 2 ;; --rm-legacy-include-dir) build_freorg_bkwdcomp=false; shift ;; -r | --run_tests_quick) run_tests=true; shift ;; --run_tests_all) run_tests=true; run_tests_all=true; shift ;; @@ -305,12 +306,6 @@ fi check_exit_code "$?" -if ($enable_all_jobs); then - job_number=$(nproc) -else - job_number=16 -fi - if ($time_trace); then build_system="ninja" enable_ninja="-GNinja" @@ -326,9 +321,9 @@ fi check_exit_code "$?" if ($install_library); then - VERBOSE=${build_verbose} $build_system -j $job_number install + VERBOSE=${build_verbose} $build_system -j $num_parallel_jobs install else - VERBOSE=${build_verbose} $build_system -j $job_number + VERBOSE=${build_verbose} $build_system -j $num_parallel_jobs fi check_exit_code "$?" @@ -366,4 +361,4 @@ if ($time_trace); then else echo "Error: time-trace folder not found in $search_dir." fi -fi \ No newline at end of file +fi