Limiting # parallel jobs in install script to 16 by default, and new -j/--jobs flag (#785)
[ROCm/rccl commit: bb55848450]
Cette révision appartient à :
@@ -17,7 +17,7 @@ def runCI =
|
||||
def prj = new rocProject('rccl', 'Extended')
|
||||
|
||||
prj.timeout.test = 600
|
||||
prj.paths.build_command = './install.sh -t --npkit-enable --limit-nprocs'
|
||||
prj.paths.build_command = './install.sh -t --npkit-enable'
|
||||
|
||||
// Define test architectures, optional rocm version argument is available
|
||||
def nodes = new dockerNodes(nodeDetails, jobName, prj)
|
||||
|
||||
@@ -18,7 +18,7 @@ def runCI =
|
||||
def prj = new rocProject('rccl', 'PreCheckin')
|
||||
|
||||
prj.timeout.test = 300
|
||||
prj.paths.build_command = './install.sh -t --fast --limit-nprocs'
|
||||
prj.paths.build_command = './install.sh -t --fast'
|
||||
|
||||
// Define test architectures, optional rocm version argument is available
|
||||
def nodes = new dockerNodes(nodeDetails, jobName, prj)
|
||||
|
||||
@@ -12,7 +12,7 @@ def runCI =
|
||||
def prj = new rocProject('rccl', 'Static Library PreCheckin')
|
||||
|
||||
prj.timeout.test = 1440
|
||||
prj.paths.build_command = './install.sh -t --static --limit-nprocs'
|
||||
prj.paths.build_command = './install.sh -t --static'
|
||||
|
||||
def nodes = new dockerNodes(nodeDetails, jobName, prj)
|
||||
|
||||
|
||||
@@ -20,16 +20,33 @@ For ROCm installation instructions, see https://github.com/RadeonOpenCompute/ROC
|
||||
|
||||
The root of this repository has a helper script, 'install.sh', that builds and installs RCCL on Ubuntu with a single command. It takes only a few options and hard-codes configuration that could otherwise be specified by invoking cmake directly, but it's a great way to get started quickly and can serve as an example of how to build and install.
|
||||
|
||||
* `./install.sh` -- builds library including rccl unit tests
|
||||
* `./install.sh -i` -- builds and installs the library to /opt/rocm/rccl; installation path can be changed with --prefix argument (see below.)
|
||||
* `./install.sh -d` -- installs all necessary dependencies for RCCL. Should be re-invoked if the build folder is removed.
|
||||
* `./install.sh -h` -- shows help
|
||||
* `./install.sh -t` -- builds library including rccl unit tests
|
||||
* `./install.sh -r` -- runs rccl unit tests (must be already built)
|
||||
* `./install.sh -p` -- builds RCCL package
|
||||
* `./install.sh --static` -- builds RCCL as a static library (default: shared)
|
||||
* `./install.sh --prefix` -- specify custom path to install RCCL to (default:/opt/rocm)
|
||||
* `./install.sh --npkit-enable` -- enable compilation of npkit profiler framework with all options
|
||||
```shell
|
||||
./install.sh --help
|
||||
|
||||
Options:
|
||||
--address-sanitizer Build with address sanitizer enabled
|
||||
--build_allreduce_only Build only AllReduce + sum + float kernel
|
||||
-d|--dependencies Install RCCL dependencies
|
||||
--debug Build debug library
|
||||
--disable_backtrace Build without custom backtrace support
|
||||
--disable-colltrace Build without collective trace
|
||||
-f|--fast Quick-build RCCL (local gpu arch only, no backtrace, and no collective trace support)
|
||||
-h|--help Prints this help message
|
||||
-i|--install Install RCCL library (see --prefix argument below)
|
||||
-j|--jobs Specify how many parallel compilation jobs to run (16 by default)
|
||||
-l|--local_gpu_only Only compile for local GPU architecture
|
||||
--no_clean Don't delete files if they already exist
|
||||
--npkit-enable Compile with npkit enabled
|
||||
-p|--package_build Build RCCL package
|
||||
--prefix Specify custom directory to install RCCL to (default: /opt/rocm)
|
||||
--rm-legacy-include-dir Remove the legacy include dir packaging that was added for file/folder reorg backward compatibility
|
||||
--run_tests_all Run all rccl unit tests (must be built already)
|
||||
-r|--run_tests_quick Run small subset of rccl unit tests (must be built already)
|
||||
--static Build RCCL as a static library instead of shared library
|
||||
-t|--tests_build Build rccl unit tests, but do not run
|
||||
--time-trace Plot the build time of RCCL
|
||||
--verbose Show compile commands
|
||||
```
|
||||
|
||||
## Manual build
|
||||
|
||||
@@ -41,7 +58,7 @@ $ cd rccl
|
||||
$ mkdir build
|
||||
$ cd build
|
||||
$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/opt/rocm/ ..
|
||||
$ make -j
|
||||
$ make -j 16 # Or some other suitable number of parallel jobs
|
||||
```
|
||||
You may substitute an installation path of your own choosing by passing CMAKE_INSTALL_PREFIX. For example:
|
||||
```shell
|
||||
|
||||
@@ -1,24 +1,51 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
# #################################################
|
||||
# global variables
|
||||
# #################################################
|
||||
ROCM_PATH=${ROCM_PATH:="/opt/rocm"}
|
||||
|
||||
# Default values
|
||||
build_address_sanitizer=false
|
||||
build_allreduce_only=false
|
||||
build_bfd=true
|
||||
build_freorg_bkwdcomp=true
|
||||
build_local_gpu_only=false
|
||||
build_package=false
|
||||
build_release=true
|
||||
build_static=false
|
||||
build_tests=false
|
||||
build_verbose=0
|
||||
clean_build=true
|
||||
collective_trace=true
|
||||
enable_ninja=""
|
||||
install_dependencies=false
|
||||
install_library=false
|
||||
num_parallel_jobs=16
|
||||
npkit_enabled=false
|
||||
run_tests=false
|
||||
run_tests_all=false
|
||||
time_trace=false
|
||||
|
||||
# #################################################
|
||||
# helper functions
|
||||
# #################################################
|
||||
function display_help()
|
||||
{
|
||||
echo "RCCL build & installation helper script"
|
||||
echo "./install [-h|--help] "
|
||||
echo " Options:"
|
||||
echo " --address-sanitizer Build with address sanitizer enabled"
|
||||
echo " --build_allreduce_only Build only AllReduce + sum + float kernel"
|
||||
echo " -d|--dependencies Install RCCL depdencencies"
|
||||
echo " --debug Build debug library"
|
||||
echo " --disable_backtrace Build without custom backtrace support"
|
||||
echo " --disable-colltrace Build without collective trace"
|
||||
echo " --fast Quick-build RCCL (local gpu arch only, no backtrace, and collective trace support)"
|
||||
echo " -f|--fast Quick-build RCCL (local gpu arch only, no backtrace, and collective trace support)"
|
||||
echo " -h|--help Prints this help message"
|
||||
echo " -i|--install Install RCCL library (see --prefix argument below)"
|
||||
echo " -l|--limit-nprocs Limit the number of procs to 16 while building"
|
||||
echo " --local_gpu_only Only compile for local GPU architecture"
|
||||
echo " -j|--jobs Specify how many parallel compilation jobs to run ($num_parallel_jobs by default)"
|
||||
echo " -l|--local_gpu_only Only compile for local GPU architecture"
|
||||
echo " --no_clean Don't delete files if they already exist"
|
||||
echo " --npkit-enable Compile with npkit enabled"
|
||||
echo " -p|--package_build Build RCCL package"
|
||||
@@ -32,32 +59,6 @@ function display_help()
|
||||
echo " --verbose Show compile commands"
|
||||
}
|
||||
|
||||
# #################################################
|
||||
# global variables
|
||||
# #################################################
|
||||
ROCM_PATH=${ROCM_PATH:="/opt/rocm"}
|
||||
|
||||
build_address_sanitizer=false
|
||||
build_allreduce_only=false
|
||||
collective_trace=true
|
||||
install_dependencies=false
|
||||
build_release=true
|
||||
build_bfd=true
|
||||
install_library=false
|
||||
build_local_gpu_only=false
|
||||
clean_build=true
|
||||
npkit_enabled=false
|
||||
build_package=false
|
||||
build_freorg_bkwdcomp=true
|
||||
run_tests=false
|
||||
run_tests_all=false
|
||||
build_static=false
|
||||
build_tests=false
|
||||
build_verbose=0
|
||||
time_trace=false
|
||||
enable_all_jobs=true
|
||||
enable_ninja=""
|
||||
|
||||
# #################################################
|
||||
# Parameter parsing
|
||||
# #################################################
|
||||
@@ -65,7 +66,7 @@ enable_ninja=""
|
||||
# check if we have a modern version of getopt that can handle whitespace and long parameters
|
||||
getopt -T
|
||||
if [[ $? -eq 4 ]]; then
|
||||
GETOPT_PARSE=$(getopt --name "${0}" --longoptions address-sanitizer,build_allreduce_only,dependencies,debug,disable_backtrace,disable-colltrace,fast,help,install,limit-nprocs,local_gpu_only,no_clean,npkit-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,tests_build,time-trace,verbose --options hidptrs -- "$@")
|
||||
GETOPT_PARSE=$(getopt --name "${0}" --options dfhij:lprt --longoptions address-sanitizer,build_allreduce_only,dependencies,debug,disable_backtrace,disable-colltrace,fast,help,install,jobs:,local_gpu_only,no_clean,npkit-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,tests_build,time-trace,verbose -- "$@")
|
||||
else
|
||||
echo "Need a new version of getopt"
|
||||
exit 1
|
||||
@@ -86,15 +87,15 @@ while true; do
|
||||
--debug) build_release=false; shift ;;
|
||||
--disable_backtrace) build_bfd=false; shift ;;
|
||||
--disable-colltrace) collective_trace=false; shift ;;
|
||||
--fast) build_bfd=false; build_local_gpu_only=true; collective_trace=false; shift ;;
|
||||
-f | --fast) build_bfd=false; build_local_gpu_only=true; collective_trace=false; shift ;;
|
||||
-h | --help) display_help; exit 0 ;;
|
||||
-i | --install) install_library=true; shift ;;
|
||||
-l | --limit-nprocs) enable_all_jobs=false; shift ;;
|
||||
--local_gpu_only) build_local_gpu_only=true; shift ;;
|
||||
-j | --jobs) num_parallel_jobs=${2}; shift 2 ;;
|
||||
-l | --local_gpu_only) build_local_gpu_only=true; shift ;;
|
||||
--no_clean) clean_build=false; shift ;;
|
||||
--npkit-enable) npkit_enabled=true; shift ;;
|
||||
-p | --package_build) build_package=true; shift ;;
|
||||
--prefix) install_prefix=${2} shift 2 ;;
|
||||
--prefix) install_prefix=${2}; shift 2 ;;
|
||||
--rm-legacy-include-dir) build_freorg_bkwdcomp=false; shift ;;
|
||||
-r | --run_tests_quick) run_tests=true; shift ;;
|
||||
--run_tests_all) run_tests=true; run_tests_all=true; shift ;;
|
||||
@@ -305,12 +306,6 @@ fi
|
||||
|
||||
check_exit_code "$?"
|
||||
|
||||
if ($enable_all_jobs); then
|
||||
job_number=$(nproc)
|
||||
else
|
||||
job_number=16
|
||||
fi
|
||||
|
||||
if ($time_trace); then
|
||||
build_system="ninja"
|
||||
enable_ninja="-GNinja"
|
||||
@@ -326,9 +321,9 @@ fi
|
||||
check_exit_code "$?"
|
||||
|
||||
if ($install_library); then
|
||||
VERBOSE=${build_verbose} $build_system -j $job_number install
|
||||
VERBOSE=${build_verbose} $build_system -j $num_parallel_jobs install
|
||||
else
|
||||
VERBOSE=${build_verbose} $build_system -j $job_number
|
||||
VERBOSE=${build_verbose} $build_system -j $num_parallel_jobs
|
||||
fi
|
||||
check_exit_code "$?"
|
||||
|
||||
@@ -366,4 +361,4 @@ if ($time_trace); then
|
||||
else
|
||||
echo "Error: time-trace folder not found in $search_dir."
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
Référencer dans un nouveau ticket
Bloquer un utilisateur