diff --git a/CMakeLists.txt b/CMakeLists.txt index c8281f23ed..635ae3b3c4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,6 +35,7 @@ option(ENABLE_NPKIT "Enable NPKit" option(ENABLE_IFC "Enable indirect function call" OFF) option(GENERATE_SYM_KERNELS "Generate symmetric memory kernels" OFF) option(INSTALL_DEPENDENCIES "Force install dependencies" OFF) +option(REPORT_KERNEL_RESOURCE_USE "Append -Rpass-analysis=kernel-resource-usage to linker flags" OFF) option(ROCTX "Enable ROCTX" ON) option(PROFILE "Enable profiling" OFF) option(TIMETRACE "Enable time-trace during compilation" OFF) @@ -1186,6 +1187,10 @@ if (HAVE_KERNARG_PRELOAD) target_compile_options(rccl PRIVATE -mllvm --amdgpu-kernarg-preload-count=16) endif() +if (REPORT_KERNEL_RESOURCE_USE) + target_link_options(rccl PRIVATE -Rpass-analysis=kernel-resource-usage) +endif() + if (DUMP_ASM) # Save temporary files from kernel compilation message(STATUS "Disassembling librccl.so to asm") # Maintain symbols but without changing code. Keep additional data in dwarf section of binary. diff --git a/install.sh b/install.sh index 6fa49d6d51..25ab0eb1ad 100755 --- a/install.sh +++ b/install.sh @@ -32,6 +32,7 @@ enable_mscclpp_clip=false num_parallel_jobs=$(nproc) npkit_enabled=false openmp_test_enabled=false +kernel_resource_use=false roctx_enabled=true run_tests=false run_tests_all=false @@ -61,6 +62,7 @@ function display_help() echo " -h|--help Prints this help message" echo " -i|--install Install RCCL library (see --prefix argument below)" echo " -j|--jobs Specify how many parallel compilation jobs to run ($num_parallel_jobs by default)" + echo " --kernel-resource-use Dump GPU kernel resource usage (e.g., VGPRs, scratch, spill) at link stage" echo " -l|--local_gpu_only Only compile for local GPU architecture" echo " --amdgpu_targets Only compile for specified GPU architecture(s). 
For multiple targets, separate by ';' (builds for all supported GPU architectures by default)" echo " --no_clean Don't delete files if they already exist" @@ -86,7 +88,7 @@ function display_help() # check if we have a modern version of getopt that can handle whitespace and long parameters getopt -T if [[ "$?" -eq 4 ]]; then - GETOPT_PARSE=$(getopt --name "${0}" --options cdfhij:lprt --longoptions address-sanitizer,dependencies,debug,dump-asm,enable-code-coverage,enable_backtrace,disable-colltrace,disable-msccl-kernel,enable-mscclpp,fast,help,install,jobs:,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,log-trace,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,force-reduce-pipeline,generate-sym-kernels,verbose -- "$@") + GETOPT_PARSE=$(getopt --name "${0}" --options cdfhij:lprt --longoptions address-sanitizer,dependencies,debug,dump-asm,enable-code-coverage,enable_backtrace,disable-colltrace,disable-msccl-kernel,enable-mscclpp,fast,help,install,jobs:,kernel-resource-use,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,log-trace,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,force-reduce-pipeline,generate-sym-kernels,verbose -- "$@") else echo "Need a new version of getopt" exit 1 @@ -116,6 +118,7 @@ while true; do -h | --help) display_help; exit 0 ;; -i | --install) install_library=true; shift ;; -j | --jobs) num_parallel_jobs=${2}; shift 2 ;; + --kernel-resource-use) kernel_resource_use=true; shift ;; -l | --local_gpu_only) build_local_gpu_only=true; shift ;; --amdgpu_targets) build_amdgpu_targets=${2}; shift 2 ;; --no_clean) clean_build=false; shift ;; @@ -271,6 +274,10 @@ if [[ "${install_library}" == true ]]; then cmake_common_options="${cmake_common_options} -DCMAKE_INSTALL_PREFIX=${install_prefix}" fi +if [[ "${kernel_resource_use}" == true ]]; then + 
cmake_common_options="${cmake_common_options} -DREPORT_KERNEL_RESOURCE_USE=ON" +fi + # Enable trace debug level if [[ "${log_trace}" == true ]]; then cmake_common_options="${cmake_common_options} -DTRACE=ON"