Add optional bf16 software-triggered pipelining for reduceCopyPacks (#1758)
- Introduced double-buffering to reduce copy overhead and overlap BF16 arithmetic with data prefetching.
- Aimed to improve performance of reduction-based collectives by up to 10%.
- Implemented based on recommendations from Guennadi Riguer (AMD).
- Added the --force-reduce-pipeline option to install.sh to activate this optimization for BF16 reductions.
- The feature is disabled by default to prevent regressions with large messages until auto-tuning logic is upstreamed.

---------

Co-authored-by: Jeffrey Novotny <jnovotny@amd.com>
Co-authored-by: Pedram Alizadeh <pmohamma@amd.com>
Šī revīzija ir iekļauta:
revīziju iesūtīja
GitHub
vecāks
1c3d1b3842
revīzija
0ce20e7e07
+9
-1
@@ -35,6 +35,7 @@ roctx_enabled=true
|
||||
run_tests=false
|
||||
run_tests_all=false
|
||||
time_trace=false
|
||||
force_reduce_pipeline=false
|
||||
|
||||
# #################################################
|
||||
# helper functions
|
||||
@@ -71,6 +72,7 @@ function display_help()
|
||||
echo " -t|--tests_build Build rccl unit tests, but do not run"
|
||||
echo " --time-trace Plot the build time of RCCL (requires \`ninja-build\` package installed on the system)"
|
||||
echo " --verbose Show compile commands"
|
||||
echo " --force-reduce-pipeline Force reduce_copy sw pipeline to be used for every reduce-based collectives and datatypes"
|
||||
}
|
||||
|
||||
# #################################################
|
||||
@@ -80,7 +82,7 @@ function display_help()
|
||||
# check if we have a modern version of getopt that can handle whitespace and long parameters
|
||||
getopt -T
|
||||
if [[ "$?" -eq 4 ]]; then
|
||||
GETOPT_PARSE=$(getopt --name "${0}" --options cdfhij:lprt --longoptions address-sanitizer,dependencies,debug,enable-code-coverage,enable_backtrace,disable-colltrace,disable-msccl-kernel,disable-mscclpp,fast,help,install,jobs:,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,log-trace,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,verbose -- "$@")
|
||||
GETOPT_PARSE=$(getopt --name "${0}" --options cdfhij:lprt --longoptions address-sanitizer,dependencies,debug,enable-code-coverage,enable_backtrace,disable-colltrace,disable-msccl-kernel,disable-mscclpp,fast,help,install,jobs:,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,log-trace,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,force-reduce-pipeline,verbose -- "$@")
|
||||
else
|
||||
echo "Need a new version of getopt"
|
||||
exit 1
|
||||
@@ -123,6 +125,7 @@ while true; do
|
||||
-t | --tests_build) build_tests=true; shift ;;
|
||||
--time-trace) time_trace=true; shift ;;
|
||||
--verbose) build_verbose=true; shift ;;
|
||||
--force-reduce-pipeline) force_reduce_pipeline=true; shift ;;
|
||||
--) shift ; break ;;
|
||||
*) echo "Unexpected command line parameter received; aborting";
|
||||
exit 1
|
||||
@@ -277,6 +280,11 @@ if [[ "${openmp_test_enabled}" == true ]]; then
|
||||
cmake_common_options="${cmake_common_options} -DOPENMP_TESTS_ENABLED=ON"
|
||||
fi
|
||||
|
||||
# Force Reduce pipeline
|
||||
if [[ "${force_reduce_pipeline}" == true ]]; then
|
||||
cmake_common_options="${cmake_common_options} -DFORCE_REDUCE_PIPELINING=ON"
|
||||
fi
|
||||
|
||||
# Enable NPKit
|
||||
if [[ "${npkit_enabled}" == true ]]; then
|
||||
cmake_common_options="${cmake_common_options} -DENABLE_NPKIT=ON"
|
||||
|
||||
Atsaukties uz šo jaunā problēmā
Block a user