Add optional bf16 software-triggered pipelining for reduceCopyPacks (#1758)

- Introduced double-buffering to reduce copy overhead and overlap BF16 arithmetic with data prefetching.
- Aimed to improve performance of reduction-based collectives by up to 10%.
- Implemented based on recommendations from Guennadi Riguer (AMD).
- Added --force-reduce-pipeline option to install.sh to activate this optimization for BF16 reductions.
- Feature is disabled by default to prevent regressions with large messages until auto-tuning logic is upstreamed.
---------

Co-authored-by: Jeffrey Novotny <jnovotny@amd.com>
Co-authored-by: Pedram Alizadeh <pmohamma@amd.com>
Šī revīzija ir iekļauta:
Mustafa Abduljabbar
2025-07-25 10:57:05 -04:00
revīziju iesūtīja GitHub
vecāks 1c3d1b3842
revīzija 0ce20e7e07
4 mainīti faili ar 238 papildinājumiem un 1 dzēšanām
+9 -1
Parādīt failu
@@ -35,6 +35,7 @@ roctx_enabled=true
run_tests=false
run_tests_all=false
time_trace=false
force_reduce_pipeline=false
# #################################################
# helper functions
@@ -71,6 +72,7 @@ function display_help()
echo " -t|--tests_build Build rccl unit tests, but do not run"
echo " --time-trace Plot the build time of RCCL (requires \`ninja-build\` package installed on the system)"
echo " --verbose Show compile commands"
echo " --force-reduce-pipeline Force reduce_copy sw pipeline to be used for every reduce-based collectives and datatypes"
}
# #################################################
@@ -80,7 +82,7 @@ function display_help()
# check if we have a modern version of getopt that can handle whitespace and long parameters
getopt -T
if [[ "$?" -eq 4 ]]; then
GETOPT_PARSE=$(getopt --name "${0}" --options cdfhij:lprt --longoptions address-sanitizer,dependencies,debug,enable-code-coverage,enable_backtrace,disable-colltrace,disable-msccl-kernel,disable-mscclpp,fast,help,install,jobs:,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,log-trace,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,verbose -- "$@")
GETOPT_PARSE=$(getopt --name "${0}" --options cdfhij:lprt --longoptions address-sanitizer,dependencies,debug,enable-code-coverage,enable_backtrace,disable-colltrace,disable-msccl-kernel,disable-mscclpp,fast,help,install,jobs:,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,log-trace,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,force-reduce-pipeline,verbose -- "$@")
else
echo "Need a new version of getopt"
exit 1
@@ -123,6 +125,7 @@ while true; do
-t | --tests_build) build_tests=true; shift ;;
--time-trace) time_trace=true; shift ;;
--verbose) build_verbose=true; shift ;;
--force-reduce-pipeline) force_reduce_pipeline=true; shift ;;
--) shift ; break ;;
*) echo "Unexpected command line parameter received; aborting";
exit 1
@@ -277,6 +280,11 @@ if [[ "${openmp_test_enabled}" == true ]]; then
cmake_common_options="${cmake_common_options} -DOPENMP_TESTS_ENABLED=ON"
fi
# Opt-in: when the --force-reduce-pipeline flag was given, pass
# FORCE_REDUCE_PIPELINING=ON through to CMake. Only the exact value "true"
# enables it; any other value leaves cmake_common_options untouched.
case "${force_reduce_pipeline}" in
  true)
    cmake_common_options="${cmake_common_options} -DFORCE_REDUCE_PIPELINING=ON"
    ;;
esac
# Enable NPKit
if [[ "${npkit_enabled}" == true ]]; then
cmake_common_options="${cmake_common_options} -DENABLE_NPKIT=ON"