From 546b37e35a404bedf4bb9b0603a2ba855be27ec7 Mon Sep 17 00:00:00 2001 From: Bertan Dogancay <111835151+BertanDogancay@users.noreply.github.com> Date: Mon, 15 Sep 2025 12:19:35 -0400 Subject: [PATCH] [BUILD] Stop generating sym kernels by default (#1907) * Stop generating sym kernels by default [ROCm/rccl commit: 93d86dd8e3a00bc655d189ad38af11bdc75ecba3] --- projects/rccl/CMakeLists.txt | 23 +++++++++++++---------- projects/rccl/install.sh | 12 ++++++++++-- projects/rccl/src/enqueue.cc | 7 +++++++ 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/projects/rccl/CMakeLists.txt b/projects/rccl/CMakeLists.txt index b486bf4783..a823248740 100644 --- a/projects/rccl/CMakeLists.txt +++ b/projects/rccl/CMakeLists.txt @@ -32,6 +32,7 @@ option(ENABLE_MSCCLPP_EXECUTOR "Enable MSCCL++ Executor" option(ENABLE_MSCCLPP_FORMAT_CHECKS "Enable formatting checks in MSCCL++" OFF) option(ENABLE_NPKIT "Enable NPKit" OFF) option(ENABLE_IFC "Enable indirect function call" OFF) +option(GENERATE_SYM_KERNELS "Generate symmetric memory kernels" OFF) option(INSTALL_DEPENDENCIES "Force install dependencies" OFF) option(ROCTX "Enable ROCTX" ON) option(PROFILE "Enable profiling" OFF) @@ -862,16 +863,18 @@ if (gen_py_result) message(FATAL_ERROR "${CMAKE_SOURCE_DIR}/src/device/generate.py failed") endif() -# Execute the python script to generate required symmetric memory kernels -execute_process( - COMMAND ${Python3_EXECUTABLE} ${CMAKE_SOURCE_DIR}/src/device/symmetric/generate.py ${GEN_SYM_DIR} - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} - RESULT_VARIABLE gen_sym_py_result - ERROR_VARIABLE gen_sym_py_error -) -if (gen_sym_py_result) - message(SEND_ERROR "Error: ${gen_sym_py_error}") - message(FATAL_ERROR "${CMAKE_SOURCE_DIR}/src/device/symmetric/generate.py failed") +if (GENERATE_SYM_KERNELS) + # Execute the python script to generate required symmetric memory kernels + execute_process( + COMMAND ${Python3_EXECUTABLE} ${CMAKE_SOURCE_DIR}/src/device/symmetric/generate.py ${GEN_SYM_DIR} + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + RESULT_VARIABLE gen_sym_py_result + ERROR_VARIABLE gen_sym_py_error + ) + if (gen_sym_py_result) + message(SEND_ERROR "Error: ${gen_sym_py_error}") + message(FATAL_ERROR "${CMAKE_SOURCE_DIR}/src/device/symmetric/generate.py failed") + endif() endif() # Find the generated files in the output directory diff --git a/projects/rccl/install.sh b/projects/rccl/install.sh index 73734f9c9b..3b4c55a889 100755 --- a/projects/rccl/install.sh +++ b/projects/rccl/install.sh @@ -36,6 +36,7 @@ run_tests=false run_tests_all=false time_trace=false force_reduce_pipeline=false +generate_sym_kernels=false # ################################################# # helper functions @@ -73,6 +74,7 @@ function display_help() echo " --time-trace Plot the build time of RCCL (requires \`ninja-build\` package installed on the system)" echo " --verbose Show compile commands" echo " --force-reduce-pipeline Force reduce_copy sw pipeline to be used for every reduce-based collectives and datatypes" + echo " --generate-sym-kernels Generate symmetric memory kernels" } # ################################################# @@ -82,7 +84,7 @@ function display_help() # check if we have a modern version of getopt that can handle whitespace and long parameters getopt -T if [[ "$?" -eq 4 ]]; then - GETOPT_PARSE=$(getopt --name "${0}" --options cdfhij:lprt --longoptions address-sanitizer,dependencies,debug,enable-code-coverage,enable_backtrace,disable-colltrace,disable-msccl-kernel,enable-mscclpp,fast,help,install,jobs:,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,log-trace,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,force-reduce-pipeline,verbose -- "$@") + GETOPT_PARSE=$(getopt --name "${0}" --options cdfhij:lprt --longoptions address-sanitizer,dependencies,debug,enable-code-coverage,enable_backtrace,disable-colltrace,disable-msccl-kernel,enable-mscclpp,fast,help,install,jobs:,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,log-trace,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,force-reduce-pipeline,generate-sym-kernels,verbose -- "$@") else echo "Need a new version of getopt" exit 1 @@ -125,7 +127,8 @@ while true; do -t | --tests_build) build_tests=true; shift ;; --time-trace) time_trace=true; shift ;; --verbose) build_verbose=true; shift ;; - --force-reduce-pipeline) force_reduce_pipeline=true; shift ;; + --force-reduce-pipeline) force_reduce_pipeline=true; shift ;; + --generate-sym-kernels) generate_sym_kernels=true; shift ;; --) shift ; break ;; *) echo "Unexpected command line parameter received; aborting"; exit 1 @@ -285,6 +288,11 @@ if [[ "${force_reduce_pipeline}" == true ]]; then cmake_common_options="${cmake_common_options} -DFORCE_REDUCE_PIPELINING=ON" fi +# Generate symmetric memory kernels +if [[ "${generate_sym_kernels}" == true ]]; then + cmake_common_options="${cmake_common_options} -DGENERATE_SYM_KERNELS=ON" +fi + # Enable NPKit if [[ "${npkit_enabled}" == true ]]; then cmake_common_options="${cmake_common_options} -DENABLE_NPKIT=ON" diff --git a/projects/rccl/src/enqueue.cc b/projects/rccl/src/enqueue.cc index 8a91642596..3f2088193e 100644 --- a/projects/rccl/src/enqueue.cc +++ b/projects/rccl/src/enqueue.cc @@ -96,10 +96,15 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* ma CUDACHECK(hipDeviceGetAttribute(&WarpSize, hipDeviceAttributeWarpSize, cudaDev)); int ncclMaxSharedMem = rcclShmemDynamicSize(cudaArch, WarpSize); +#ifdef GENERATE_SYM_KERNELS for (int sym=0; sym <= 1; sym++) { int kcount = sym==0 ? KernelCount : ncclSymKernelCount; for (int k=0; k < kcount; k++) { void* fn = sym==0 ? ncclKerns[k].kernelFn : ncclSymKernelList[k]; +#else + for (int k = 0; k < KernelCount; k++) { + void* fn = ncclKerns[k].kernelFn; +#endif cudaFuncAttributes attr = {0}; if (fn == nullptr) continue; @@ -130,7 +135,9 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* ma } next_kernel:; } +#ifdef GENERATE_SYM_KERNELS } +#endif return result; }